The same as reiser4-for-2.6.22.patch plus a fix for a file-conversion-related
bug which caused metadata corruption when REISER4_DEBUG is on.

diff -urN linux-2.6.22.orig/arch/i386/lib/usercopy.c linux-2.6.22/arch/i386/lib/usercopy.c
--- linux-2.6.22.orig/arch/i386/lib/usercopy.c	2007-07-21 00:32:46.973831675 +0400
+++ linux-2.6.22/arch/i386/lib/usercopy.c	2007-07-29 00:25:34.800676805 +0400
@@ -817,6 +817,7 @@
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 					unsigned long n)
@@ -831,6 +832,7 @@
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
 
 /**
  * copy_to_user: - Copy a block of data into user space.
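
Why the exports: these two helpers back the inline cache-bypassing user-copy
wrappers in asm-i386/uaccess.h, and a modularized reiser4 cannot reach them
unless they are exported. A minimal sketch of a module-side caller follows;
it is illustrative only (not part of the patch) and assumes a 2.6.22-era
i386 tree where the __copy_from_user_nocache() inline exists:

	/* hypothetical module code, shown only to motivate the exports */
	#include <linux/module.h>
	#include <linux/uaccess.h>

	static unsigned long
	copy_in_bypassing_cache(void *dst, const void __user *src,
				unsigned long n)
	{
		/* on i386 this inline expands to one of the
		 * __copy_from_user_ll_nocache* helpers exported above */
		return __copy_from_user_nocache(dst, src, n);
	}
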
diff -urN linux-2.6.22.orig/Documentation/Changes linux-2.6.22/Documentation/Changes
--- linux-2.6.22.orig/Documentation/Changes	2007-07-21 00:31:57.012856483 +0400
+++ linux-2.6.22/Documentation/Changes	2007-07-29 00:25:34.800676805 +0400
@@ -36,6 +36,7 @@
 o  e2fsprogs              1.29                    # tune2fs
 o  jfsutils               1.1.3                   # fsck.jfs -V
 o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
+o  reiser4progs           1.0.0                   # fsck.reiser4 -V
 o  xfsprogs               2.6.0                   # xfs_db -V
 o  pcmciautils            004                     # pccardctl -V
 o  quota-tools            3.09                    # quota -V
@@ -144,6 +145,13 @@
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck. These utils work on both i386 and alpha platforms.
 
+Reiser4progs
+------------
+
+The reiser4progs package contains utilities for the reiser4 file system.
+Detailed instructions are provided in the README file located at:
+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
+
 Xfsprogs
 --------
 
@@ -322,6 +330,10 @@
 -------------
 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
 
+Reiser4progs
+------------
+o <ftp://ftp.namesys.com/pub/reiser4progs/>
+
 Xfsprogs
 --------
 o <ftp://oss.sgi.com/projects/xfs/download/>
diff -urN linux-2.6.22.orig/Documentation/filesystems/reiser4.txt linux-2.6.22/Documentation/filesystems/reiser4.txt
--- linux-2.6.22.orig/Documentation/filesystems/reiser4.txt	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/Documentation/filesystems/reiser4.txt	2007-07-29 00:25:34.800676805 +0400
@@ -0,0 +1,75 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page		http://namesys.com/v4/v4.html
+source code		ftp://ftp.namesys.com/pub/reiser4-for-2.6/
+userland tools		ftp://ftp.namesys.com/pub/reiser4progs/
+install page		http://www.namesys.com/install_v4.html
+
+Compile options
+===============
+Enable reiser4 debug mode
+	This checks everything imaginable while reiser4
+	runs
+
+Mount options
+=============
+tmgr.atom_max_size=N
+	Atoms containing more than N blocks will be forced to commit.
+	N is decimal.
+	Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+	Atoms older than N seconds will be forced to commit. N is decimal.
+	Default is 600.
+
+tmgr.atom_max_flushers=N
+	Limit of concurrent flushers for one atom. 0 means no limit.
+	Default is 0.
+
+tree.cbk_cache.nr_slots=N
+	Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+	If flush finds more than N adjacent dirty leaf-level blocks it
+	will force them to be relocated.
+	Default is 64.
+
+flush.relocate_distance=N
+	If flush can find a block allocation closer than at most N
+	from the preceder, it will relocate to that position.
+	Default is 64.
+
+flush.scan_maxnodes=N
+	The maximum number of nodes to scan left on a level during
+	flush.
+	Default is 10000.
+
+optimal_io_size=N
+	Preferred IO size. This value is used to set st_blksize of
+	struct stat.
+	Default is 65536.
+
+bsdgroups
+	Turn on BSD-style gid assignment.
+
+32bittimes
+	By default files in reiser4 have 64 bit timestamps. Files
+	created when the filesystem is mounted with the 32bittimes
+	mount option will get 32 bit timestamps.
+
+mtflush
+	Turn off concurrent flushing.
+
+nopseudo
+	Disable pseudo files support. See
+	http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+	Don't load all bitmap blocks at mount time; this is useful for
+	machines with tiny RAM and large disks.
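
The options above travel to reiser4's option parser as the data string of the
mount(2) system call (after -o when using mount(8)). A minimal C sketch; the
device and mount point are illustrative, and the option string is taken from
the list documented above:

	/* mount_r4.c -- hedged usage sketch, not part of the patch */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* the 5th argument reaches the filesystem parser verbatim */
		if (mount("/dev/sdb1", "/mnt/r4", "reiser4", 0,
			  "tmgr.atom_max_age=300,dont_load_bitmap") != 0) {
			perror("mount");
			return 1;
		}
		return 0;
	}
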
diff -urN linux-2.6.22.orig/fs/fs-writeback.c linux-2.6.22/fs/fs-writeback.c
--- linux-2.6.22.orig/fs/fs-writeback.c	2007-07-21 00:32:04.502801671 +0400
+++ linux-2.6.22/fs/fs-writeback.c	2007-07-29 00:25:34.808678876 +0400
@@ -296,8 +296,6 @@
  * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
  * that it can be located for waiting on in __writeback_single_inode().
  *
- * Called under inode_lock.
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched. For other superblocks,
@@ -313,11 +311,13 @@
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on __wait_on_inode.
  */
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
+	spin_lock(&inode_lock);
+
 	if (!wbc->for_kupdate || list_empty(&sb->s_io))
 		list_splice_init(&sb->s_dirty, &sb->s_io);
 
@@ -397,8 +397,19 @@
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
+	spin_unlock(&inode_lock);
 	return;		/* Leave any unwritten inodes on s_io */
 }
+EXPORT_SYMBOL(generic_sync_sb_inodes);
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+	if (sb->s_op->sync_inodes)
+		sb->s_op->sync_inodes(sb, wbc);
+	else
+		generic_sync_sb_inodes(sb, wbc);
+}
 
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -439,11 +450,8 @@
 	 * be unmounted by the time it is released.
 	 */
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root) {
-			spin_lock(&inode_lock);
+		if (sb->s_root)
 			sync_sb_inodes(sb, wbc);
-			spin_unlock(&inode_lock);
-		}
 		up_read(&sb->s_umount);
 	}
 	spin_lock(&sb_lock);
@@ -481,9 +489,7 @@
 		(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
 			nr_dirty + nr_unstable;
 	wbc.nr_to_write += wbc.nr_to_write / 2;	/* Bit more for luck */
-	spin_lock(&inode_lock);
 	sync_sb_inodes(sb, &wbc);
-	spin_unlock(&inode_lock);
 }
 
 /*
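
The refactoring above gives filesystems a per-superblock writeback hook:
inode writeback now dispatches through sb->s_op->sync_inodes and falls back
to the exported generic_sync_sb_inodes(), which now takes inode_lock itself.
The full patch presumably adds the sync_inodes member to struct
super_operations in include/linux/fs.h (that hunk is not in this excerpt).
A hedged sketch of a client; the foofs_* names are placeholders, not
reiser4's actual implementation:

	/* hypothetical consumer of the new hook */
	static void foofs_sync_inodes(struct super_block *sb,
				      struct writeback_control *wbc)
	{
		/* a filesystem-specific pre-pass could run here (reiser4,
		 * for instance, captures dirty pages into atoms), then the
		 * generic per-sb inode walk does the rest */
		generic_sync_sb_inodes(sb, wbc);
	}

	static struct super_operations foofs_sops = {
		/* ... other operations ... */
		.sync_inodes = foofs_sync_inodes,
	};
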
diff -urN linux-2.6.22.orig/fs/Kconfig linux-2.6.22/fs/Kconfig
--- linux-2.6.22.orig/fs/Kconfig	2007-07-21 00:32:57.540575927 +0400
+++ linux-2.6.22/fs/Kconfig	2007-07-29 00:25:34.812679911 +0400
@@ -272,6 +272,8 @@
 	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
+source "fs/reiser4/Kconfig"
+
 config REISERFS_FS
 	tristate "Reiserfs support"
 	help
diff -urN linux-2.6.22.orig/fs/Makefile linux-2.6.22/fs/Makefile
--- linux-2.6.22.orig/fs/Makefile	2007-07-21 00:32:57.544576967 +0400
+++ linux-2.6.22/fs/Makefile	2007-07-29 00:25:34.812679911 +0400
@@ -66,6 +66,7 @@
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
+obj-$(CONFIG_REISER4_FS)	+= reiser4/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
diff -urN linux-2.6.22.orig/fs/reiser4/as_ops.c linux-2.6.22/fs/reiser4/as_ops.c
--- linux-2.6.22.orig/fs/reiser4/as_ops.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/as_ops.c	2007-07-29 00:25:34.816680947 +0400
@@ -0,0 +1,337 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+/* address space operations */
+
+/**
+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
+ * @page: page to be dirtied
+ *
+ * Operation of struct address_space_operations. This implementation is used by
+ * unix and cryptcompress file plugins.
+ *
+ * This is called when a reiser4 page gets dirtied outside of reiser4, for
+ * example, when the dirty bit is moved from pte to physical page.
+ *
+ * Tags page in the mapping's page tree with special tag so that it is possible
+ * to do all the reiser4 specific work wrt dirty pages (jnode creation,
+ * capturing by an atom) later because it can not be done in the contexts where
+ * set_page_dirty is called.
+ */
+int reiser4_set_page_dirty(struct page *page)
+{
+	/* this page can be unformatted only */
+	assert("vs-1734", (page->mapping &&
+			   page->mapping->host &&
+			   reiser4_get_super_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host));
+
+	if (!TestSetPageDirty(page)) {
+		struct address_space *mapping = page->mapping;
+
+		if (mapping) {
+			write_lock_irq(&mapping->tree_lock);
+
+			/* check for race with truncate */
+			if (page->mapping) {
+				assert("vs-1652", page->mapping == mapping);
+				if (mapping_cap_account_dirty(mapping))
+					inc_zone_page_state(page,
+							    NR_FILE_DIRTY);
+				radix_tree_tag_set(&mapping->page_tree,
+						   page->index,
+						   PAGECACHE_TAG_REISER4_MOVED);
+			}
+			write_unlock_irq(&mapping->tree_lock);
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+		}
+	}
+	return 0;
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ */
+
+/**
+ * reiser4_invalidatepage
+ * @page: page to invalidate
+ * @offset: starting offset for partial invalidation
+ *
+ */
+void reiser4_invalidatepage(struct page *page, unsigned long offset)
+{
+	int ret = 0;
+	reiser4_context *ctx;
+	struct inode *inode;
+	jnode *node;
+
+	/*
+	 * This is called to truncate file's page.
+	 *
+	 * Originally, reiser4 implemented truncate in a standard way
+	 * (vmtruncate() calls ->invalidatepage() on all truncated pages
+	 * first, then file system ->truncate() call-back is invoked).
+	 *
+	 * This led to the problem when ->invalidatepage() was called on a
+	 * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
+	 * process. That is, truncate was bypassing transactions. To avoid
+	 * this, try_capture_page_to_invalidate() call was added here.
+	 *
+	 * After many troubles with vmtruncate() based truncate (including
+	 * races with flush, tail conversion, etc.) it was re-written in the
+	 * top-to-bottom style: items are killed in reiser4_cut_tree_object()
+	 * and pages belonging to extent are invalidated in kill_hook_extent().
+	 * So probably now additional call to capture is not needed here.
+	 */
+
+	assert("nikita-3137", PageLocked(page));
+	assert("nikita-3138", !PageWriteback(page));
+	inode = page->mapping->host;
+
+	/*
+	 * ->invalidatepage() should only be called for the unformatted
+	 * jnodes. Destruction of all other types of jnodes is performed
+	 * separately. But, during some corner cases (like handling errors
+	 * during mount) it is simpler to let ->invalidatepage be called on
+	 * them. Check for this, and do nothing.
+	 */
+	if (reiser4_get_super_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_cc_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
+		return;
+	assert("vs-1426", PagePrivate(page));
+	assert("vs-1427",
+	       page->mapping == jnode_get_mapping(jnode_by_page(page)));
+	assert("", jprivate(page) != NULL);
+	assert("", ergo(inode_file_plugin(inode) !=
+			file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
+			offset == 0));
+
+	ctx = reiser4_init_context(inode->i_sb);
+	if (IS_ERR(ctx))
+		return;
+
+	node = jprivate(page);
+	spin_lock_jnode(node);
+	if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
+			     (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
+		/* there is no need to capture */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		page_clear_jnode(page, node);
+		reiser4_uncapture_jnode(node);
+		unhash_unformatted_jnode(node);
+		jput(node);
+		reiser4_exit_context(ctx);
+		return;
+	}
+	spin_unlock_jnode(node);
+
+	/* capture page being truncated. */
+	ret = try_capture_page_to_invalidate(page);
+	if (ret != 0)
+		warning("nikita-3141", "Cannot capture: %i", ret);
+
+	if (offset == 0) {
+		/* remove jnode from transaction and detach it from page. */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		/* page cannot be detached from jnode concurrently, because it
+		 * is locked */
+		reiser4_uncapture_page(page);
+
+		/* this detaches page from jnode, so that jdelete will not try
+		 * to lock page which is already locked */
+		spin_lock_jnode(node);
+		page_clear_jnode(page, node);
+		spin_unlock_jnode(node);
+		unhash_unformatted_jnode(node);
+
+		jput(node);
+	}
+
+	reiser4_exit_context(ctx);
+}
+
+/* helper function called from reiser4_releasepage(). It returns true if jnode
+ * can be detached from its page and page released. */
+int jnode_is_releasable(jnode * node /* node to check */ )
+{
+	assert("nikita-2781", node != NULL);
+	assert_spin_locked(&(node->guard));
+	assert_spin_locked(&(node->load));
+
+	/* if some thread is currently using the jnode page, the latter
+	 * cannot be detached */
+	if (atomic_read(&node->d_count) != 0) {
+		return 0;
+	}
+
+	assert("vs-1214", !jnode_is_loaded(node));
+
+	/*
+	 * can only release page if real block number is assigned to it. Simple
+	 * check for ->atom wouldn't do, because it is possible for node to be
+	 * clean, not in atom yet, and still have a fake block number. For
+	 * example, node just created in jinit_new().
+	 */
+	if (reiser4_blocknr_is_fake(jnode_get_block(node)))
+		return 0;
+
+	/*
+	 * pages prepared for write can not be released anyway, so avoid
+	 * detaching jnode from the page
+	 */
+	if (JF_ISSET(node, JNODE_WRITE_PREPARED))
+		return 0;
+
+	/*
+	 * dirty jnode cannot be released. It can however be submitted to disk
+	 * as part of early flushing, but only after getting flush-prepped.
+	 */
+	if (JF_ISSET(node, JNODE_DIRTY))
+		return 0;
+
+	/* overwrite set is only written by log writer. */
+	if (JF_ISSET(node, JNODE_OVRWR))
+		return 0;
+
+	/* jnode is already under writeback */
+	if (JF_ISSET(node, JNODE_WRITEBACK))
+		return 0;
+
+	/* don't flush bitmaps or journal records */
+	if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by VM scanner when it comes across clean page. What we have
+ * to do here is to check whether page can really be released (freed that is)
+ * and if so, detach jnode from it and remove page from the page cache.
+ *
+ * Check for releasability is done by releasable() function.
+ */
+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
+{
+	jnode *node;
+
+	assert("nikita-2257", PagePrivate(page));
+	assert("nikita-2259", PageLocked(page));
+	assert("nikita-2892", !PageWriteback(page));
+	assert("nikita-3019", reiser4_schedulable());
+
+	/* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+	   is not clear what to do in this case. A lot of deadlocks seem to be
+	   possible. */
+
+	node = jnode_by_page(page);
+	assert("nikita-2258", node != NULL);
+	assert("reiser4-4", page->mapping != NULL);
+	assert("reiser4-5", page->mapping->host != NULL);
+
+	if (PageDirty(page))
+		return 0;
+
+	/* extra page reference is used by reiser4 to protect
+	 * jnode<->page link from this ->releasepage(). */
+	if (page_count(page) > 3)
+		return 0;
+
+	/* releasable() needs jnode lock, because it looks at the jnode fields
+	 * and we need jload_lock here to avoid races with jload(). */
+	spin_lock_jnode(node);
+	spin_lock(&(node->load));
+	if (jnode_is_releasable(node)) {
+		struct address_space *mapping;
+
+		mapping = page->mapping;
+		jref(node);
+		/* there is no need to synchronize against
+		 * jnode_extent_write() here, because pages seen by
+		 * jnode_extent_write() are !releasable(). */
+		page_clear_jnode(page, node);
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+
+		/* we are under memory pressure so release jnode also. */
+		jput(node);
+
+		return 1;
+	} else {
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+		assert("nikita-3020", reiser4_schedulable());
+		return 0;
+	}
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
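
A note on the MOVED tag used by reiser4_set_page_dirty() above: pages dirtied
outside reiser4 are only tagged with PAGECACHE_TAG_REISER4_MOVED (a tag the
full patch defines elsewhere), and the jnode creation/atom capture happens in
a later pass. That later pass lives elsewhere in the patch; the loop below is
only a hedged sketch of how such tagged pages can be harvested with the
stock 2.6.22 pagevec API:

	/* illustrative only; not reiser4's actual scanner */
	#include <linux/pagevec.h>

	static void process_moved_pages(struct address_space *mapping)
	{
		struct pagevec pvec;
		pgoff_t index = 0;
		unsigned i, n;

		pagevec_init(&pvec, 0);
		while ((n = pagevec_lookup_tag(&pvec, mapping, &index,
					       PAGECACHE_TAG_REISER4_MOVED,
					       PAGEVEC_SIZE)) != 0) {
			for (i = 0; i < n; i++) {
				/* create a jnode for pvec.pages[i] and
				 * capture it into an atom here */
			}
			pagevec_release(&pvec);
		}
	}
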
diff -urN linux-2.6.22.orig/fs/reiser4/block_alloc.c linux-2.6.22/fs/reiser4/block_alloc.c
--- linux-2.6.22.orig/fs/reiser4/block_alloc.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/block_alloc.c	2007-07-29 00:25:34.816680947 +0400
@@ -0,0 +1,1137 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+   operation will have enough disk space to flush (see flush.c and
+   http://namesys.com/v4/v4.html) and commit it once it is started.
+
+   In our design a call for reserving disk space may fail but not an actual
+   block allocation.
+
+   All free blocks, already allocated blocks, and all kinds of reserved blocks
+   are counted in different per-fs block counters.
+
+   A reiser4 super block's set of block counters currently is:
+
+   free -- free blocks,
+   used -- already allocated blocks,
+
+   grabbed -- initially reserved for performing an fs operation, those blocks
+	  are taken from free blocks, then grabbed disk space leaks from the
+	  grabbed blocks counter to other counters like "fake allocated",
+	  "flush reserved", "used"; the rest of the unused grabbed space is
+	  returned to free space at the end of the fs operation;
+
+   fake allocated -- counts all nodes without real disk block numbers assigned,
+		    we have separate accounting for formatted and unformatted
+		    nodes (for easier debugging);
+
+   flush reserved -- disk space needed for flushing and committing an atom.
+		    Each dirty already allocated block could be written as a
+		    part of atom's overwrite set or as a part of atom's
+		    relocate set. In both cases one additional block is needed;
+		    it is used as a wandered block if we do overwrite or as a
+		    new location for a relocated block.
+
+   In addition, blocks in some states are counted on per-thread and per-atom
+   basis. A reiser4 context has a counter of blocks grabbed by this transaction
+   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
+   of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
+   blocks, which are reserved for flush processing and atom commit. */
+
+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
+   number of blocks to grab for most expensive case of balancing when the leaf
+   node we insert new item to gets split and new leaf node is allocated.
+
+   So, we need to grab blocks for
+
+   1) one block for possible dirtying the node we insert an item to. That block
+   would be used for node relocation at flush time or for allocating of a
+   wandered one, it depends what will be a result (what set, relocate or
+   overwrite the node gets assigned to) of the node processing by the flush
+   algorithm.
+
+   2) one block for either allocating a new node, or dirtying of right or left
+   clean neighbor, only one case may happen.
+
+   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying
+   of left neighbor, right neighbor, current node, and creation of new node. Have I forgotten something? Email me.
+
+   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
+   counter and in the fs-wide one (both ctx->grabbed_blocks and
+   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
+   decremented by 2.
+
+   Suppose both two blocks were spent for dirtying of an already allocated clean
+   node (one block went from "grabbed" to "flush reserved") and for new block
+   allocating (one block went from "grabbed" to "fake allocated formatted").
+
+   Inserting of a child pointer to the parent node caused parent node to be
+   split, the balancing code takes care about this grabbing necessary space
+   immediately by calling reiser4_grab with BA_RESERVED flag set which means
+   "can use the 5% reserved disk space".
+
+   At this moment insertion completes and grabbed blocks (if they were not used)
+   should be returned to the free space counter.
+
+   However the atom life-cycle is not completed. The atom had one "flush
+   reserved" block added by our insertion and the new fake allocated node is
+   counted as a "fake allocated formatted" one. The atom has to be fully
+   processed by flush before commit. Suppose that the flush moved the first,
+   already allocated node to the atom's overwrite list, the new fake allocated
+   node, obviously, went into the atom relocate set. The reiser4 flush
+   allocates the new node using one unit from "fake allocated formatted"
+   counter, the log writer uses one from "flush reserved" for wandered block
+   allocation.
+
+   And, it is not the end. When the wandered block is deallocated after the
+   atom gets fully played (see wander.c for term description), the disk space
+   occupied for it is returned to free blocks. */
+
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it. We use these numbers for
+   indexing in hash tables, so if a block has not yet been assigned a location
+   on disk we need to give it a temporary fake block number.
+
+   Current implementation of reiser4 uses 64-bit integers for block numbers. We
+   use highest bit in 64-bit block number to distinguish fake and real block
+   numbers. So, only 63 bits may be used for addressing of real device
+   blocks. That "fake" block numbers space is divided into subspaces of fake
+   block numbers for data blocks and for shadow (working) bitmap blocks.
+
+   Fake block numbers for data blocks are generated by a cyclic counter, which
+   gets incremented after each real block allocation. We assume that it is
+   impossible to overload this counter during one transaction life. */
+
+/* Initialize a blocknr hint. */
+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+	memset(hint, 0, sizeof(reiser4_blocknr_hint));
+}
+
+/* Release any resources of a blocknr hint. */
+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+	/* No resources should be freed in current blocknr_hint implementation. */
+}
+
+/* see above for explanation of fake block number. */
+/* Audited by: green(2002.06.11) */
+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
+{
+	/* The reason for not simply returning result of '&' operation is that
+	   while return value is (possibly 32bit) int, the reiser4_block_nr is
+	   at least 64 bits long, and high bit (which is the only possible
+	   non zero bit after the masking) would be stripped off */
+	return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
+}
+
+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
+   arithmetic. Mostly, they are isolated so as not to code the same assertions
+   in several places. */
+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	BUG_ON(ctx->grabbed_blocks < count);
+	assert("zam-527", ctx->grabbed_blocks >= count);
+	ctx->grabbed_blocks -= count;
+}
+
+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	ctx->grabbed_blocks += count;
+}
+
+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-525", sbinfo->blocks_grabbed >= count);
+	sbinfo->blocks_grabbed -= count;
+}
+
+/* Decrease the counter of block reserved for flush in super block. */
+static void
+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
+	sbinfo->blocks_flush_reserved -= count;
+}
+
+static void
+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+			   reiser4_ba_flags_t flags)
+{
+	if (flags & BA_FORMATTED) {
+		assert("zam-806", sbinfo->blocks_fake_allocated >= count);
+		sbinfo->blocks_fake_allocated -= count;
+	} else {
+		assert("zam-528",
+		       sbinfo->blocks_fake_allocated_unformatted >= count);
+		sbinfo->blocks_fake_allocated_unformatted -= count;
+	}
+}
+
+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-530",
+	       sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
+	sbinfo->blocks_used -= count;
+}
+
+static void
+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("edward-501", sbinfo->blocks_clustered >= count);
+	sbinfo->blocks_clustered -= count;
+}
+
+/* Increase the counter of block reserved for flush in atom. */
+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-772", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	atom->flush_reserved += count;
+}
+
+/* Decrease the counter of block reserved for flush in atom. */
+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-774", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	assert("nikita-2790", atom->flush_reserved >= count);
+	atom->flush_reserved -= count;
+}
+
+/* super block has 6 counters: free, used, grabbed, fake allocated
+   (formatted and unformatted) and flush reserved. Their sum must be
+   the number of blocks on the device. This function checks this. */
+int reiser4_check_block_counters(const struct super_block *super)
+{
+	__u64 sum;
+
+	sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
+	    reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
+	    reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
+	    reiser4_clustered_blocks(super);
+	if (reiser4_block_count(super) != sum) {
+		printk("super block counters: "
+		       "used %llu, free %llu, "
+		       "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
+		       "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
+		       (unsigned long long)reiser4_data_blocks(super),
+		       (unsigned long long)reiser4_free_blocks(super),
+		       (unsigned long long)reiser4_grabbed_blocks(super),
+		       (unsigned long long)reiser4_fake_allocated(super),
+		       (unsigned long long)
+		       reiser4_fake_allocated_unformatted(super),
+		       (unsigned long long)reiser4_flush_reserved(super),
+		       (unsigned long long)reiser4_clustered_blocks(super),
+		       (unsigned long long)sum,
+		       (unsigned long long)reiser4_block_count(super));
+		return 0;
+	}
+	return 1;
+}
+
+/* Adjust "working" free blocks counter for number of blocks we are going to
+   allocate. Record number of grabbed blocks in fs-wide and per-thread
+   counters. This function should be called before bitmap scanning or
+   allocating fake block numbers
+
+   @super -- pointer to reiser4 super block;
+   @count -- number of blocks we reserve;
+
+   @return -- 0 if success, -ENOSPC, if all
+   free blocks are preserved or already allocated.
+*/
+
+static int
+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
+{
+	__u64 free_blocks;
+	int ret = 0, use_reserved = flags & BA_RESERVED;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1276", ctx == get_current_context());
+
+	/* Do not grab anything on ro-mounted fs. */
+	if (rofs_super(ctx->super)) {
+		ctx->grab_enabled = 0;
+		return 0;
+	}
+
+	sbinfo = get_super_private(ctx->super);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	free_blocks = sbinfo->blocks_free;
+
+	if ((use_reserved && free_blocks < count) ||
+	    (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
+		ret = RETERR(-ENOSPC);
+		goto unlock_and_ret;
+	}
+
+	add_to_ctx_grabbed(ctx, count);
+
+	sbinfo->blocks_grabbed += count;
+	sbinfo->blocks_free -= count;
+
+#if REISER4_DEBUG
+	if (ctx->grabbed_initially == 0)
+		ctx->grabbed_initially = count;
+#endif
+
+	assert("nikita-2986", reiser4_check_block_counters(ctx->super));
+
+	/* disable grab space in current context */
+	ctx->grab_enabled = 0;
+
+      unlock_and_ret:
+	spin_unlock_reiser4_super(sbinfo);
+
+	return ret;
+}
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
+{
+	int ret;
+	reiser4_context *ctx;
+
+	assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
+				   lock_stack_isclean(get_current_lock_stack
+						      ())));
+	ctx = get_current_context();
+	if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
+		return 0;
+	}
+
+	ret = reiser4_grab(ctx, count, flags);
+	if (ret == -ENOSPC) {
+
+		/* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
+		if (flags & BA_CAN_COMMIT) {
+			txnmgr_force_commit_all(ctx->super, 0);
+			ctx->grab_enabled = 1;
+			ret = reiser4_grab(ctx, count, flags);
+		}
+	}
+	/*
+	 * allocation from reserved pool cannot fail. This is severe error.
+	 */
+	assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
+	return ret;
+}
+
+/*
+ * SPACE RESERVED FOR UNLINK/TRUNCATE
+ *
+ * Unlink and truncate require space in transaction (to update stat data, at
+ * least). But we don't want rm(1) to fail with "No space on device" error.
+ *
+ * Solution is to reserve 5% of disk space for truncates and
+ * unlinks. Specifically, normal space grabbing requests don't grab space from
+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
+ * drain it. Per super block delete mutex is used to allow only one
+ * thread at a time to grab from reserved area.
+ *
+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
+ * flag.
+ *
+ */
+
+int reiser4_grab_reserved(struct super_block *super,
+			  __u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(super);
+
+	assert("nikita-3175", flags & BA_CAN_COMMIT);
+
+	/* Check whether the delete mutex is already taken by us; we
+	 * assume that reading of a machine word is atomic. */
+	if (sbinfo->delete_mutex_owner == current) {
+		if (reiser4_grab_space
+		    (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
+			warning("zam-1003",
+				"nested call of grab_reserved fails count=(%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+		return 0;
+	}
+
+	if (reiser4_grab_space(count, flags)) {
+		mutex_lock(&sbinfo->delete_mutex);
+		assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
+		sbinfo->delete_mutex_owner = current;
+
+		if (reiser4_grab_space(count, flags | BA_RESERVED)) {
+			warning("zam-833",
+				"reserved space is not enough (%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+	}
+	return 0;
+}
+
+void reiser4_release_reserved(struct super_block *super)
+{
+	reiser4_super_info_data *info;
+
+	info = get_super_private(super);
+	if (info->delete_mutex_owner == current) {
+		info->delete_mutex_owner = NULL;
+		mutex_unlock(&info->delete_mutex);
+	}
+}
+
+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	/* return sbinfo locked */
+	return sbinfo;
+}
+
+/* is called after @count fake block numbers are allocated and pointers to
+   those blocks are inserted into the tree. */
+static void grabbed2fake_allocated_formatted(void)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(1);
+	sbinfo->blocks_fake_allocated++;
+
+	assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2fake_allocated_unformatted
+ * @count:
+ *
+ */
+static void grabbed2fake_allocated_unformatted(int count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(count);
+	sbinfo->blocks_fake_allocated_unformatted += count;
+
+	assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2cluster_reserved(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_clustered += count;
+
+	assert("edward-504", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void cluster_reserved2grabbed(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_grabbed += count;
+
+	assert("edward-505", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+	add_to_ctx_grabbed(ctx, count);
+}
+
+void cluster_reserved2free(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	cluster_reserved2grabbed(count);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+static DEFINE_SPINLOCK(fake_lock);
+static reiser4_block_nr fake_gen = 0;
+
+/**
+ * assign_fake_blocknr
+ * @blocknr:
+ * @count:
+ *
+ * Obtain a fake block number for new node which will be used to refer to
+ * this newly allocated node until real allocation is done.
+ */
+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
+{
+	spin_lock(&fake_lock);
+	*blocknr = fake_gen;
+	fake_gen += count;
+	spin_unlock(&fake_lock);
+
+	BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
+	/**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
+	*blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
+	assert("zam-394", zlook(current_tree, blocknr) == NULL);
+}
+
+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
+{
+	assign_fake_blocknr(blocknr, 1);
+	grabbed2fake_allocated_formatted();
+	return 0;
+}
+
+/**
+ * fake_blocknrs_unformatted
+ * @count: number of fake numbers to get
+ *
+ * Allocates @count fake block numbers which will be assigned to jnodes
+ */
+reiser4_block_nr fake_blocknr_unformatted(int count)
+{
+	reiser4_block_nr blocknr;
+
+	assign_fake_blocknr(&blocknr, count);
+	grabbed2fake_allocated_unformatted(count);
+
+	return blocknr;
+}
+
+/* adjust sb block counters, if real (on-disk) block allocation immediately
+   follows grabbing of free disk space. */
+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+			 __u64 count)
+{
+	sub_from_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2679", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
+				reiser4_ba_flags_t flags)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_fake_allocated(sbinfo, count, flags);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2680",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+static void flush_reserved2used(txn_atom * atom, __u64 count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	assert("zam-787", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	sbinfo = get_current_super_private();
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_flush_reserved(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("zam-789",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* update the per fs blocknr hint default value. */
+void
+update_blocknr_hint_default(const struct super_block *s,
+			    const reiser4_block_nr * block)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(s);
+
+	assert("nikita-3342", !reiser4_blocknr_is_fake(block));
+
+	spin_lock_reiser4_super(sbinfo);
+	if (*block < sbinfo->block_count) {
+		sbinfo->blocknr_hint_default = *block;
+	} else {
+		warning("zam-676",
+			"block number %llu is too large to be used in a blocknr hint\n",
+			(unsigned long long)*block);
+		dump_stack();
+		DEBUGON(1);
+	}
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* get current value of the default blocknr hint. */
+void get_blocknr_hint_default(reiser4_block_nr * result)
+{
+	reiser4_super_info_data *sbinfo = get_current_super_private();
+
+	spin_lock_reiser4_super(sbinfo);
+	*result = sbinfo->blocknr_hint_default;
+	assert("zam-677", *result < sbinfo->block_count);
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* Allocate "real" disk blocks by calling a proper space allocation plugin
+ * method. Blocks are allocated in one contiguous disk region. The plugin
+ * independent part accounts blocks by subtracting the allocated amount from
+ * the grabbed or fake block counter and adding the same amount to the counter
+ * of allocated blocks.
+ *
+ * @hint -- a reiser4 blocknr hint object which contains further block
+ *          allocation hints and parameters (search start, a stage of block
+ *          which will be mapped to disk, etc.),
+ * @blk -- an out parameter for the beginning of the allocated region,
+ * @len -- in/out parameter, it should contain the maximum number of allocated
+ *         blocks, after block allocation completes, it contains the length of
+ *         allocated disk region.
+ * @flags -- see reiser4_ba_flags_t description.
+ *
+ * @return -- 0 if success, error code otherwise.
+ */
+int
+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
+		     reiser4_block_nr * len, reiser4_ba_flags_t flags)
+{
+	__u64 needed = *len;
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+	int ret;
+
+	assert("zam-986", hint != NULL);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	/* For write-optimized data we use default search start value, which is
+	 * close to last write location. */
+	if (flags & BA_USE_DEFAULT_SEARCH_START) {
+		get_blocknr_hint_default(&hint->blk);
+	}
+
+	/* VITALY: allocator should grab this for internal/tx-lists/similar only. */
+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
+	if (hint->block_stage == BLOCK_NOT_COUNTED) {
+		ret = reiser4_grab_space_force(*len, flags);
+		if (ret != 0)
+			return ret;
+	}
+
+	ret =
+	    sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
+			    hint, (int)needed, blk, len);
+
+	if (!ret) {
+		assert("zam-680", *blk < reiser4_block_count(ctx->super));
+		assert("zam-681",
+		       *blk + *len <= reiser4_block_count(ctx->super));
+
+		if (flags & BA_PERMANENT) {
+			/* we assume that current atom exists at this moment */
+			txn_atom *atom = get_current_atom_locked();
+			atom->nr_blocks_allocated += *len;
+			spin_unlock_atom(atom);
+		}
+
+		switch (hint->block_stage) {
+		case BLOCK_NOT_COUNTED:
+		case BLOCK_GRABBED:
+			grabbed2used(ctx, sbinfo, *len);
+			break;
+		case BLOCK_UNALLOCATED:
+			fake_allocated2used(sbinfo, *len, flags);
+			break;
+		case BLOCK_FLUSH_RESERVED:
+			{
+				txn_atom *atom = get_current_atom_locked();
+				flush_reserved2used(atom, *len);
+				spin_unlock_atom(atom);
+			}
+			break;
+		default:
+			impossible("zam-531", "wrong block stage");
+		}
+	} else {
+		assert("zam-821",
+		       ergo(hint->max_dist == 0
+			    && !hint->backward, ret != -ENOSPC));
+		if (hint->block_stage == BLOCK_NOT_COUNTED)
+			grabbed2free(ctx, sbinfo, needed);
+	}
+
+	return ret;
+}
+
+/* used -> fake_allocated -> grabbed -> free */
+
+/* adjust sb block counters when @count unallocated blocks get unmapped from
+   disk */
+static void
+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+		    int formatted)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	if (formatted)
+		sbinfo->blocks_fake_allocated += count;
+	else
+		sbinfo->blocks_fake_allocated_unformatted += count;
+
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2681",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+static void
+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
+		    __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
+{
+	assert("nikita-2791", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	add_to_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_flush_reserved += count;
+	/*add_to_sb_flush_reserved(sbinfo, count); */
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2681",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* disk space, virtually used by fake block numbers, is counted as "grabbed" again. */
+static void
+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+		       __u64 count, reiser4_ba_flags_t flags)
+{
+	add_to_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	assert("nikita-2682", reiser4_check_block_counters(ctx->super));
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
+
+	assert("nikita-2683", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	fake_allocated2grabbed(ctx, sbinfo, count, flags);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+void grabbed2free_mark(__u64 mark)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	assert("nikita-3007", (__s64) mark >= 0);
+	assert("nikita-3006", ctx->grabbed_blocks >= mark);
+	grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
+}
+
+/**
+ * grabbed2free - adjust grabbed and free block counters
+ * @ctx: context to update grabbed block counter of
+ * @sbinfo: super block to update grabbed and free block counters of
+ * @count: number of blocks to adjust counters by
+ *
+ * Decreases context's and per filesystem's counters of grabbed
+ * blocks. Increases per filesystem's counter of free blocks.
+ */
+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+		  __u64 count)
+{
+	sub_from_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_free += count;
+	assert("nikita-2684", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1095", atom);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	sub_from_ctx_grabbed(ctx, count);
+
+	add_to_atom_flush_reserved_nolock(atom, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_flush_reserved += count;
+	sub_from_sb_grabbed(sbinfo, count);
+
+	assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved(__u64 count)
+{
+	txn_atom *atom = get_current_atom_locked();
+
+	grabbed2flush_reserved_nolock(atom, count);
+
+	spin_unlock_atom(atom);
+}
+
+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	assert("nikita-2788", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	add_to_ctx_grabbed(ctx, count);
+
+	sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_flush_reserved(sbinfo, count);
+
+	assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * all_grabbed2free - releases all blocks grabbed in context
+ *
+ * Decreases context's and super block's grabbed block counters by number of
+ * blocks grabbed by current context and increases super block's free block
+ * counter correspondingly.
+ */
+void all_grabbed2free(void)
+{
+	reiser4_context *ctx = get_current_context();
+
+	grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
+}
+
+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
+   after freeing, @count blocks become "grabbed". */
+static void
+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+	     __u64 count)
+{
+	add_to_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* this used to be done through used2grabbed and grabbed2free */
+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_free += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+#if REISER4_DEBUG
+
+/* check "allocated" state of given block range */
+static void
+reiser4_check_blocks(const reiser4_block_nr * start,
+		     const reiser4_block_nr * len, int desired)
+{
+	sa_check_blocks(start, len, desired);
+}
+
+/* check "allocated" state of given block */
+void reiser4_check_block(const reiser4_block_nr * block, int desired)
+{
+	const reiser4_block_nr one = 1;
+
+	reiser4_check_blocks(block, &one, desired);
+}
+
+#endif
+
+/* Blocks deallocation function may do an actual deallocation through space
+   plugin allocation or store deleted block numbers in atom's delete_set data
+   structure, depending on the @defer parameter. */
+
+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
+   will be deleted from WORKING bitmap. They might be just unmapped from disk, or
+   freed but disk space is still grabbed by current thread, or these blocks must
+   not be counted in any reiser4 sb block counters, see block_stage_t comment */
+
+/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
+   distinguish blocks allocated for unformatted and formatted nodes */
+
1547+int
1548+reiser4_dealloc_blocks(const reiser4_block_nr * start,
1549+ const reiser4_block_nr * len,
1550+ block_stage_t target_stage, reiser4_ba_flags_t flags)
1551+{
1552+ txn_atom *atom = NULL;
1553+ int ret;
1554+ reiser4_context *ctx;
1555+ reiser4_super_info_data *sbinfo;
1556+
1557+ ctx = get_current_context();
1558+ sbinfo = get_super_private(ctx->super);
1559+
1560+ if (REISER4_DEBUG) {
1561+ assert("zam-431", *len != 0);
1562+ assert("zam-432", *start != 0);
71430cf6 1563+ assert("zam-558", !reiser4_blocknr_is_fake(start));
44254afd
MT
1564+
1565+ spin_lock_reiser4_super(sbinfo);
1566+ assert("zam-562", *start < sbinfo->block_count);
1567+ spin_unlock_reiser4_super(sbinfo);
1568+ }
1569+
1570+ if (flags & BA_DEFER) {
1571+ blocknr_set_entry *bsep = NULL;
1572+
1573+ /* storing deleted block numbers in a blocknr set
1574+ datastructure for further actual deletion */
1575+ do {
1576+ atom = get_current_atom_locked();
1577+ assert("zam-430", atom != NULL);
1578+
1579+ ret =
1580+ blocknr_set_add_extent(atom, &atom->delete_set,
1581+ &bsep, start, len);
1582+
1583+ if (ret == -ENOMEM)
1584+ return ret;
1585+
1586+ /* This loop might spin at most two times */
1587+ } while (ret == -E_REPEAT);
1588+
1589+ assert("zam-477", ret == 0);
1590+ assert("zam-433", atom != NULL);
1591+
1592+ spin_unlock_atom(atom);
1593+
1594+ } else {
1595+ assert("zam-425", get_current_super_private() != NULL);
71430cf6
MT
1596+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1597+ *start, *len);
44254afd
MT
1598+
1599+ if (flags & BA_PERMANENT) {
1600+ /* These blocks were counted as allocated, we have to revert it
1601+ * back if allocation is discarded. */
1602+ txn_atom *atom = get_current_atom_locked();
1603+ atom->nr_blocks_allocated -= *len;
1604+ spin_unlock_atom(atom);
1605+ }
1606+
1607+ switch (target_stage) {
1608+ case BLOCK_NOT_COUNTED:
1609+ assert("vs-960", flags & BA_FORMATTED);
1610+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1611+ used2free(sbinfo, *len);
1612+ break;
1613+
1614+ case BLOCK_GRABBED:
1615+ used2grabbed(ctx, sbinfo, *len);
1616+ break;
1617+
1618+ case BLOCK_UNALLOCATED:
1619+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1620+ break;
1621+
1622+ case BLOCK_FLUSH_RESERVED:{
1623+ txn_atom *atom;
1624+
1625+ atom = get_current_atom_locked();
1626+ used2flush_reserved(sbinfo, atom, *len,
1627+ flags & BA_FORMATTED);
1628+ spin_unlock_atom(atom);
1629+ break;
1630+ }
1631+ default:
1632+ impossible("zam-532", "wrong block stage");
1633+ }
1634+ }
1635+
1636+ return 0;
1637+}
1638+
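For illustration, here is a minimal caller-side sketch (an editorial addition, not part of the patch) of the two deallocation modes the function above implements; the block numbers are hypothetical, the two calls show alternative paths rather than a sequence, and @target_stage is ignored on the BA_DEFER path, as the code shows:

static void dealloc_sketch(void)
{
	reiser4_block_nr start = 12345;	/* hypothetical block number */
	reiser4_block_nr len = 4;

	/* deferred mode: block numbers are queued in the atom's
	   delete_set; the WORKING bitmap is updated only at commit */
	reiser4_dealloc_blocks(&start, &len, BLOCK_NOT_COUNTED,
			       BA_DEFER | BA_FORMATTED);

	/* immediate mode: the space allocator frees the blocks now and
	   the space returns to the "grabbed" stage of this thread */
	reiser4_dealloc_blocks(&start, &len, BLOCK_GRABBED, BA_FORMATTED);
}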
1639+/* wrappers for block allocator plugin methods */
1640+int reiser4_pre_commit_hook(void)
1641+{
1642+ assert("zam-502", get_current_super_private() != NULL);
1643+ sa_pre_commit_hook();
1644+ return 0;
1645+}
1646+
1647+/* an actor which applies delete set to block allocator data */
1648+static int
1649+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1650+ const reiser4_block_nr * b, void *data UNUSED_ARG)
1651+{
1652+ reiser4_context *ctx;
1653+ reiser4_super_info_data *sbinfo;
1654+
1655+ __u64 len = 1;
1656+
1657+ ctx = get_current_context();
1658+ sbinfo = get_super_private(ctx->super);
1659+
1660+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1661+ assert("zam-552", sbinfo != NULL);
1662+
1663+ if (b != NULL)
1664+ len = *b;
1665+
1666+ if (REISER4_DEBUG) {
1667+ spin_lock_reiser4_super(sbinfo);
1668+
1669+ assert("zam-554", *a < reiser4_block_count(ctx->super));
1670+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1671+
1672+ spin_unlock_reiser4_super(sbinfo);
1673+ }
1674+
1675+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1676+ /* adjust sb block counters */
1677+ used2free(sbinfo, len);
1678+ return 0;
1679+}
1680+
1681+void reiser4_post_commit_hook(void)
1682+{
1683+ txn_atom *atom;
1684+
1685+ atom = get_current_atom_locked();
1686+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1687+ spin_unlock_atom(atom);
1688+
1689+ /* do the block deallocation which was deferred
1690+ until commit is done */
1691+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1692+
1693+ assert("zam-504", get_current_super_private() != NULL);
1694+ sa_post_commit_hook();
1695+}
1696+
1697+void reiser4_post_write_back_hook(void)
1698+{
1699+ assert("zam-504", get_current_super_private() != NULL);
1700+
1701+ sa_post_commit_hook();
1702+}
1703+
1704+/*
1705+ Local variables:
1706+ c-indentation-style: "K&R"
1707+ mode-name: "LC"
1708+ c-basic-offset: 8
1709+ tab-width: 8
1710+ fill-column: 120
1711+ scroll-step: 1
1712+ End:
1713+*/
1714diff -urN linux-2.6.22.orig/fs/reiser4/block_alloc.h linux-2.6.22/fs/reiser4/block_alloc.h
1715--- linux-2.6.22.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1716+++ linux-2.6.22/fs/reiser4/block_alloc.h 2007-07-29 00:25:34.820681982 +0400
1717@@ -0,0 +1,175 @@
1718+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1719+
1720+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1721+#define __FS_REISER4_BLOCK_ALLOC_H__
1722+
1723+#include "dformat.h"
1724+#include "forward.h"
1725+
1726+#include <linux/types.h> /* for __u?? */
1727+#include <linux/fs.h>
1728+
1729+/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
1730+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1731+/* Mask which isolates a type of object this fake block number was assigned to */
1732+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1733+
1734+/* the result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1735+ against these two values to determine whether the object is unallocated or a
1736+ bitmap shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
1737+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1738+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
1739+
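Read together, the masks work as in the following sketch (an editorial addition, not part of the patch); reiser4_blocknr_is_fake(), declared later in this header, is assumed to reduce to a top-bit test of this kind:

static inline int is_fake_sketch(const reiser4_block_nr *blk)
{
	/* any block number with the most significant bit set is fake */
	return (*blk & REISER4_FAKE_BLOCKNR_BIT_MASK) != 0;
}

static inline int is_unallocated_sketch(const reiser4_block_nr *blk)
{
	/* top two bits == 11 -> unallocated in-memory object */
	return (*blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
	       REISER4_UNALLOCATED_STATUS_VALUE;
}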
1740+/* specification how block allocation was counted in sb block counters */
1741+typedef enum {
1742+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1743+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1744+ of this block */
1745+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1746+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1747+ ( unallocated formatted or unformatted
1748+ node) */
1749+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1750+ number assigned */
1751+} block_stage_t;
1752+
1753+/* a hint for block allocator */
1754+struct reiser4_blocknr_hint {
1755+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1756+ is to prevent jnode_flush() calls from interleaving allocations on the same
1757+ bitmap, once a hint is established. */
1758+
1759+ /* search start hint */
1760+ reiser4_block_nr blk;
1761+ /* if not zero, it is a region size we search for free blocks in */
1762+ reiser4_block_nr max_dist;
1763+ /* level for allocation; it may be useful to have branch-level and higher
1764+ write-optimized. */
1765+ tree_level level;
1766+ /* block allocator assumes that blocks, which will be mapped to disk,
1767+ are in this specified block_stage */
1768+ block_stage_t block_stage;
1769+ /* If direction = 1 allocate blocks in backward direction from the end
1770+ * of disk to the beginning of disk. */
1771+ unsigned int backward:1;
1772+
1773+};
1774+
1775+/* These flags control block allocation/deallocation behavior */
1776+enum reiser4_ba_flags {
1777+ /* do allocations from the reserved (5%) area */
1778+ BA_RESERVED = (1 << 0),
1779+
1780+ /* block allocator can do commit trying to recover free space */
1781+ BA_CAN_COMMIT = (1 << 1),
1782+
1783+ /* if operation will be applied to formatted block */
1784+ BA_FORMATTED = (1 << 2),
1785+
1786+ /* defer actual block freeing until transaction commit */
1787+ BA_DEFER = (1 << 3),
1788+
1789+ /* allocate blocks for permanent fs objects (formatted or unformatted), not
1790+ wandered or log blocks */
1791+ BA_PERMANENT = (1 << 4),
1792+
1793+ /* grab space even if it was disabled */
1794+ BA_FORCE = (1 << 5),
1795+
1796+ /* use default start value for free blocks search. */
1797+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1798+};
1799+
1800+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1801+
1801+
1802+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1803+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1804+extern void update_blocknr_hint_default(const struct super_block *,
1805+ const reiser4_block_nr *);
1806+extern void get_blocknr_hint_default(reiser4_block_nr *);
1807+
1808+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1809+
1810+int assign_fake_blocknr_formatted(reiser4_block_nr *);
1811+reiser4_block_nr fake_blocknr_unformatted(int);
1812+
1813+/* free -> grabbed -> fake_allocated -> used */
1814+
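A sketch of that progression in terms of the declarations below (an editorial addition, not part of the patch; the flag combination and the BLOCK_GRABBED hint stage are illustrative assumptions, not prescribed by this header):

static int lifecycle_sketch(void)
{
	reiser4_blocknr_hint hint;
	reiser4_block_nr blk;
	int ret;

	ret = reiser4_grab_space(1, 0);		/* free -> grabbed */
	if (ret != 0)
		return ret;
	reiser4_blocknr_hint_init(&hint);
	hint.block_stage = BLOCK_GRABBED;	/* stage of the grabbed space */
	/* grabbed -> fake_allocated -> used happens inside the allocator */
	ret = reiser4_alloc_block(&hint, &blk, BA_FORMATTED | BA_PERMANENT);
	reiser4_blocknr_hint_done(&hint);
	return ret;
}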
1815+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1816+void all_grabbed2free(void);
1817+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1818+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1819+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1820+void grabbed2flush_reserved(__u64 count);
1821+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1822+ reiser4_block_nr * start,
1823+ reiser4_block_nr * len, reiser4_ba_flags_t flags);
1824+int reiser4_dealloc_blocks(const reiser4_block_nr *,
1825+ const reiser4_block_nr *,
1826+ block_stage_t, reiser4_ba_flags_t flags);
1827+
1828+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1829+ reiser4_block_nr * start,
1830+ reiser4_ba_flags_t flags)
1831+{
1832+ reiser4_block_nr one = 1;
1833+ return reiser4_alloc_blocks(hint, start, &one, flags);
1834+}
1835+
1836+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1837+ block_stage_t stage,
1838+ reiser4_ba_flags_t flags)
1839+{
1840+ const reiser4_block_nr one = 1;
1841+ return reiser4_dealloc_blocks(block, &one, stage, flags);
1842+}
1843+
1844+#define reiser4_grab_space_force(count, flags) \
1845+ reiser4_grab_space(count, flags | BA_FORCE)
1846+
1847+extern void grabbed2free_mark(__u64 mark);
1848+extern int reiser4_grab_reserved(struct super_block *,
1849+ __u64, reiser4_ba_flags_t);
1850+extern void reiser4_release_reserved(struct super_block *super);
1851+
1852+/* grabbed -> fake_allocated */
1853+
1854+/* fake_allocated -> used */
1855+
1856+/* used -> fake_allocated -> grabbed -> free */
1857+
1858+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1859+
1859+
1860+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1861+
1862+extern void grabbed2cluster_reserved(int count);
1863+extern void cluster_reserved2grabbed(int count);
1864+extern void cluster_reserved2free(int count);
1865+
1865+
1866+extern int reiser4_check_block_counters(const struct super_block *);
1867+
1868+#if REISER4_DEBUG
1869+
1870+extern void reiser4_check_block(const reiser4_block_nr *, int);
1871+
1872+#else
1873+
1874+# define reiser4_check_block(beg, val) noop
1875+
1876+#endif
1877+
1878+extern int reiser4_pre_commit_hook(void);
1879+extern void reiser4_post_commit_hook(void);
1880+extern void reiser4_post_write_back_hook(void);
1881+
1882+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1883+
1884+/* Make Linus happy.
1885+ Local variables:
1886+ c-indentation-style: "K&R"
1887+ mode-name: "LC"
1888+ c-basic-offset: 8
1889+ tab-width: 8
1890+ fill-column: 120
1891+ End:
1892+*/
1893diff -urN linux-2.6.22.orig/fs/reiser4/blocknrset.c linux-2.6.22/fs/reiser4/blocknrset.c
1894--- linux-2.6.22.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1895+++ linux-2.6.22/fs/reiser4/blocknrset.c 2007-07-29 00:25:34.820681982 +0400
1896@@ -0,0 +1,368 @@
1897+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1898+
1899+/* This file contains code for various block number sets used by the atom to
1900+ track the deleted set and wandered block mappings. */
1901+
1902+#include "debug.h"
1903+#include "dformat.h"
1904+#include "txnmgr.h"
1905+#include "context.h"
1906+
1907+#include <linux/slab.h>
1908+
1909+/* The proposed data structure for storing unordered block number sets is a
1910+ list of elements, each of which contains an array of block numbers and/or
1911+ an array of block number pairs. Such an element, called a blocknr_set_entry,
1912+ stores single block numbers from the beginning and extents from the end of
1913+ its data field (char data[...]). The ->nr_singles and ->nr_pairs fields
1914+ count the numbers of blocks and extents.
1915+
1916+ +------------------- blocknr_set_entry->data ------------------+
1917+ |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1918+ +------------------------------------------------------------+
1919+
1920+ When current blocknr_set_entry is full, allocate a new one. */
1921+
1922+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
1923+ * set (single blocks and block extents); in that case a blocknr pair represents an
1924+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
1925+ * there represent a (real block) -> (wandered block) mapping. */
1926+
1926+
1927+/* Protection: blocknr sets belong to reiser4 atom, and
1928+ * their modifications are performed with the atom lock held */
1929+
1930+/* The total size of a blocknr_set_entry. */
1931+#define BLOCKNR_SET_ENTRY_SIZE 128
1932+
1933+/* The number of blocks that can fit in the blocknr data area. */
1934+#define BLOCKNR_SET_ENTRIES_NUMBER \
1935+ ((BLOCKNR_SET_ENTRY_SIZE - \
1936+ 2 * sizeof (unsigned) - \
1937+ sizeof(struct list_head)) / \
1938+ sizeof(reiser4_block_nr))
1939+
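As a worked example of this arithmetic (an editorial note, assuming a 64-bit build where sizeof(unsigned) is 4, struct list_head is two pointers totalling 16 bytes, and reiser4_block_nr is an 8-byte __u64): (128 - 2*4 - 16) / 8 = 13, so one 128-byte entry holds up to 13 single block numbers, or 6 pairs plus one single, or any mix with nr_singles + 2 * nr_pairs <= 13.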
1940+/* An entry of the blocknr_set */
1941+struct blocknr_set_entry {
1942+ unsigned nr_singles;
1943+ unsigned nr_pairs;
1944+ struct list_head link;
1945+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1946+};
1947+
1948+/* A pair of blocks as recorded in the blocknr_set_entry data. */
1949+struct blocknr_pair {
1950+ reiser4_block_nr a;
1951+ reiser4_block_nr b;
1952+};
1953+
1954+/* Return the number of blocknr slots available in a blocknr_set_entry. */
1955+/* Audited by: green(2002.06.11) */
1956+static unsigned bse_avail(blocknr_set_entry * bse)
1957+{
1958+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1959+
1960+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1961+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1962+
1963+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
1964+}
1965+
1966+/* Initialize a blocknr_set_entry. */
1967+static void bse_init(blocknr_set_entry *bse)
1968+{
1969+ bse->nr_singles = 0;
1970+ bse->nr_pairs = 0;
1971+ INIT_LIST_HEAD(&bse->link);
1972+}
1973+
1974+/* Allocate and initialize a blocknr_set_entry. */
1975+/* Audited by: green(2002.06.11) */
1976+static blocknr_set_entry *bse_alloc(void)
1977+{
1978+ blocknr_set_entry *e;
1979+
1980+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
1981+ reiser4_ctx_gfp_mask_get())) == NULL)
1982+ return NULL;
1983+
1984+ bse_init(e);
1985+
1986+ return e;
1987+}
1988+
1989+/* Free a blocknr_set_entry. */
1990+/* Audited by: green(2002.06.11) */
1991+static void bse_free(blocknr_set_entry * bse)
1992+{
1993+ kfree(bse);
1994+}
1995+
1996+/* Add a block number to a blocknr_set_entry */
1997+/* Audited by: green(2002.06.11) */
1998+static void
1999+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2000+{
2001+ assert("jmacd-5099", bse_avail(bse) >= 1);
2002+
2003+ bse->entries[bse->nr_singles++] = *block;
2004+}
2005+
2006+/* Get a pair of block numbers */
2007+/* Audited by: green(2002.06.11) */
2008+static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
2009+ unsigned pno)
2010+{
2011+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2012+
2013+ return (struct blocknr_pair *) (bse->entries +
2014+ BLOCKNR_SET_ENTRIES_NUMBER -
2015+ 2 * (pno + 1));
2016+}
2017+
2018+/* Add a pair of block numbers to a blocknr_set_entry */
2019+/* Audited by: green(2002.06.11) */
2020+static void
2021+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2022+ const reiser4_block_nr * b)
2023+{
2023+{
2024+ struct blocknr_pair *pair;
2025+
2026+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2027+
2028+ pair = bse_get_pair(bse, bse->nr_pairs++);
2029+
2030+ pair->a = *a;
2031+ pair->b = *b;
2032+}
2033+
2034+/* Add either a block or pair of blocks to the block number set. The first
2035+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2036+ @b is non-NULL a pair is added. The block number set belongs to atom, and
2037+ the call is made with the atom lock held. There may not be enough space in
2038+ the current blocknr_set_entry. If new_bsep points to a non-NULL
2039+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2040+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
2041+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
2042+ returned with the atom unlocked for the operation to be tried again. If
2043+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2044+ used during the call, it will be freed automatically. */
2045+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2046+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2047+ const reiser4_block_nr *b)
2048+{
2049+ blocknr_set_entry *bse;
2050+ unsigned entries_needed;
2051+
2052+ assert("jmacd-5101", a != NULL);
2053+
2054+ entries_needed = (b == NULL) ? 1 : 2;
2055+ if (list_empty(bset) ||
2056+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2057+ /* See if a bse was previously allocated. */
2058+ if (*new_bsep == NULL) {
2059+ spin_unlock_atom(atom);
2060+ *new_bsep = bse_alloc();
2061+ return (*new_bsep != NULL) ? -E_REPEAT :
2062+ RETERR(-ENOMEM);
2063+ }
2064+
2065+ /* Put it on the head of the list. */
2066+ list_add(&((*new_bsep)->link), bset);
2067+
2068+ *new_bsep = NULL;
2069+ }
2070+
2071+ /* Add the single or pair. */
2072+ bse = list_entry(bset->next, blocknr_set_entry, link);
2073+ if (b == NULL) {
2074+ bse_put_single(bse, a);
2075+ } else {
2076+ bse_put_pair(bse, a, b);
2077+ }
2078+
2079+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2080+ if (*new_bsep != NULL) {
2081+ bse_free(*new_bsep);
2082+ *new_bsep = NULL;
2083+ }
2084+
2085+ return 0;
2086+}
2087+
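In practice the calling convention described above blocknr_set_add() looks like the following sketch (an editorial addition, not part of the patch; it mirrors the do/while loop already used by reiser4_dealloc_blocks() earlier in this patch):

static int add_extent_sketch(const reiser4_block_nr *start,
			     const reiser4_block_nr *len)
{
	txn_atom *atom;
	blocknr_set_entry *bsep = NULL;
	int ret;

	do {
		/* on -E_REPEAT the atom lock was dropped inside
		   blocknr_set_add() while a fresh entry was allocated,
		   so re-take it; the loop spins at most twice */
		atom = get_current_atom_locked();
		ret = blocknr_set_add_extent(atom, &atom->delete_set,
					     &bsep, start, len);
	} while (ret == -E_REPEAT);

	if (ret != 0)
		return ret;	/* -ENOMEM; atom is already unlocked */
	spin_unlock_atom(atom);
	return 0;
}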
2088+/* Add an extent to the block set. If the length is 1, it is treated as a
2089+ single block (e.g., reiser4_set_add_block). */
2090+/* Audited by: green(2002.06.11) */
2091+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2092+ kmalloc might schedule. The only exception is atom spinlock, which is
2093+ properly freed. */
2094+int
2095+blocknr_set_add_extent(txn_atom * atom,
2096+ struct list_head * bset,
2097+ blocknr_set_entry ** new_bsep,
2098+ const reiser4_block_nr * start,
2099+ const reiser4_block_nr * len)
2100+{
2101+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2102+ return blocknr_set_add(atom, bset, new_bsep, start,
2103+ *len == 1 ? NULL : len);
2104+}
2105+
2106+/* Add a block pair to the block set. It adds exactly a pair, which is checked
2107+ * by an assertion that both arguments are not null.*/
2108+/* Audited by: green(2002.06.11) */
2109+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2110+ kmalloc might schedule. The only exception is atom spinlock, which is
2111+ properly freed. */
2112+int
2113+blocknr_set_add_pair(txn_atom * atom,
2114+ struct list_head * bset,
2115+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2116+ const reiser4_block_nr * b)
2117+{
2118+ assert("jmacd-5103", a != NULL && b != NULL);
2119+ return blocknr_set_add(atom, bset, new_bsep, a, b);
2120+}
2121+
2122+/* Initialize a blocknr_set. */
2123+void blocknr_set_init(struct list_head *bset)
2124+{
2125+ INIT_LIST_HEAD(bset);
2126+}
2127+
2128+/* Release the entries of a blocknr_set. */
2129+void blocknr_set_destroy(struct list_head *bset)
2130+{
2131+ blocknr_set_entry *bse;
2132+
2133+ while (!list_empty(bset)) {
2134+ bse = list_entry(bset->next, blocknr_set_entry, link);
2135+ list_del_init(&bse->link);
2136+ bse_free(bse);
2137+ }
2138+}
2139+
2140+/* Merge blocknr_set entries out of @from into @into. */
2141+/* Audited by: green(2002.06.11) */
2142+/* Auditor comments: This merge does not know whether the merged sets contain
2143+ block pairs (as for wandered sets) or extents, so it cannot really merge
2144+ overlapping ranges if there are any. I believe this may lead to some
2145+ blocks being represented several times in one blocknr_set. To help debug
2146+ such problems it might help to check for duplicate entries on actual
2147+ processing of this set. Testing this kind of thing right here is also
2148+ complicated by the fact that these sets are not sorted, and going through
2149+ the whole set on each element addition would be a CPU-heavy task */
2150+void blocknr_set_merge(struct list_head * from, struct list_head * into)
2151+{
2152+ blocknr_set_entry *bse_into = NULL;
2153+
2154+ /* If @from is empty, no work to perform. */
2155+ if (list_empty(from))
2156+ return;
2157+ /* If @into is not empty, try merging partial-entries. */
2158+ if (!list_empty(into)) {
2159+
2160+ /* Neither set is empty, pop the front two members and try to combine them. */
2161+ blocknr_set_entry *bse_from;
2162+ unsigned into_avail;
2163+
2164+ bse_into = list_entry(into->next, blocknr_set_entry, link);
2165+ list_del_init(&bse_into->link);
2166+ bse_from = list_entry(from->next, blocknr_set_entry, link);
2167+ list_del_init(&bse_from->link);
2168+
2169+ /* Combine singles. */
2170+ for (into_avail = bse_avail(bse_into);
2171+ into_avail != 0 && bse_from->nr_singles != 0;
2172+ into_avail -= 1) {
2173+ bse_put_single(bse_into,
2174+ &bse_from->entries[--bse_from->
2175+ nr_singles]);
2176+ }
2177+
2178+ /* Combine pairs. */
2179+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
2180+ into_avail -= 2) {
2181+ struct blocknr_pair *pair =
2182+ bse_get_pair(bse_from, --bse_from->nr_pairs);
2183+ bse_put_pair(bse_into, &pair->a, &pair->b);
2184+ }
2185+
2186+ /* If bse_from is empty, delete it now. */
2187+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2188+ bse_free(bse_from);
2189+ } else {
2190+ /* Otherwise, bse_into is full or nearly full (e.g.,
2191+ it could have one slot avail and bse_from has one
2192+ pair left). Push it back onto the list. bse_from
2193+ becomes bse_into, which will be the new partial. */
2194+ list_add(&bse_into->link, into);
2195+ bse_into = bse_from;
2196+ }
2197+ }
2198+
2199+ /* Splice lists together. */
2200+ list_splice_init(from, into->prev);
2201+
2202+ /* Add the partial entry back to the head of the list. */
2203+ if (bse_into != NULL)
2204+ list_add(&bse_into->link, into);
2205+}
2206+
2207+/* Iterate over all blocknr set elements. */
2208+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2209+ blocknr_set_actor_f actor, void *data, int delete)
2210+{
2211+
2212+ blocknr_set_entry *entry;
2213+
2214+ assert("zam-429", atom != NULL);
2215+ assert("zam-430", atom_is_protected(atom));
2216+ assert("zam-431", bset != 0);
2217+ assert("zam-432", actor != NULL);
2218+
2219+ entry = list_entry(bset->next, blocknr_set_entry, link);
2220+ while (bset != &entry->link) {
2221+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2222+ unsigned int i;
2223+ int ret;
2224+
2225+ for (i = 0; i < entry->nr_singles; i++) {
2226+ ret = actor(atom, &entry->entries[i], NULL, data);
2227+
2228+ /* We can't break a loop if delete flag is set. */
2229+ if (ret != 0 && !delete)
2230+ return ret;
2231+ }
2232+
2233+ for (i = 0; i < entry->nr_pairs; i++) {
2234+ struct blocknr_pair *ab;
2235+
2236+ ab = bse_get_pair(entry, i);
2237+
2238+ ret = actor(atom, &ab->a, &ab->b, data);
2239+
2240+ if (ret != 0 && !delete)
2241+ return ret;
2242+ }
2243+
2244+ if (delete) {
2245+ list_del(&entry->link);
2246+ bse_free(entry);
2247+ }
2248+
2249+ entry = tmp;
2250+ }
2251+
2252+ return 0;
2253+}
2254+
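For illustration, a trivial actor (an editorial addition, not part of the patch) that totals the blocks queued in a delete set, interpreting a pair as a (start, length) extent exactly as apply_dset() in block_alloc.c does:

static int count_blocks_actor(txn_atom *atom UNUSED_ARG,
			      const reiser4_block_nr *a UNUSED_ARG,
			      const reiser4_block_nr *b, void *data)
{
	__u64 *total = data;

	/* singles arrive with b == NULL; extents carry their length in *b */
	*total += (b == NULL) ? 1 : *b;
	return 0;
}

A caller holding the atom lock would then invoke blocknr_set_iterator(atom, &atom->delete_set, count_blocks_actor, &total, 0), passing delete as 0 so the set is left intact.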
2255+/*
2256+ * Local variables:
2257+ * c-indentation-style: "K&R"
2258+ * mode-name: "LC"
2259+ * c-basic-offset: 8
2260+ * tab-width: 8
2261+ * fill-column: 79
2262+ * scroll-step: 1
2263+ * End:
2264+ */
2265diff -urN linux-2.6.22.orig/fs/reiser4/carry.c linux-2.6.22/fs/reiser4/carry.c
2266--- linux-2.6.22.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2267+++ linux-2.6.22/fs/reiser4/carry.c 2007-07-29 00:25:34.820681982 +0400
2268@@ -0,0 +1,1391 @@
2269+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2270+/* Functions to "carry" tree modification(s) upward. */
2271+/* Tree is modified one level at a time. As we modify a level we accumulate a
2272+ set of changes that need to be propagated to the next level. We manage
2273+ node locking such that any searches that collide with carrying are
2274+ restarted, from the root if necessary.
2275+
2276+ Insertion of a new item may result in items being moved among nodes and
2277+ this requires the delimiting key to be updated at the least common parent
2278+ of the nodes modified to preserve search tree invariants. Also, insertion
2279+ may require allocation of a new node. A pointer to the new node has to be
2280+ inserted into some node on the parent level, etc.
2281+
2282+ Tree carrying is meant to be analogous to arithmetic carrying.
2283+
2284+ A carry operation is always associated with some node (&carry_node).
2285+
2286+ Carry process starts with some initial set of operations to be performed
2287+ and an initial set of already locked nodes. Operations are performed one
2288+ by one. Performing each single operation has the following possible effects:
2289+
2290+ - content of carry node associated with operation is modified
2291+ - new carry nodes are locked and involved into carry process on this level
2292+ - new carry operations are posted to the next level
2293+
2294+ After all carry operations on this level are done, process is repeated for
2295+ the accumulated sequence on carry operations for the next level. This
2296+ starts by trying to lock (in left to right order) all carry nodes
2297+ associated with carry operations on the parent level. After this, we decide
2298+ whether more nodes are required on the left of already locked set. If so,
2299+ all locks taken on the parent level are released, new carry nodes are
2300+ added, and locking process repeats.
2301+
2302+ It may happen that the balancing process fails owing to an unrecoverable error
2303+ on some of the upper levels of the tree (possible causes are an I/O error, failure to
2304+ allocate new node, etc.). In this case we should unmount the filesystem,
2305+ rebooting if it is the root, and possibly advise the use of fsck.
2306+
2307+ USAGE:
2308+
2309+ int some_tree_operation( znode *node, ... )
2310+ {
2311+ // Allocate on a stack pool of carry objects: operations and nodes.
2312+ // Most carry processes will only take objects from here, without
2313+ // dynamic allocation.
2314+
2315+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2316+
2317+ carry_pool pool;
2318+ carry_level lowest_level;
2319+ carry_op *op;
2320+
2321+ init_carry_pool( &pool );
2322+ init_carry_level( &lowest_level, &pool );
2323+
2324+ // operation may be one of:
2325+ // COP_INSERT --- insert new item into node
2326+ // COP_CUT --- remove part of or whole node
2327+ // COP_PASTE --- increase size of item
2328+ // COP_DELETE --- delete pointer from parent node
2329+ // COP_UPDATE --- update delimiting key in least
2330+ // common ancestor of two
2331+
2332+ op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2333+ if( IS_ERR( op ) || ( op == NULL ) ) {
2334+ handle error
2335+ } else {
2336+ // fill in remaining fields in @op, according to carry.h:carry_op
2337+ result = carry( &lowest_level, NULL );
2338+ }
2339+ done_carry_pool( &pool );
2340+ }
2341+
2342+ When you are implementing node plugin method that participates in carry
2343+ (shifting, insertion, deletion, etc.), do the following:
2344+
2345+ int foo_node_method( znode *node, ..., carry_level *todo )
2346+ {
2347+ carry_op *op;
2348+
2349+ ....
2350+
2351+ // note that the last argument to reiser4_post_carry() is non-null
2352+ // here, because @op is to be applied to the parent of @node, rather
2353+ // than to the @node itself as in the previous case.
2354+
2355+ op = node_post_carry( todo, operation, node, 1 );
2356+ // fill in remaining fields in @op, according to carry.h:carry_op
2357+
2358+ ....
2359+
2360+ }
2361+
2362+ BATCHING:
2363+
2364+ One of the main advantages of the level-by-level balancing implemented here
2365+ is the ability to batch updates on a parent level and to perform them more
2366+ efficiently as a result.
2367+
2368+ Description To Be Done (TBD).
2369+
2370+ DIFFICULTIES AND SUBTLE POINTS:
2371+
2372+ 1. complex plumbing is required, because:
2373+
2374+ a. effective allocation through pools is needed
2375+
2376+ b. target of operation is not exactly known when operation is
2377+ posted. This is worked around through bitfields in &carry_node and
2378+ logic in lock_carry_node()
2379+
2380+ c. of interaction with locking code: node should be added into sibling
2381+ list when pointer to it is inserted into its parent, which is some time
2382+ after node was created. Between these moments, node is somewhat in
2383+ suspended state and is only registered in the carry lists
2384+
2385+ 2. whole balancing logic is implemented here, in particular, insertion
2386+ logic is coded in make_space().
2387+
2388+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2389+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2390+ (insert_paste()) have to be handled.
2391+
2392+ 4. there is non-trivial interdependency between allocation of new nodes
2393+ and almost everything else. This is mainly due to the (1.c) above. I shall
2394+ write about this later.
2395+
2396+*/
2397+
2398+#include "forward.h"
2399+#include "debug.h"
2400+#include "key.h"
2401+#include "coord.h"
2402+#include "plugin/item/item.h"
2403+#include "plugin/item/extent.h"
2404+#include "plugin/node/node.h"
2405+#include "jnode.h"
2406+#include "znode.h"
2407+#include "tree_mod.h"
2408+#include "tree_walk.h"
2409+#include "block_alloc.h"
2410+#include "pool.h"
2411+#include "tree.h"
2412+#include "carry.h"
2413+#include "carry_ops.h"
2414+#include "super.h"
2415+#include "reiser4.h"
2416+
2417+#include <linux/types.h>
2418+
2419+/* level locking/unlocking */
2420+static int lock_carry_level(carry_level * level);
2421+static void unlock_carry_level(carry_level * level, int failure);
2422+static void done_carry_level(carry_level * level);
2423+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2424+
2425+int lock_carry_node(carry_level * level, carry_node * node);
2426+int lock_carry_node_tail(carry_node * node);
2427+
2428+/* carry processing proper */
2429+static int carry_on_level(carry_level * doing, carry_level * todo);
2430+
2431+static carry_op *add_op(carry_level * level, pool_ordering order,
2432+ carry_op * reference);
2433+
2434+/* handlers for carry operations. */
2435+
2436+static void fatal_carry_error(carry_level * doing, int ecode);
2437+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2438+
2439+static void print_level(const char *prefix, carry_level * level);
2440+
2441+#if REISER4_DEBUG
2442+typedef enum {
2443+ CARRY_TODO,
2444+ CARRY_DOING
2445+} carry_queue_state;
2446+static int carry_level_invariant(carry_level * level, carry_queue_state state);
2447+#endif
2448+
2449+/* main entry point for tree balancing.
2450+
2451+ Tree carry performs operations from @doing and while doing so accumulates
2452+ information about operations to be performed on the next level ("carried"
2453+ to the parent level). Carried operations are performed, causing possibly
2454+ more operations to be carried upward etc. carry() takes care about
2455+ locking and pinning znodes while operating on them.
2456+
2457+ For usage, see comment at the top of fs/reiser4/carry.c
2458+
2459+*/
2460+int reiser4_carry(carry_level * doing /* set of carry operations to be
2461+ * performed */ ,
2462+ carry_level * done /* set of nodes, already performed
2463+ * at the previous level.
2464+ * NULL in most cases */)
2465+{
2466+ int result = 0;
2467+ /* queue of new requests */
2468+ carry_level *todo;
2469+ ON_DEBUG(STORE_COUNTERS);
2470+
2471+ assert("nikita-888", doing != NULL);
2472+ BUG_ON(done != NULL);
2473+
2474+ todo = doing + 1;
2475+ init_carry_level(todo, doing->pool);
2476+
2477+ /* queue of requests performed on the previous level */
2478+ done = todo + 1;
2479+ init_carry_level(done, doing->pool);
2480+
2481+ /* iterate until there is nothing more to do */
2482+ while (result == 0 && doing->ops_num > 0) {
2483+ carry_level *tmp;
2484+
2485+ /* at this point @done is locked. */
2486+ /* repeat lock/do/unlock while
2487+
2488+ (1) lock_carry_level() fails due to deadlock avoidance, or
2489+
2490+ (2) carry_on_level() decides that more nodes have to
2491+ be involved.
2492+
2493+ (3) some unexpected error occurred while balancing on the
2494+ upper levels. In this case all changes are rolled back.
2495+
2496+ */
2497+ while (1) {
2498+ result = lock_carry_level(doing);
2499+ if (result == 0) {
2500+ /* perform operations from @doing and
2501+ accumulate new requests in @todo */
2502+ result = carry_on_level(doing, todo);
2503+ if (result == 0)
2504+ break;
2505+ else if (result != -E_REPEAT ||
2506+ !doing->restartable) {
2507+ warning("nikita-1043",
2508+ "Fatal error during carry: %i",
2509+ result);
2510+ print_level("done", done);
2511+ print_level("doing", doing);
2512+ print_level("todo", todo);
2513+ /* do some rough stuff like aborting
2514+ all pending transcrashes and thus
2515+ pushing tree back to the consistent
2516+ state. Alternatively, just panic.
2517+ */
2518+ fatal_carry_error(doing, result);
2519+ return result;
2520+ }
2521+ } else if (result != -E_REPEAT) {
2522+ fatal_carry_error(doing, result);
2523+ return result;
2524+ }
2525+ unlock_carry_level(doing, 1);
2526+ }
2527+ /* at this point @done can be safely unlocked */
2528+ done_carry_level(done);
2529+
2530+ /* cyclically shift queues */
2531+ tmp = done;
2532+ done = doing;
2533+ doing = todo;
2534+ todo = tmp;
2535+ init_carry_level(todo, doing->pool);
2536+
2537+ /* give other threads chance to run */
2538+ reiser4_preempt_point();
2539+ }
2540+ done_carry_level(done);
2541+
2542+ /* all counters, but x_refs should remain the same. x_refs can change
2543+ owing to transaction manager */
2544+ ON_DEBUG(CHECK_COUNTERS);
2545+ return result;
2546+}
2547+
2548+/* perform carry operations on given level.
2549+
2550+ Optimizations proposed by pooh:
2551+
2552+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2553+ required;
2554+
2555+ (2) unlock node if there are no more operations to be performed upon it and
2556+ node didn't add any operation to @todo. This can be implemented by
2557+ attaching to each node two counters: counter of operaions working on this
2558+ node and counter and operations carried upward from this node.
2559+
2560+*/
2561+static int carry_on_level(carry_level * doing /* queue of carry operations to
2562+ * do on this level */ ,
2563+ carry_level * todo /* queue where new carry
2564+ * operations to be performed on
2565+ * the * parent level are
2566+ * accumulated during @doing
2567+ * processing. */ )
2568+{
2569+ int result;
2570+ int (*f) (carry_op *, carry_level *, carry_level *);
2571+ carry_op *op;
2572+ carry_op *tmp_op;
2573+
2574+ assert("nikita-1034", doing != NULL);
2575+ assert("nikita-1035", todo != NULL);
2576+
2577+ /* @doing->nodes are locked. */
2578+
2579+ /* This function can be split into two phases: analysis and modification.
2580+
2581+ Analysis calculates precisely what items should be moved between
2582+ nodes. This information is gathered in some structures attached to
2583+ each carry_node in a @doing queue. Analysis also determines whether
2584+ new nodes are to be allocated etc.
2585+
2586+ After analysis is completed, actual modification is performed. Here
2587+ we can take advantage of "batch modification": if there are several
2588+ operations acting on the same node, modifications can be performed
2589+ more efficiently when batched together.
2590+
2591+ Above is an optimization left for the future.
2592+ */
2593+ /* Important, but delayed optimization: it's possible to batch
2594+ operations together and perform them more efficiently as a
2595+ result. For example, deletion of several neighboring items from a
2596+ node can be converted to a single ->cut() operation.
2597+
2598+ Before processing the queue, it should be scanned and "mergeable"
2599+ operations merged.
2600+ */
2601+ result = 0;
2602+ for_all_ops(doing, op, tmp_op) {
2603+ carry_opcode opcode;
2604+
2605+ assert("nikita-1041", op != NULL);
2606+ opcode = op->op;
2607+ assert("nikita-1042", op->op < COP_LAST_OP);
2608+ f = op_dispatch_table[op->op].handler;
2609+ result = f(op, doing, todo);
2610+ /* locking can fail with -E_REPEAT. Any different error is fatal
2611+ and will be handled by fatal_carry_error() sledgehammer.
2612+ */
2613+ if (result != 0)
2614+ break;
2615+ }
2616+ if (result == 0) {
2617+ carry_plugin_info info;
2618+ carry_node *scan;
2619+ carry_node *tmp_scan;
2620+
2621+ info.doing = doing;
2622+ info.todo = todo;
2623+
2624+ assert("nikita-3002",
2625+ carry_level_invariant(doing, CARRY_DOING));
2626+ for_all_nodes(doing, scan, tmp_scan) {
2627+ znode *node;
2628+
2629+ node = reiser4_carry_real(scan);
2630+ assert("nikita-2547", node != NULL);
2631+ if (node_is_empty(node)) {
2632+ result =
2633+ node_plugin_by_node(node)->
2634+ prepare_removal(node, &info);
2635+ if (result != 0)
2636+ break;
2637+ }
2638+ }
2639+ }
2640+ return result;
2641+}
2642+
2643+/* post carry operation
2644+
2645+ This is main function used by external carry clients: node layout plugins
2646+ and tree operations to create new carry operation to be performed on some
2647+ level.
2648+
2649+ New operation will be included in the @level queue. To actually perform it,
2650+ call carry( level, ... ). This function takes write lock on @node. Carry
2651+ manages all its locks by itself, don't worry about this.
2652+
2653+ This function adds operation and node at the end of the queue. It is up to
2654+ caller to guarantee proper ordering of node queue.
2655+
2656+*/
2657+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2658+ * is to be posted at */ ,
2659+ carry_opcode op /* opcode of operation */ ,
2660+ znode * node /* node on which this operation
2661+ * will operate */ ,
2662+ int apply_to_parent_p /* whether operation will
2663+ * operate directly on @node
2664+ * or on its parent. */)
2665+{
2666+ carry_op *result;
2667+ carry_node *child;
2668+
2669+ assert("nikita-1046", level != NULL);
2670+ assert("nikita-1788", znode_is_write_locked(node));
2671+
2672+ result = add_op(level, POOLO_LAST, NULL);
2673+ if (IS_ERR(result))
2674+ return result;
2675+ child = reiser4_add_carry(level, POOLO_LAST, NULL);
2676+ if (IS_ERR(child)) {
2677+ reiser4_pool_free(&level->pool->op_pool, &result->header);
2678+ return (carry_op *) child;
2679+ }
2680+ result->node = child;
2681+ result->op = op;
2682+ child->parent = apply_to_parent_p;
2683+ if (ZF_ISSET(node, JNODE_ORPHAN))
2684+ child->left_before = 1;
2685+ child->node = node;
2686+ return result;
2687+}
2688+
2689+/* initialize carry queue */
2690+void init_carry_level(carry_level * level /* level to initialize */ ,
2691+ carry_pool * pool /* pool @level will allocate objects
2692+ * from */ )
2693+{
2694+ assert("nikita-1045", level != NULL);
2695+ assert("nikita-967", pool != NULL);
2696+
2697+ memset(level, 0, sizeof *level);
2698+ level->pool = pool;
2699+
2700+ INIT_LIST_HEAD(&level->nodes);
2701+ INIT_LIST_HEAD(&level->ops);
2702+}
2703+
2704+/* allocate carry pool and initialize pools within queue */
2705+carry_pool *init_carry_pool(int size)
2706+{
2707+ carry_pool *pool;
2708+
2709+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2710+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2711+ if (pool == NULL)
2712+ return ERR_PTR(RETERR(-ENOMEM));
2713+
2714+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2715+ (char *)pool->op);
2716+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2717+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2718+ return pool;
2719+}
2720+
2721+/* finish with queue pools */
2722+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2723+{
2724+ reiser4_done_pool(&pool->op_pool);
2725+ reiser4_done_pool(&pool->node_pool);
2726+ kfree(pool);
2727+}
2728+
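The size assertion in init_carry_pool() and the "doing + 1" / "todo + 1" arithmetic in reiser4_carry() imply that callers allocate three carry_levels directly behind the carry_pool. A caller-side sketch of that convention (an editorial addition, not part of the patch; the error handling for a failed post is a placeholder):

static int carry_sketch(znode *node, carry_opcode opcode)
{
	carry_pool *pool;
	carry_level *lowest_level;
	carry_op *op;
	int ret;

	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	lowest_level = (carry_level *) (pool + 1);	/* levels follow pool */
	init_carry_level(lowest_level, pool);

	op = reiser4_post_carry(lowest_level, opcode, node, 0);
	if (IS_ERR(op) || op == NULL)
		ret = op ? PTR_ERR(op) : RETERR(-ENOMEM);	/* placeholder */
	else
		ret = reiser4_carry(lowest_level, NULL);
	done_carry_pool(pool);
	return ret;
}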
2729+/* add new carry node to the @level.
2730+
2731+ Returns pointer to the new carry node allocated from pool. It's up to
2732+ callers to maintain proper order in the @level. Assumption is that if carry
2733+ nodes on one level are already sorted and modifications are performed from
2734+ left to right, carry nodes added on the parent level will be ordered
2735+ automatically. To control ordering use @order and @reference parameters.
2736+
2737+*/
2738+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2739+ * node to */ ,
2740+ pool_ordering order /* where to insert:
2741+ * at the beginning of
2742+ * @level,
2743+ * before @reference,
2744+ * after @reference,
2745+ * at the end of @level
2746+ */ ,
2747+ carry_node * reference/* reference node for
2748+ * insertion */)
2749+{
2750+ ON_DEBUG(carry_node * orig_ref = reference);
2751+
2752+ if (order == POOLO_BEFORE) {
2753+ reference = find_left_carry(reference, level);
2754+ if (reference == NULL)
2755+ reference = list_entry(level->nodes.next, carry_node,
2756+ header.level_linkage);
2757+ else
2758+ reference = list_entry(reference->header.level_linkage.next,
2759+ carry_node, header.level_linkage);
2760+ } else if (order == POOLO_AFTER) {
2761+ reference = find_right_carry(reference, level);
2762+ if (reference == NULL)
2763+ reference = list_entry(level->nodes.prev, carry_node,
2764+ header.level_linkage);
2765+ else
2766+ reference = list_entry(reference->header.level_linkage.prev,
2767+ carry_node, header.level_linkage);
2768+ }
2769+ assert("nikita-2209",
2770+ ergo(orig_ref != NULL,
2771+ reiser4_carry_real(reference) ==
2772+ reiser4_carry_real(orig_ref)));
2773+ return reiser4_add_carry(level, order, reference);
2774+}
2775+
2776+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2777+ * to */ ,
2778+ pool_ordering order /* where to insert: at the
2779+ * beginning of @level, before
2780+ * @reference, after @reference,
2781+ * at the end of @level */ ,
2782+ carry_node * reference /* reference node for
2783+ * insertion */ )
2784+{
2785+ carry_node *result;
2786+
2787+ result =
2788+ (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2789+ &level->nodes,
2790+ order, &reference->header);
2791+ if (!IS_ERR(result) && (result != NULL))
2792+ ++level->nodes_num;
2793+ return result;
2794+}
2795+
2796+/* add new carry operation to the @level.
2797+
2798+ Returns pointer to the new carry operations allocated from pool. It's up to
2799+ callers to maintain proper order in the @level. To control ordering use
2800+ @order and @reference parameters.
2801+
2802+*/
2803+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2804+ pool_ordering order /* where to insert: at the beginning of
2805+ * @level, before @reference, after
2806+ * @reference, at the end of @level */ ,
2807+ carry_op *
2808+ reference /* reference node for insertion */ )
2809+{
2810+ carry_op *result;
2811+
2812+ result =
2813+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2814+ order, &reference->header);
2815+ if (!IS_ERR(result) && (result != NULL))
2816+ ++level->ops_num;
2817+ return result;
2818+}
2819+
2820+/* Return node on the right of which @node was created.
2821+
2822+ Each node is created on the right of some existing node (or it is new root,
2823+ which is special case not handled here).
2824+
2825+ @node is new node created on some level, but not yet inserted into its
2826+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2827+
2828+*/
2829+static carry_node *find_begetting_brother(carry_node * node /* node to start search
2830+ * from */ ,
2831+ carry_level * kin UNUSED_ARG /* level to
2832+ * scan */ )
2833+{
2834+ carry_node *scan;
2835+
2836+ assert("nikita-1614", node != NULL);
2837+ assert("nikita-1615", kin != NULL);
2838+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2839+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2840+ ZF_ISSET(reiser4_carry_real(node),
2841+ JNODE_ORPHAN)));
2842+ for (scan = node;;
2843+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
2844+ header.level_linkage)) {
2845+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2846+ if ((scan->node != node->node) &&
2847+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2848+ assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2849+ break;
2850+ }
2851+ }
2852+ return scan;
2853+}
2854+
2855+static cmp_t
2856+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2857+{
2858+ assert("nikita-2199", n1 != NULL);
2859+ assert("nikita-2200", n2 != NULL);
2860+
2861+ if (n1 == n2)
2862+ return EQUAL_TO;
2863+ while (1) {
2864+ n1 = carry_node_next(n1);
2865+ if (carry_node_end(level, n1))
2866+ return GREATER_THAN;
2867+ if (n1 == n2)
2868+ return LESS_THAN;
2869+ }
2870+ impossible("nikita-2201", "End of level reached");
2871+}
2872+
2873+carry_node *find_carry_node(carry_level * level, const znode * node)
2874+{
2875+ carry_node *scan;
2876+ carry_node *tmp_scan;
2877+
2878+ assert("nikita-2202", level != NULL);
2879+ assert("nikita-2203", node != NULL);
2880+
2881+ for_all_nodes(level, scan, tmp_scan) {
2882+ if (reiser4_carry_real(scan) == node)
2883+ return scan;
2884+ }
2885+ return NULL;
2886+}
2887+
2888+znode *reiser4_carry_real(const carry_node * node)
2889+{
2890+ assert("nikita-3061", node != NULL);
2891+
2892+ return node->lock_handle.node;
2893+}
2894+
2895+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2896+ const znode * node)
2897+{
2898+ carry_node *base;
2899+ carry_node *scan;
2900+ carry_node *tmp_scan;
2901+ carry_node *proj;
2902+
2903+ base = find_carry_node(doing, node);
2904+ assert("nikita-2204", base != NULL);
2905+
2906+ for_all_nodes(todo, scan, tmp_scan) {
2907+ proj = find_carry_node(doing, scan->node);
2908+ assert("nikita-2205", proj != NULL);
2909+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2910+ break;
2911+ }
2912+ return scan;
2913+}
2914+
2915+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2916+ znode * node)
2917+{
2918+ carry_node *reference;
2919+
2920+ assert("nikita-2994", doing != NULL);
2921+ assert("nikita-2995", todo != NULL);
2922+ assert("nikita-2996", node != NULL);
2923+
2924+ reference = insert_carry_node(doing, todo, node);
2925+ assert("nikita-2997", reference != NULL);
2926+
2927+ return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2928+}
2929+
2930+/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2931+ This function is different from reiser4_post_carry() in that it finds proper
2932+ place to insert node in the queue. */
2933+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2934+ * passed down to node
2935+ * plugin */ ,
2936+ carry_opcode op /* opcode of operation */ ,
2937+ znode * node /* node on which this
2938+ * operation will operate */ ,
2939+ int apply_to_parent_p /* whether operation will
2940+ * operate directly on @node
2941+ * or on its parent. */ )
2942+{
2943+ carry_op *result;
2944+ carry_node *child;
2945+
2946+ assert("nikita-2207", info != NULL);
2947+ assert("nikita-2208", info->todo != NULL);
2948+
2949+ if (info->doing == NULL)
2950+ return reiser4_post_carry(info->todo, op, node,
2951+ apply_to_parent_p);
2952+
2953+ result = add_op(info->todo, POOLO_LAST, NULL);
2954+ if (IS_ERR(result))
2955+ return result;
2956+ child = add_carry_atplace(info->doing, info->todo, node);
2957+ if (IS_ERR(child)) {
2958+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2959+ return (carry_op *) child;
2960+ }
2961+ result->node = child;
2962+ result->op = op;
2963+ child->parent = apply_to_parent_p;
2964+ if (ZF_ISSET(node, JNODE_ORPHAN))
2965+ child->left_before = 1;
2966+ child->node = node;
2967+ return result;
2968+}
2969+
2970+/* lock all carry nodes in @level */
2971+static int lock_carry_level(carry_level * level /* level to lock */ )
2972+{
2973+ int result;
2974+ carry_node *node;
2975+ carry_node *tmp_node;
2976+
2977+ assert("nikita-881", level != NULL);
2978+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
2979+
2980+ /* lock nodes from left to right */
2981+ result = 0;
2982+ for_all_nodes(level, node, tmp_node) {
2983+ result = lock_carry_node(level, node);
2984+ if (result != 0)
2985+ break;
2986+ }
2987+ return result;
2988+}
2989+
2990+/* Synchronize delimiting keys between @node and its left neighbor.
2991+
2992+ To reduce contention on dk key and simplify carry code, we synchronize
2993+ delimiting keys only when carry ultimately leaves tree level (carrying
2994+ changes upward) and unlocks nodes at this level.
2995+
2996+ This function first finds left neighbor of @node and then updates left
2997+ neighbor's right delimiting key to coincide with the least key in @node.
2998+
2999+*/
3000+
3001+ON_DEBUG(extern atomic_t delim_key_version;
3002+ )
3003+
3004+static void sync_dkeys(znode * spot /* node to update */ )
3005+{
3006+ reiser4_key pivot;
3007+ reiser4_tree *tree;
3008+
3009+ assert("nikita-1610", spot != NULL);
3010+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3011+
3012+ tree = znode_get_tree(spot);
3013+ read_lock_tree(tree);
3014+ write_lock_dk(tree);
3015+
3016+ assert("nikita-2192", znode_is_loaded(spot));
3017+
3018+ /* sync left delimiting key of @spot with key in its leftmost item */
3019+ if (node_is_empty(spot))
3020+ pivot = *znode_get_rd_key(spot);
3021+ else
3022+ leftmost_key_in_node(spot, &pivot);
3023+
3024+ znode_set_ld_key(spot, &pivot);
3025+
3026+ /* there can be sequence of empty nodes pending removal on the left of
3027+ @spot. Scan them and update their left and right delimiting keys to
3028+ match left delimiting key of @spot. Also, update right delimiting
3029+ key of first non-empty left neighbor.
3030+ */
3031+ while (1) {
3032+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3033+ break;
3034+
3035+ spot = spot->left;
3036+ if (spot == NULL)
3037+ break;
3038+
3039+ znode_set_rd_key(spot, &pivot);
3040+ /* don't sink into the domain of another balancing */
3041+ if (!znode_is_write_locked(spot))
3042+ break;
3043+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3044+ znode_set_ld_key(spot, &pivot);
3045+ else
3046+ break;
3047+ }
3048+
3049+ write_unlock_dk(tree);
3050+ read_unlock_tree(tree);
3051+}
3052+
3053+/* unlock all carry nodes in @level */
3054+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3055+ int failure /* true if unlocking owing to
3056+ * failure */ )
3057+{
3058+ carry_node *node;
3059+ carry_node *tmp_node;
3060+
3061+ assert("nikita-889", level != NULL);
3062+
3063+ if (!failure) {
3064+ znode *spot;
3065+
3066+ spot = NULL;
3067+ /* update delimiting keys */
3068+ for_all_nodes(level, node, tmp_node) {
3069+ if (reiser4_carry_real(node) != spot) {
3070+ spot = reiser4_carry_real(node);
3071+ sync_dkeys(spot);
3072+ }
3073+ }
3074+ }
3075+
3076+ /* nodes can be unlocked in arbitrary order. In preemptible
3077+ environment it's better to unlock in reverse order of locking,
3078+ though.
3079+ */
3080+ for_all_nodes_back(level, node, tmp_node) {
3081+ /* all allocated nodes should be already linked to their
3082+ parents at this moment. */
3083+ assert("nikita-1631",
3084+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3085+ JNODE_ORPHAN)));
3086+ ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3087+ unlock_carry_node(level, node, failure);
3088+ }
3089+ level->new_root = NULL;
3090+}
3091+
3092+/* finish with @level
3093+
3094+ Unlock nodes and release all allocated resources */
3095+static void done_carry_level(carry_level * level /* level to finish */ )
3096+{
3097+ carry_node *node;
3098+ carry_node *tmp_node;
3099+ carry_op *op;
3100+ carry_op *tmp_op;
3101+
3102+ assert("nikita-1076", level != NULL);
3103+
3104+ unlock_carry_level(level, 0);
3105+ for_all_nodes(level, node, tmp_node) {
3106+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3107+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3108+ reiser4_pool_free(&level->pool->node_pool, &node->header);
3109+ }
3110+ for_all_ops(level, op, tmp_op)
3111+ reiser4_pool_free(&level->pool->op_pool, &op->header);
3112+}
3113+
3114+/* helper function to complete locking of carry node
3115+
3116+ Finish locking of carry node. There are several ways in which new carry
3117+ node can be added into carry level and locked. Normal is through
3118+ lock_carry_node(), but also from find_{left|right}_neighbor(). This
3119+ function factors out common final part of all locking scenarios. It
3120+ supposes that @node -> lock_handle is lock handle for lock just taken and
3121+ fills ->real_node from this lock handle.
3122+
3123+*/
3124+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3125+{
3126+ assert("nikita-1052", node != NULL);
3127+ assert("nikita-1187", reiser4_carry_real(node) != NULL);
3128+ assert("nikita-1188", !node->unlock);
3129+
3130+ node->unlock = 1;
3131+ /* Load node content into memory and install node plugin by
3132+ looking at the node header.
3133+
3134+ Most of the time this call is cheap because the node is
3135+ already in memory.
3136+
3137+ Corresponding zrelse() is in unlock_carry_node()
3138+ */
3139+ return zload(reiser4_carry_real(node));
3140+}
3141+
3142+/* lock carry node
3143+
3144+ "Resolve" node to real znode, lock it and mark as locked.
3145+ This requires recursive locking of znodes.
3146+
3147+ When an operation is posted to the parent level, the node it will be applied
3148+ to is not yet known. For example, when shifting data between two nodes,
3149+ the delimiting key has to be updated in the parent or parents of the nodes
3150+ involved. But their parents are not yet locked and, moreover, said nodes
3151+ can be reparented by concurrent balancing.
3152+
3153+ To work around this, carry operation is applied to special "carry node"
3154+ rather than to the znode itself. Carry node consists of some "base" or
3155+ "reference" znode and flags indicating how to get to the target of carry
3156+ operation (->real_node field of carry_node) from base.
3157+
3158+*/
3159+int lock_carry_node(carry_level * level /* level @node is in */ ,
3160+ carry_node * node /* node to lock */ )
3161+{
3162+ int result;
3163+ znode *reference_point;
3164+ lock_handle lh;
3165+ lock_handle tmp_lh;
3166+ reiser4_tree *tree;
3167+
3168+ assert("nikita-887", level != NULL);
3169+ assert("nikita-882", node != NULL);
3170+
3171+ result = 0;
3172+ reference_point = node->node;
3173+ init_lh(&lh);
3174+ init_lh(&tmp_lh);
3175+ if (node->left_before) {
3176+ /* handling of new nodes, allocated on the previous level:
3177+
3178+ some carry ops were probably posted from the new node, but
3179+ this node neither has parent pointer set, nor is
3180+ connected. This will be done in ->create_hook() for
3181+ internal item.
3182+
3183+ Nonetheless, the parent of the new node has to be locked. To do
3184+ this, first go to the "left" in the carry order. This
3185+ depends on the decision to always allocate new node on the
3186+ right of existing one.
3187+
3188+ Loop handles case when multiple nodes, all orphans, were
3189+ inserted.
3190+
3191+ Strictly speaking, taking tree lock is not necessary here,
3192+ because all nodes scanned by loop in
3193+ find_begetting_brother() are write-locked by this thread,
3194+ and thus, their sibling linkage cannot change.
3195+
3196+ */
3197+ tree = znode_get_tree(reference_point);
3198+ read_lock_tree(tree);
3199+ reference_point = find_begetting_brother(node, level)->node;
3200+ read_unlock_tree(tree);
3201+ assert("nikita-1186", reference_point != NULL);
3202+ }
3203+ if (node->parent && (result == 0)) {
3204+ result =
3205+ reiser4_get_parent(&tmp_lh, reference_point,
3206+ ZNODE_WRITE_LOCK);
3207+ if (result != 0) {
3208+ ; /* nothing */
3209+ } else if (znode_get_level(tmp_lh.node) == 0) {
3210+ assert("nikita-1347", znode_above_root(tmp_lh.node));
3211+ result = add_new_root(level, node, tmp_lh.node);
3212+ if (result == 0) {
3213+ reference_point = level->new_root;
3214+ move_lh(&lh, &node->lock_handle);
3215+ }
3216+ } else if ((level->new_root != NULL)
3217+ && (level->new_root !=
3218+ znode_parent_nolock(reference_point))) {
3219+ /* parent of node exists, but this level already
3220+ created a different new root, so */
3221+ warning("nikita-1109",
3222+ /* it should be "radicis", but tradition is
3223+ tradition. do banshees read latin? */
3224+ "hodie natus est radici frater");
3225+ result = -EIO;
3226+ } else {
3227+ move_lh(&lh, &tmp_lh);
3228+ reference_point = lh.node;
3229+ }
3230+ }
3231+ if (node->left && (result == 0)) {
3232+ assert("nikita-1183", node->parent);
3233+ assert("nikita-883", reference_point != NULL);
3234+ result =
3235+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
3236+ ZNODE_WRITE_LOCK,
3237+ GN_CAN_USE_UPPER_LEVELS);
3238+ if (result == 0) {
3239+ done_lh(&lh);
3240+ move_lh(&lh, &tmp_lh);
3241+ reference_point = lh.node;
3242+ }
3243+ }
3244+ if (!node->parent && !node->left && !node->left_before) {
3245+ result =
3246+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3247+ ZNODE_LOCK_HIPRI);
3248+ }
3249+ if (result == 0) {
3250+ move_lh(&node->lock_handle, &lh);
3251+ result = lock_carry_node_tail(node);
3252+ }
3253+ done_lh(&tmp_lh);
3254+ done_lh(&lh);
3255+ return result;
3256+}
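To make the "base plus flags" indirection concrete, here is a minimal sketch of the resolution that lock_carry_node() above interleaves with long-term locking; resolve_carry_target() and the helpers inside it are hypothetical names standing in for the combination of find_begetting_brother(), reiser4_get_parent() and reiser4_get_left_neighbor() actually used:

	/* Sketch only: how a carry node's target follows from its base znode
	 * and flags. Every helper name below is hypothetical. */
	static znode *resolve_carry_target(carry_node *node)
	{
		znode *ref = node->node; /* "base" znode */

		if (node->left_before)
			/* fresh orphan: walk left in the carry order */
			ref = begetting_brother(ref);
		if (node->parent)
			/* target lives at the parent level */
			ref = parent_znode(ref);
		if (node->left)
			/* ... or at the left neighbor of that parent */
			ref = left_neighbor(ref);
		return ref;
	}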
3257+
3258+/* release a lock on &carry_node.
3259+
3260+ Release, if necessary, the lock on @node. This operation is the pair of
3261+ lock_carry_node() and is idempotent: you can call it more than once on the
3262+ same node.
3263+
3264+*/
3265+static void
3266+unlock_carry_node(carry_level * level,
3267+ carry_node * node /* node to be released */ ,
3268+ int failure /* non-0 if node is unlocked due
3269+ * to some error */ )
3270+{
3271+ znode *real_node;
3272+
3273+ assert("nikita-884", node != NULL);
3274+
3275+ real_node = reiser4_carry_real(node);
3276+ /* pair to zload() in lock_carry_node_tail() */
3277+ zrelse(real_node);
3278+ if (node->unlock && (real_node != NULL)) {
3279+ assert("nikita-899", real_node == node->lock_handle.node);
3280+ longterm_unlock_znode(&node->lock_handle);
3281+ }
3282+ if (failure) {
3283+ if (node->deallocate && (real_node != NULL)) {
3284+ /* free node in bitmap
3285+
3286+ Prepare node for removal. Last zput() will finish
3287+ with it.
3288+ */
3289+ ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3290+ }
3291+ if (node->free) {
3292+ assert("nikita-2177",
3293+ list_empty_careful(&node->lock_handle.locks_link));
3294+ assert("nikita-2112",
3295+ list_empty_careful(&node->lock_handle.owners_link));
3296+ reiser4_pool_free(&level->pool->node_pool,
3297+ &node->header);
3298+ }
3299+ }
3300+}
3301+
3302+/* fatal_carry_error() - all-catching error handling function
3303+
3304+ It is possible that carry faces an unrecoverable error, like inability to
3305+ insert a pointer at the internal level. Our simple solution is just to panic
3306+ in this situation. More sophisticated things like an attempt to remount the
3307+ file-system as read-only can be implemented without much difficulty.
3308+
3309+ It is believed that:
3310+
3311+ 1. instead of panicking, all current transactions can be aborted, rolling
3312+ the system back to a consistent state.
3313+
3314+Umm, if you simply panic without doing anything more at all, then all current
3315+transactions are aborted and the system is rolled back to a consistent state,
3316+by virtue of the design of the transactional mechanism. Well, wait, let's be
3317+precise. If an internal node is corrupted on disk due to hardware failure,
3318+then there may be no consistent state that can be rolled back to, so instead
3319+ we should say that it will roll back the transactions, which barring other
3320+factors means rolling back to a consistent state.
3321+
3322+# Nikita: there is a subtle difference between panic and aborting
3323+# transactions: the machine doesn't reboot. Processes aren't killed. Processes
3324+# not using reiser4 (not that we care about such processes), or using other
3325+# reiser4 mounts (about them we do care) will simply continue to run. With
3326+# some luck, even applications using the aborted file system can survive: they
3327+# will get some error, like EBADF, from each file descriptor on the failed file
3328+# system, but applications that do care about fault tolerance will cope with
3329+# this (squid will).
3330+
3331+It would be a nice feature though to support rollback without rebooting
3332+followed by remount, but this can wait for later versions.
3333+
3334+ 2. once isolated transactions are implemented, it will be possible to
3335+ roll back the offending transaction.
3336+
3337+2. adds code complexity of uncertain value (it implies that a broken tree should be kept in operation), so we must think about
3338+it more before deciding if it should be done. -Hans
3339+
3340+*/
3341+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3342+ * where
3343+ * unrecoverable
3344+ * error
3345+ * occurred */ ,
3346+ int ecode /* error code */ )
3347+{
3348+ assert("nikita-1230", doing != NULL);
3349+ assert("nikita-1231", ecode < 0);
3350+
3351+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3352+}
3353+
3354+/* add new root to the tree
3355+
3356+ This function itself only manages changes in carry structures and delegates
3357+ all hard work (allocation of a znode for the new root, changes of parent and
3358+ sibling pointers) to reiser4_add_tree_root().
3359+
3360+ Locking: old tree root is locked by carry at this point. Fake znode is also
3361+ locked.
3362+
3363+*/
3364+static int add_new_root(carry_level * level /* carry level in context of which
3365+ * operation is performed */ ,
3366+ carry_node * node /* carry node for existing root */ ,
3367+ znode * fake /* "fake" znode already locked by
3368+ * us */ )
3369+{
3370+ int result;
3371+
3372+ assert("nikita-1104", level != NULL);
3373+ assert("nikita-1105", node != NULL);
3374+
3375+ assert("nikita-1403", znode_is_write_locked(node->node));
3376+ assert("nikita-1404", znode_is_write_locked(fake));
3377+
3378+ /* trying to create new root. */
3379+ /* @node is root and it's already locked by us. This
3380+ means that nobody else can be trying to add/remove
3381+ tree root right now.
3382+ */
3383+ if (level->new_root == NULL)
3384+ level->new_root = reiser4_add_tree_root(node->node, fake);
3385+ if (!IS_ERR(level->new_root)) {
3386+ assert("nikita-1210", znode_is_root(level->new_root));
3387+ node->deallocate = 1;
3388+ result =
3389+ longterm_lock_znode(&node->lock_handle, level->new_root,
3390+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3391+ if (result == 0)
3392+ zput(level->new_root);
3393+ } else {
3394+ result = PTR_ERR(level->new_root);
3395+ level->new_root = NULL;
3396+ }
3397+ return result;
3398+}
3399+
3400+/* allocate new znode and add the operation that inserts the
3401+ pointer to it into the parent node into the todo level
3402+
3403+ Allocate new znode, add it into carry queue and post into @todo queue
3404+ request to add pointer to new node into its parent.
3405+
3406+ This is a carry-related routine that calls reiser4_new_node() to allocate a
3407+ new node.
3408+*/
3409+carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3410+ * node */ ,
3411+ carry_node * ref /* carry node after which new
3412+ * carry node is to be inserted
3413+ * into queue. This affects
3414+ * locking. */ ,
3415+ carry_level * doing /* carry queue where new node is
3416+ * to be added */ ,
3417+ carry_level * todo /* carry queue where COP_INSERT
3418+ * operation to add pointer to
3419+ * new node will be added */ )
3420+{
3421+ carry_node *fresh;
3422+ znode *new_znode;
3423+ carry_op *add_pointer;
3424+ carry_plugin_info info;
3425+
3426+ assert("nikita-1048", brother != NULL);
3427+ assert("nikita-1049", todo != NULL);
3428+
3429+ /* There is a lot of possible variations here: to what parent
3430+ new node will be attached and where. For simplicity, always
3431+ do the following:
3432+
3433+ (1) new node and @brother will have the same parent.
3434+
3435+ (2) new node is added on the right of @brother
3436+
3437+ */
3438+
3439+ fresh = reiser4_add_carry_skip(doing,
3440+ ref ? POOLO_AFTER : POOLO_LAST, ref);
3441+ if (IS_ERR(fresh))
3442+ return fresh;
3443+
3444+ fresh->deallocate = 1;
3445+ fresh->free = 1;
3446+
3447+ new_znode = reiser4_new_node(brother, znode_get_level(brother));
3448+ if (IS_ERR(new_znode))
3449+ /* @fresh will be deallocated automatically by error
3450+ handling code in the caller. */
3451+ return (carry_node *) new_znode;
3452+
3453+ /* reiser4_new_node() returned a znode with x_count of 1. The caller has
3454+ to decrease it; make_space() does. */
3455+
3456+ ZF_SET(new_znode, JNODE_ORPHAN);
3457+ fresh->node = new_znode;
3458+
3459+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3460+ ref = carry_node_prev(ref);
3461+ assert("nikita-1606", !carry_node_end(doing, ref));
3462+ }
3463+
3464+ info.todo = todo;
3465+ info.doing = doing;
3466+ add_pointer = node_post_carry(&info, COP_INSERT,
3467+ reiser4_carry_real(ref), 1);
3468+ if (IS_ERR(add_pointer)) {
3469+ /* no need to deallocate @new_znode here: it will be
3470+ deallocated during carry error handling. */
3471+ return (carry_node *) add_pointer;
3472+ }
3473+
3474+ add_pointer->u.insert.type = COPT_CHILD;
3475+ add_pointer->u.insert.child = fresh;
3476+ add_pointer->u.insert.brother = brother;
3477+ /* initially the new node spans an empty key range */
3478+ write_lock_dk(znode_get_tree(brother));
3479+ znode_set_ld_key(new_znode,
3480+ znode_set_rd_key(new_znode,
3481+ znode_get_rd_key(brother)));
3482+ write_unlock_dk(znode_get_tree(brother));
3483+ return fresh;
3484+}
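As a worked example of the delimiting-key initialization above: if @brother's right delimiting key is K, the new node starts with both its left and right delimiting keys equal to K, i.e. it initially owns the empty key interval [K, K); subsequent shifts into the new node widen that range.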
3485+
3486+/* DEBUGGING FUNCTIONS.
3487+
3488+ Probably we should also leave them enabled even when
3489+ debugging is turned off, to print dumps on errors.
3490+*/
3491+#if REISER4_DEBUG
3492+static int carry_level_invariant(carry_level * level, carry_queue_state state)
3493+{
3494+ carry_node *node;
3495+ carry_node *tmp_node;
3496+
3497+ if (level == NULL)
3498+ return 0;
3499+
3500+ if (level->track_type != 0 &&
3501+ level->track_type != CARRY_TRACK_NODE &&
3502+ level->track_type != CARRY_TRACK_CHANGE)
3503+ return 0;
3504+
3505+ /* check that nodes are in ascending order */
3506+ for_all_nodes(level, node, tmp_node) {
3507+ znode *left;
3508+ znode *right;
3509+
3510+ reiser4_key lkey;
3511+ reiser4_key rkey;
3512+
3513+ if (node != carry_node_front(level)) {
3514+ if (state == CARRY_TODO) {
3515+ right = node->node;
3516+ left = carry_node_prev(node)->node;
3517+ } else {
3518+ right = reiser4_carry_real(node);
3519+ left = reiser4_carry_real(carry_node_prev(node));
3520+ }
3521+ if (right == NULL || left == NULL)
3522+ continue;
3523+ if (node_is_empty(right) || node_is_empty(left))
3524+ continue;
3525+ if (!keyle(leftmost_key_in_node(left, &lkey),
3526+ leftmost_key_in_node(right, &rkey))) {
3527+ warning("", "wrong key order");
3528+ return 0;
3529+ }
3530+ }
3531+ }
3532+ return 1;
3533+}
3534+#endif
3535+
3536+/* get symbolic name for boolean */
3537+static const char *tf(int boolean /* truth value */ )
3538+{
3539+ return boolean ? "t" : "f";
3540+}
3541+
3542+/* symbolic name for carry operation */
3543+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3544+{
3545+ switch (op) {
3546+ case COP_INSERT:
3547+ return "COP_INSERT";
3548+ case COP_DELETE:
3549+ return "COP_DELETE";
3550+ case COP_CUT:
3551+ return "COP_CUT";
3552+ case COP_PASTE:
3553+ return "COP_PASTE";
3554+ case COP_UPDATE:
3555+ return "COP_UPDATE";
3556+ case COP_EXTENT:
3557+ return "COP_EXTENT";
3558+ case COP_INSERT_FLOW:
3559+ return "COP_INSERT_FLOW";
3560+ default:{
3561+ /* not mt safe, but who cares? */
3562+ static char buf[32];
3563+
3564+ sprintf(buf, "unknown op: %x", op);
3565+ return buf;
3566+ }
3567+ }
3568+}
3569+
3570+/* dump information about carry node */
3571+static void print_carry(const char *prefix /* prefix to print */ ,
3572+ carry_node * node /* node to print */ )
3573+{
3574+ if (node == NULL) {
3575+ printk("%s: null\n", prefix);
3576+ return;
3577+ }
3578+ printk
3579+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3580+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3581+ tf(node->free), tf(node->deallocate));
3582+}
3583+
3584+/* dump information about carry operation */
3585+static void print_op(const char *prefix /* prefix to print */ ,
3586+ carry_op * op /* operation to print */ )
3587+{
3588+ if (op == NULL) {
3589+ printk("%s: null\n", prefix);
3590+ return;
3591+ }
3592+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3593+ print_carry("\tnode", op->node);
3594+ switch (op->op) {
3595+ case COP_INSERT:
3596+ case COP_PASTE:
3597+ print_coord("\tcoord",
3598+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3599+ reiser4_print_key("\tkey",
3600+ op->u.insert.d ? op->u.insert.d->key : NULL);
3601+ print_carry("\tchild", op->u.insert.child);
3602+ break;
3603+ case COP_DELETE:
3604+ print_carry("\tchild", op->u.delete.child);
3605+ break;
3606+ case COP_CUT:
3607+ if (op->u.cut_or_kill.is_cut) {
3608+ print_coord("\tfrom",
3609+ op->u.cut_or_kill.u.kill->params.from, 0);
3610+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3611+ 0);
3612+ } else {
3613+ print_coord("\tfrom",
3614+ op->u.cut_or_kill.u.cut->params.from, 0);
3615+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3616+ 0);
3617+ }
3618+ break;
3619+ case COP_UPDATE:
3620+ print_carry("\tleft", op->u.update.left);
3621+ break;
3622+ default:
3623+ /* do nothing */
3624+ break;
3625+ }
3626+}
3627+
3628+/* dump information about all nodes and operations in a @level */
3629+static void print_level(const char *prefix /* prefix to print */ ,
3630+ carry_level * level /* level to print */ )
3631+{
3632+ carry_node *node;
3633+ carry_node *tmp_node;
3634+ carry_op *op;
3635+ carry_op *tmp_op;
3636+
3637+ if (level == NULL) {
3638+ printk("%s: null\n", prefix);
3639+ return;
3640+ }
3641+ printk("%s: %p, restartable: %s\n",
3642+ prefix, level, tf(level->restartable));
3643+
3644+ for_all_nodes(level, node, tmp_node)
3645+ print_carry("\tcarry node", node);
3646+ for_all_ops(level, op, tmp_op)
3647+ print_op("\tcarry op", op);
3648+}
3649+
3650+/* Make Linus happy.
3651+ Local variables:
3652+ c-indentation-style: "K&R"
3653+ mode-name: "LC"
3654+ c-basic-offset: 8
3655+ tab-width: 8
3656+ fill-column: 120
3657+ scroll-step: 1
3658+ End:
3659+*/
3660diff -urN linux-2.6.22.orig/fs/reiser4/carry.h linux-2.6.22/fs/reiser4/carry.h
3661--- linux-2.6.22.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3662+++ linux-2.6.22/fs/reiser4/carry.h 2007-07-29 00:25:34.824683017 +0400
3663@@ -0,0 +1,442 @@
3664+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3665+
3666+/* Functions and data types to "carry" tree modification(s) upward.
3667+ See fs/reiser4/carry.c for details. */
3668+
3669+#if !defined( __FS_REISER4_CARRY_H__ )
3670+#define __FS_REISER4_CARRY_H__
3671+
3672+#include "forward.h"
3673+#include "debug.h"
3674+#include "pool.h"
3675+#include "znode.h"
3676+
3677+#include <linux/types.h>
3678+
3679+/* &carry_node - "location" of carry node.
3680+
3681+ "location" of node that is involved or going to be involved into
3682+ carry process. Node where operation will be carried to on the
3683+ parent level cannot be recorded explicitly. Operation will be carried
3684+ usually to the parent of some node (where changes are performed at
3685+ the current level) or, to the left neighbor of its parent. But while
3686+ modifications are performed at the current level, parent may
3687+ change. So, we have to allow some indirection (or, positively,
3688+ flexibility) in locating carry nodes.
3689+
3690+*/
3691+typedef struct carry_node {
3692+ /* pool linkage */
3693+ struct reiser4_pool_header header;
3694+
3695+ /* base node from which real_node is calculated. See
3696+ fs/reiser4/carry.c:lock_carry_node(). */
3697+ znode *node;
3698+
3699+ /* how to get ->real_node */
3700+ /* to get ->real_node obtain parent of ->node */
3701+ __u32 parent:1;
3702+ /* to get ->real_node obtain left neighbor of parent of
3703+ ->node */
3704+ __u32 left:1;
3705+ __u32 left_before:1;
3706+
3707+ /* locking */
3708+
3709+ /* this node was locked by carry process and should be
3710+ unlocked when carry leaves a level */
3711+ __u32 unlock:1;
3712+
3713+ /* disk block for this node was allocated by carry process and
3714+ should be deallocated when carry leaves a level */
3715+ __u32 deallocate:1;
3716+ /* this carry node was allocated by carry process and should be
3717+ freed when carry leaves a level */
3718+ __u32 free:1;
3719+
3720+ /* type of lock we want to take on this node */
3721+ lock_handle lock_handle;
3722+} carry_node;
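Read together, the flag bits above encode the possible ways the target is reached from the base znode; this is only a restatement of the comments above, not new semantics:

	/* parent == 0, left == 0: target is ->node itself
	 * parent == 1, left == 0: target is the parent of ->node
	 * parent == 1, left == 1: target is the left neighbor of that parent
	 * left_before: ->node is a fresh, not yet connected node; walk
	 * left in the carry order first (see lock_carry_node() in carry.c) */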
3723+
3724+/* &carry_opcode - elementary operations that can be carried upward
3725+
3726+ Operations that carry() can handle. This list is supposed to be
3727+ expanded.
3728+
3729+ Each carry operation (cop) is handled by appropriate function defined
3730+ in fs/reiser4/carry.c. For example COP_INSERT is handled by
3731+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3732+ call plugins of nodes affected by operation to modify nodes' content
3733+ and to gather operations to be performed on the next level.
3734+
3735+*/
3736+typedef enum {
3737+ /* insert new item into node. */
3738+ COP_INSERT,
3739+ /* delete pointer from parent node */
3740+ COP_DELETE,
3741+ /* remove part of or whole node. */
3742+ COP_CUT,
3743+ /* increase size of item. */
3744+ COP_PASTE,
3745+ /* insert extent (that is sequence of unformatted nodes). */
3746+ COP_EXTENT,
3747+ /* update delimiting key in least common ancestor of two
3748+ nodes. This is performed when items are moved between two
3749+ nodes.
3750+ */
3751+ COP_UPDATE,
3752+ /* insert flow */
3753+ COP_INSERT_FLOW,
3754+ COP_LAST_OP,
3755+} carry_opcode;
3756+
3757+#define CARRY_FLOW_NEW_NODES_LIMIT 20
3758+
3759+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3760+ item is determined. */
3761+typedef enum {
3762+ /* target item is one containing pointer to the ->child node */
3763+ COPT_CHILD,
3764+ /* target item is given explicitly by @coord */
3765+ COPT_ITEM_DATA,
3766+ /* target item is given by key */
3767+ COPT_KEY,
3768+ /* see insert_paste_common() for more comments on this. */
3769+ COPT_PASTE_RESTARTED,
3770+} cop_insert_pos_type;
3771+
3772+/* flags to cut and delete */
3773+typedef enum {
3774+ /* don't kill node even if it became completely empty as results of
3775+ * cut. This is needed for eottl handling. See carry_extent() for
3776+ * details. */
3777+ DELETE_RETAIN_EMPTY = (1 << 0)
3778+} cop_delete_flag;
3779+
3780+/*
3781+ * carry() implements "lock handle tracking" feature.
3782+ *
3783+ * Callers supply carry with node where to perform initial operation and lock
3784+ * handle on this node. Trying to optimize node utilization, carry may
3785+ * actually move the insertion point to a different node. Callers expect
3786+ * that the lock handle will be transferred to the new node also.
3787+ *
3788+ */
3789+typedef enum {
3790+ /* transfer lock handle along with insertion point */
3791+ CARRY_TRACK_CHANGE = 1,
3792+ /* acquire new lock handle to the node where insertion point is. This
3793+ * is used when carry() client doesn't initially possess lock handle
3794+ * on the insertion point node, for example, by extent insertion
3795+ * code. See carry_extent(). */
3796+ CARRY_TRACK_NODE = 2
3797+} carry_track_type;
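As a minimal sketch of how a caller could arm this tracking on a carry_level it has already prepared (the ->track_type and ->tracked fields are defined in carry_level below; the function name is hypothetical):

	/* Make carry retarget @lh if the insertion point ends up in a
	 * different node than the one it started in. */
	static inline void arm_lh_tracking(carry_level *doing, lock_handle *lh)
	{
		doing->track_type = CARRY_TRACK_CHANGE;
		doing->tracked = lh;
	}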
3798+
3799+/* data supplied to COP_{INSERT|PASTE} by callers */
3800+typedef struct carry_insert_data {
3801+ /* position where new item is to be inserted */
3802+ coord_t *coord;
3803+ /* new item description */
3804+ reiser4_item_data *data;
3805+ /* key of new item */
3806+ const reiser4_key *key;
3807+} carry_insert_data;
3808+
3809+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3810+struct cut_kill_params {
3811+ /* coord where cut starts (inclusive) */
3812+ coord_t *from;
3813+ /* coord where cut stops (inclusive, this item/unit will also be
3814+ * cut) */
3815+ coord_t *to;
3816+ /* starting key. This is necessary when item and unit pos don't
3817+ * uniquely identify what portion or tree to remove. For example, this
3818+ * indicates what portion of extent unit will be affected. */
3819+ const reiser4_key *from_key;
3820+ /* exclusive stop key */
3821+ const reiser4_key *to_key;
3822+ /* if this is not NULL, smallest actually removed key is stored
3823+ * here. */
3824+ reiser4_key *smallest_removed;
3825+ /* kill_node_content() is called for file truncate */
3826+ int truncate;
3827+};
3828+
3829+struct carry_cut_data {
3830+ struct cut_kill_params params;
3831+};
3832+
3833+struct carry_kill_data {
3834+ struct cut_kill_params params;
3835+ /* parameter to be passed to the ->kill_hook() method of item
3836+ * plugin */
3837+ /*void *iplug_params; *//* FIXME: unused currently */
3838+ /* if not NULL---inode whose items are being removed. This is needed
3839+ * for ->kill_hook() of extent item to update VM structures when
3840+ * removing pages. */
3841+ struct inode *inode;
3842+ /* sibling list maintenance is complicated by existence of eottl. When
3843+ * eottl whose left and right neighbors are formatted leaves is
3844+ * removed, one has to connect said leaves in the sibling list. This
3845+ * cannot be done when extent removal is just started as locking rules
3846+ * require sibling list update to happen atomically with removal of
3847+ * extent item. Therefore: 1. pointers to left and right neighbors
3848+ * have to be passed down to the ->kill_hook() of extent item, and
3849+ * 2. said neighbors have to be locked. */
3850+ lock_handle *left;
3851+ lock_handle *right;
3852+ /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3853+ unsigned flags;
3854+ char *buf;
3855+};
3856+
3857+/* &carry_tree_op - operation to "carry" upward.
3858+
3859+ Description of an operation we want to "carry" to the upper level of
3860+ a tree: e.g, when we insert something and there is not enough space
3861+ we allocate a new node and "carry" the operation of inserting a
3862+ pointer to the new node to the upper level, on removal of empty node,
3863+ we carry up operation of removing appropriate entry from parent.
3864+
3865+ There are two types of carry ops: when adding or deleting a node, the
3866+ node at the parent level where the appropriate modification has to be
3867+ performed is known in advance. When shifting items between nodes
3868+ (split, merge), the delimiting key should be changed in the least common
3869+ parent of the nodes involved, which is not known in advance.
3870+
3871+ For the operations of the first type we store in &carry_op a pointer to
3872+ the &carry_node at the parent level. For the operations of the second
3873+ type we store &carry_node for the parents of the left and right nodes
3874+ modified and keep track of them upward until they coincide.
3875+
3876+*/
3877+typedef struct carry_op {
3878+ /* pool linkage */
3879+ struct reiser4_pool_header header;
3880+ carry_opcode op;
3881+ /* node on which operation is to be performed:
3882+
3883+ for insert, paste: node where new item is to be inserted
3884+
3885+ for delete: node where pointer is to be deleted
3886+
3887+ for cut: node to cut from
3888+
3889+ for update: node where delimiting key is to be modified
3890+
3891+ for modify: parent of modified node
3892+
3893+ */
3894+ carry_node *node;
3895+ union {
3896+ struct {
3897+ /* (sub-)type of insertion/paste. Taken from
3898+ cop_insert_pos_type. */
3899+ __u8 type;
3900+ /* various operation flags. Taken from
3901+ cop_insert_flag. */
3902+ __u8 flags;
3903+ carry_insert_data *d;
3904+ carry_node *child;
3905+ znode *brother;
3906+ } insert, paste, extent;
3907+
3908+ struct {
3909+ int is_cut;
3910+ union {
3911+ carry_kill_data *kill;
3912+ carry_cut_data *cut;
3913+ } u;
3914+ } cut_or_kill;
3915+
3916+ struct {
3917+ carry_node *left;
3918+ } update;
3919+ struct {
3920+ /* changed child */
3921+ carry_node *child;
3922+ /* bitmask of changes. See &cop_modify_flag */
3923+ __u32 flag;
3924+ } modify;
3925+ struct {
3926+ /* flags to deletion operation. Are taken from
3927+ cop_delete_flag */
3928+ __u32 flags;
3929+ /* child to delete from parent. If this is
3930+ NULL, delete op->node. */
3931+ carry_node *child;
3932+ } delete;
3933+ struct {
3934+ /* various operation flags. Taken from
3935+ cop_insert_flag. */
3936+ __u32 flags;
3937+ flow_t *flow;
3938+ coord_t *insert_point;
3939+ reiser4_item_data *data;
3940+ /* flow insertion is limited by number of new blocks
3941+ added in that operation which do not get any data
3942+ but part of flow. This limit is set by macro
3943+ CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
3944+ of nodes added already during one carry_flow */
3945+ int new_nodes;
3946+ } insert_flow;
3947+ } u;
3948+} carry_op;
3949+
3950+/* &carry_op_pool - preallocated pool of carry operations, and nodes */
3951+typedef struct carry_pool {
3952+ carry_op op[CARRIES_POOL_SIZE];
3953+ struct reiser4_pool op_pool;
3954+ carry_node node[NODES_LOCKED_POOL_SIZE];
3955+ struct reiser4_pool node_pool;
3956+} carry_pool;
3957+
3958+/* &carry_tree_level - carry process on given level
3959+
3960+ Description of balancing process on the given level.
3961+
3962+ No need for locking here, as carry_tree_level is essentially per
3963+ thread thing (for now).
3964+
3965+*/
3966+struct carry_level {
3967+ /* this level may be restarted */
3968+ __u32 restartable:1;
3969+ /* list of carry nodes on this level, ordered by key order */
3970+ struct list_head nodes;
3971+ struct list_head ops;
3972+ /* pool where new objects are allocated from */
3973+ carry_pool *pool;
3974+ int ops_num;
3975+ int nodes_num;
3976+ /* new root created on this level, if any */
3977+ znode *new_root;
3978+ /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
3979+ when they want ->tracked to automagically wander to the node where
3980+ the insertion point moved after insert or paste.
3981+ */
3982+ carry_track_type track_type;
3983+ /* lock handle supplied by user that we are tracking. See
3984+ above. */
3985+ lock_handle *tracked;
3986+};
3987+
3988+/* information carry passes to plugin methods that may add new operations to
3989+ the @todo queue */
3990+struct carry_plugin_info {
3991+ carry_level *doing;
3992+ carry_level *todo;
3993+};
3994+
3995+int reiser4_carry(carry_level * doing, carry_level * done);
3996+
3997+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
3998+ carry_node * reference);
3999+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4000+ carry_node * reference);
4001+
4002+extern carry_node *insert_carry_node(carry_level * doing,
4003+ carry_level * todo, const znode * node);
4004+
4005+extern carry_pool *init_carry_pool(int);
4006+extern void done_carry_pool(carry_pool * pool);
4007+
4008+extern void init_carry_level(carry_level * level, carry_pool * pool);
4009+
4010+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4011+ znode * node, int apply_to_parent);
4012+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4013+ znode * node, int apply_to_parent_p);
4014+
4015+carry_node *add_new_znode(znode * brother, carry_node * reference,
4016+ carry_level * doing, carry_level * todo);
4017+
4018+carry_node *find_carry_node(carry_level * level, const znode * node);
4019+
4020+extern znode *reiser4_carry_real(const carry_node * node);
4021+
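Taken together, the declarations above suggest the usual invocation pattern. The sketch below is an assumption-laden outline, not a copy of any in-tree caller: the size passed to init_carry_pool() and the carving of the carry_level from the allocation follow the layout implied by carry_pool, a real caller fills op->u.* (for COP_INSERT, op->u.insert.d and friends) before running carry, and passing NULL as the "done" level is assumed to be accepted:

	/* Hedged sketch: post a single operation and run carry on it. */
	static int post_and_carry(znode *node)
	{
		carry_pool *pool;
		carry_level *doing;
		carry_op *op;
		int result;

		pool = init_carry_pool(sizeof(*pool) + sizeof(*doing));
		if (IS_ERR(pool))
			return PTR_ERR(pool);
		doing = (carry_level *) (pool + 1);
		init_carry_level(doing, pool);

		/* post COP_INSERT to be applied to @node itself (last arg 0) */
		op = reiser4_post_carry(doing, COP_INSERT, node, 0);
		if (IS_ERR(op))
			result = PTR_ERR(op);
		else
			result = reiser4_carry(doing, NULL);
		done_carry_pool(pool);
		return result;
	}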
4022+/* helper macros to iterate over carry queues */
4023+
4024+#define carry_node_next( node ) \
4025+ list_entry((node)->header.level_linkage.next, carry_node, \
4026+ header.level_linkage)
4027+
4028+#define carry_node_prev( node ) \
4029+ list_entry((node)->header.level_linkage.prev, carry_node, \
4030+ header.level_linkage)
4031+
4032+#define carry_node_front( level ) \
4033+ list_entry((level)->nodes.next, carry_node, header.level_linkage)
4034+
4035+#define carry_node_back( level ) \
4036+ list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4037+
4038+#define carry_node_end( level, node ) \
4039+ (&(level)->nodes == &(node)->header.level_linkage)
4040+
4041+/* macro to iterate over all operations in a @level */
4042+#define for_all_ops( level /* carry level (of type carry_level *) */, \
4043+ op /* pointer to carry operation, modified by loop (of \
4044+ * type carry_op *) */, \
4045+ tmp /* pointer to carry operation (of type carry_op *), \
4046+ * used to make iterator stable in the face of \
4047+ * deletions from the level */ ) \
4048+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4049+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4050+ &op->header.level_linkage != &level->ops; \
4051+ op = tmp, \
4052+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4053+
4054+#if 0
4055+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4056+ tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4057+ ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4058+ op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4059+#endif
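For illustration, a trivial (hypothetical) consumer of the iterator above; the @tmp cursor is what keeps the loop valid even if the current operation is freed mid-iteration:

	static int count_ops(carry_level *level)
	{
		carry_op *op;
		carry_op *tmp;
		int nr = 0;

		for_all_ops(level, op, tmp)
			++nr;
		return nr;
	}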
4060+
4061+/* macro to iterate over all nodes in a @level */
4062+#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4063+ node /* pointer to carry node, modified by loop (of \
4064+ * type carry_node *) */, \
4065+ tmp /* pointer to carry node (of type carry_node *), \
4066+ * used to make iterator stable in the face of \
4067+ * deletions from the level */ ) \
4068+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4069+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4070+ &node->header.level_linkage != &level->nodes; \
4071+ node = tmp, \
4072+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4073+
4074+#if 0
4075+for( node = carry_node_front( level ), \
4076+ tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4077+ node = tmp, tmp = carry_node_next( node ) )
4078+#endif
4079+
4080+/* macro to iterate over all nodes in a @level in reverse order
4081+
4082+ This is used, because nodes are unlocked in reversed order of locking */
4083+#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4084+ node /* pointer to carry node, modified by loop \
4085+ * (of type carry_node *) */, \
4086+ tmp /* pointer to carry node (of type carry_node \
4087+ * *), used to make iterator stable in the \
4088+ * face of deletions from the level */ ) \
4089+for( node = carry_node_back( level ), \
4090+ tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4091+ node = tmp, tmp = carry_node_prev( node ) )
4092+
4093+/* __FS_REISER4_CARRY_H__ */
4094+#endif
4095+
4096+/* Make Linus happy.
4097+ Local variables:
4098+ c-indentation-style: "K&R"
4099+ mode-name: "LC"
4100+ c-basic-offset: 8
4101+ tab-width: 8
4102+ fill-column: 120
4103+ scroll-step: 1
4104+ End:
4105+*/
4106diff -urN linux-2.6.22.orig/fs/reiser4/carry_ops.c linux-2.6.22/fs/reiser4/carry_ops.c
4107--- linux-2.6.22.orig/fs/reiser4/carry_ops.c 1970-01-01 03:00:00.000000000 +0300
4108+++ linux-2.6.22/fs/reiser4/carry_ops.c 2007-07-29 00:25:34.828684053 +0400
4109@@ -0,0 +1,2131 @@
4110+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4111+
4112+/* implementation of carry operations */
4113+
4114+#include "forward.h"
4115+#include "debug.h"
4116+#include "key.h"
4117+#include "coord.h"
4118+#include "plugin/item/item.h"
4119+#include "plugin/node/node.h"
4120+#include "jnode.h"
4121+#include "znode.h"
4122+#include "block_alloc.h"
4123+#include "tree_walk.h"
4124+#include "pool.h"
4125+#include "tree_mod.h"
4126+#include "carry.h"
4127+#include "carry_ops.h"
4128+#include "tree.h"
4129+#include "super.h"
4130+#include "reiser4.h"
4131+
4132+#include <linux/types.h>
4133+#include <linux/err.h>
4134+
4135+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4136+ carry_level * doing, carry_level * todo,
4137+ unsigned int including_insert_coord_p);
4138+
4139+extern int lock_carry_node(carry_level * level, carry_node * node);
4140+extern int lock_carry_node_tail(carry_node * node);
4141+
4142+/* find left neighbor of a carry node
4143+
4144+ Look for left neighbor of @node and add it to the @doing queue. See
4145+ comments in the body.
4146+
4147+*/
4148+static carry_node *find_left_neighbor(carry_op * op /* operation to find
4149+ * left neighbor for */ ,
4150+ carry_level * doing /* level to scan */ )
4151+{
4152+ int result;
4153+ carry_node *node;
4154+ carry_node *left;
4155+ int flags;
4156+ reiser4_tree *tree;
4157+
4158+ node = op->node;
4159+
4160+ tree = current_tree;
4161+ read_lock_tree(tree);
4162+ /* first, check whether left neighbor is already in a @doing queue */
4163+ if (reiser4_carry_real(node)->left != NULL) {
4164+ /* NOTE: there is locking subtlety here. Look into
4165+ * find_right_neighbor() for more info */
4166+ if (find_carry_node(doing,
4167+ reiser4_carry_real(node)->left) != NULL) {
4168+ read_unlock_tree(tree);
4169+ left = node;
4170+ do {
4171+ left = list_entry(left->header.level_linkage.prev,
4172+ carry_node, header.level_linkage);
4173+ assert("nikita-3408", !carry_node_end(doing,
4174+ left));
4175+ } while (reiser4_carry_real(left) ==
4176+ reiser4_carry_real(node));
4177+ return left;
4178+ }
4179+ }
4180+ read_unlock_tree(tree);
4181+
4182+ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4183+ if (IS_ERR(left))
4184+ return left;
4185+
4186+ left->node = node->node;
4187+ left->free = 1;
4188+
4189+ flags = GN_TRY_LOCK;
4190+ if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4191+ flags |= GN_NO_ALLOC;
4192+
4193+ /* then, feeling lucky, peek left neighbor in the cache. */
4194+ result = reiser4_get_left_neighbor(&left->lock_handle,
4195+ reiser4_carry_real(node),
4196+ ZNODE_WRITE_LOCK, flags);
4197+ if (result == 0) {
4198+ /* ok, node found and locked. */
4199+ result = lock_carry_node_tail(left);
4200+ if (result != 0)
4201+ left = ERR_PTR(result);
4202+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4203+ /* node is leftmost node in a tree, or neighbor wasn't in
4204+ cache, or there is an extent on the left. */
4205+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4206+ left = NULL;
4207+ } else if (doing->restartable) {
4208+ /* if left neighbor is locked, and level is restartable, add
4209+ new node to @doing and restart. */
4210+ assert("nikita-913", node->parent != 0);
4211+ assert("nikita-914", node->node != NULL);
4212+ left->left = 1;
4213+ left->free = 0;
4214+ left = ERR_PTR(-E_REPEAT);
4215+ } else {
4216+ /* left neighbor is locked, level cannot be restarted. Just
4217+ ignore left neighbor. */
4218+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
4219+ left = NULL;
4220+ }
4221+ return left;
4222+}
4223+
4224+/* find right neighbor of a carry node
4225+
4226+ Look for right neighbor of @node and add it to the @doing queue. See
4227+ comments in the body.
4228+
4229+*/
4230+static carry_node *find_right_neighbor(carry_op * op /* operation to find
4231+ * right neighbor for */ ,
4232+ carry_level * doing /* level to scan */ )
4233+{
4234+ int result;
4235+ carry_node *node;
4236+ carry_node *right;
4237+ lock_handle lh;
4238+ int flags;
4239+ reiser4_tree *tree;
4240+
4241+ init_lh(&lh);
4242+
4243+ node = op->node;
4244+
4245+ tree = current_tree;
4246+ read_lock_tree(tree);
4247+ /* first, check whether right neighbor is already in a @doing queue */
4248+ if (reiser4_carry_real(node)->right != NULL) {
4249+ /*
4250+ * Tree lock is taken here anyway, because, even if _outcome_
4262+ * of (find_carry_node() != NULL) doesn't depend on
4252+ * concurrent updates to ->right, find_carry_node() cannot
4253+ * work with second argument NULL. Hence, following comment is
4254+ * of historic importance only.
4255+ *
4256+ * Subtle:
4257+ *
4258+ * Q: why don't we need tree lock here, looking for the right
4259+ * neighbor?
4260+ *
4261+ * A: even if value of node->real_node->right were changed
4262+ * during find_carry_node() execution, outcome of execution
4263+ * wouldn't change, because (in short) other thread cannot add
4264+ * elements to the @doing, and if node->real_node->right
4265+ * already was in @doing, value of node->real_node->right
4266+ * couldn't change, because node cannot be inserted between
4267+ * locked neighbors.
4268+ */
4269+ if (find_carry_node(doing,
4270+ reiser4_carry_real(node)->right) != NULL) {
4271+ read_unlock_tree(tree);
4272+ /*
4273+ * What we are doing here (this is also applicable to
4274+ * the find_left_neighbor()).
4275+ *
4276+ * tree_walk.c code requires that insertion of a
4277+ * pointer to a child, modification of parent pointer
4278+ * in the child, and insertion of the child into
4279+ * sibling list are atomic (see
4280+ * plugin/item/internal.c:create_hook_internal()).
4281+ *
4282+ * carry allocates new node long before pointer to it
4283+ * is inserted into parent and, actually, long before
4284+ * parent is even known. Such allocated-but-orphaned
4285+ * nodes are only trackable through carry level lists.
4286+ *
4287+ * Situation that is handled here is following: @node
4288+ * has valid ->right pointer, but there is
4289+ * allocated-but-orphaned node in the carry queue that
4290+ * is logically between @node and @node->right. Here
4291+ * we are searching for it. Critical point is that
4292+ * this is only possible if @node->right is also in
4293+ * the carry queue (this is checked above), because
4294+ * this is the only way new orphaned node could be
4295+ * inserted between them (before inserting new node,
4296+ * make_space() first tries to shift to the right, so,
4297+ * right neighbor will be locked and queued).
4298+ *
4299+ */
4300+ right = node;
4301+ do {
4302+ right = list_entry(right->header.level_linkage.next,
4303+ carry_node, header.level_linkage);
4304+ assert("nikita-3408", !carry_node_end(doing,
4305+ right));
4306+ } while (reiser4_carry_real(right) ==
4307+ reiser4_carry_real(node));
4308+ return right;
4309+ }
4310+ }
4311+ read_unlock_tree(tree);
4312+
4313+ flags = GN_CAN_USE_UPPER_LEVELS;
4314+ if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4315+ flags = GN_NO_ALLOC;
4316+
4317+ /* then, try to lock right neighbor */
4318+ init_lh(&lh);
4319+ result = reiser4_get_right_neighbor(&lh,
4320+ reiser4_carry_real(node),
4321+ ZNODE_WRITE_LOCK, flags);
4322+ if (result == 0) {
4323+ /* ok, node found and locked. */
4324+ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4325+ if (!IS_ERR(right)) {
4326+ right->node = lh.node;
4327+ move_lh(&right->lock_handle, &lh);
4328+ right->free = 1;
4329+ result = lock_carry_node_tail(right);
4330+ if (result != 0)
4331+ right = ERR_PTR(result);
4332+ }
4333+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4334+ /* node is rightmost node in a tree, or neighbor wasn't in
4335+ cache, or there is an extent on the right. */
4336+ right = NULL;
4337+ } else
4338+ right = ERR_PTR(result);
4339+ done_lh(&lh);
4340+ return right;
4341+}
4342+
4343+/* how much free space in a @node is needed for @op
4344+
4345+ How much space in @node is required for completion of @op, where @op is
4346+ insert or paste operation.
4347+*/
4348+static unsigned int space_needed_for_op(znode * node /* znode data are
4349+ * inserted or
4350+ * pasted in */ ,
4351+ carry_op * op /* carry
4352+ operation */ )
4353+{
4354+ assert("nikita-919", op != NULL);
4355+
4356+ switch (op->op) {
4357+ default:
4358+ impossible("nikita-1701", "Wrong opcode");
4359+ case COP_INSERT:
4360+ return space_needed(node, NULL, op->u.insert.d->data, 1);
4361+ case COP_PASTE:
4362+ return space_needed(node, op->u.insert.d->coord,
4363+ op->u.insert.d->data, 0);
4364+ }
4365+}
4366+
4367+/* how much space in @node is required to insert or paste @data at
4368+ @coord. */
4369+unsigned int space_needed(const znode * node /* node data are inserted or
4370+ * pasted in */ ,
4371+ const coord_t * coord /* coord where data are
4372+ * inserted or pasted
4373+ * at */ ,
4374+ const reiser4_item_data * data /* data to insert or
4375+ * paste */ ,
4376+ int insertion /* non-0 is inserting, 0---paste */ )
4377+{
4378+ int result;
4379+ item_plugin *iplug;
4380+
4381+ assert("nikita-917", node != NULL);
4382+ assert("nikita-918", node_plugin_by_node(node) != NULL);
4383+ assert("vs-230", !insertion || (coord == NULL));
4384+
4385+ result = 0;
4386+ iplug = data->iplug;
4387+ if (iplug->b.estimate != NULL) {
4388+ /* ask item plugin how much space is needed to insert this
4389+ item */
4390+ result += iplug->b.estimate(insertion ? NULL : coord, data);
4391+ } else {
4392+ /* reasonable default */
4393+ result += data->length;
4394+ }
4395+ if (insertion) {
4396+ node_plugin *nplug;
4397+
4398+ nplug = node->nplug;
4399+ /* and add node overhead */
4400+ if (nplug->item_overhead != NULL) {
4401+ result += nplug->item_overhead(node, NULL);
4402+ }
4403+ }
4404+ return result;
4405+}
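For example, if an item plugin's ->b.estimate() returned 48 bytes for some item and the node plugin's ->item_overhead() returned 8 (both figures purely illustrative), an insertion would require 48 + 8 = 56 bytes of free space, while pasting the same data into an existing item would require only 48, since no new item header is created.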
4406+
4407+/* find &coord in parent where pointer to new child is to be stored. */
4408+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4409+ * insert pointer to new
4410+ * child */ )
4411+{
4412+ int result;
4413+ znode *node;
4414+ znode *child;
4415+
4416+ assert("nikita-941", op != NULL);
4417+ assert("nikita-942", op->op == COP_INSERT);
4418+
4419+ node = reiser4_carry_real(op->node);
4420+ assert("nikita-943", node != NULL);
4421+ assert("nikita-944", node_plugin_by_node(node) != NULL);
4422+
4423+ child = reiser4_carry_real(op->u.insert.child);
4424+ result =
4425+ find_new_child_ptr(node, child, op->u.insert.brother,
4426+ op->u.insert.d->coord);
4427+
4428+ build_child_ptr_data(child, op->u.insert.d->data);
4429+ return result;
4430+}
4431+
4432+/* additional amount of free space in @node required to complete @op */
4433+static int free_space_shortage(znode * node /* node to check */ ,
4434+ carry_op * op /* operation being performed */ )
4435+{
4436+ assert("nikita-1061", node != NULL);
4437+ assert("nikita-1062", op != NULL);
4438+
4439+ switch (op->op) {
4440+ default:
4441+ impossible("nikita-1702", "Wrong opcode");
4442+ case COP_INSERT:
4443+ case COP_PASTE:
4444+ return space_needed_for_op(node, op) - znode_free_space(node);
4445+ case COP_EXTENT:
4446+ /* when inserting extent shift data around until insertion
4447+ point is utmost in the node. */
4448+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4449+ return +1;
4450+ else
4451+ return -1;
4452+ }
4453+}
4454+
4455+/* helper function: update node pointer in operation after insertion
4456+ point was probably shifted into @target. */
4457+static znode *sync_op(carry_op * op, carry_node * target)
4458+{
4459+ znode *insertion_node;
4460+
4461+ /* reget node from coord: shift might move insertion coord to
4462+ the neighbor */
4463+ insertion_node = op->u.insert.d->coord->node;
4464+ /* if insertion point was actually moved into new node,
4465+ update carry node pointer in operation. */
4466+ if (insertion_node != reiser4_carry_real(op->node)) {
4467+ op->node = target;
4468+ assert("nikita-2540",
4469+ reiser4_carry_real(target) == insertion_node);
4470+ }
4471+ assert("nikita-2541",
4472+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4473+ return insertion_node;
4474+}
4475+
4476+/*
4477+ * complete make_space() call: update tracked lock handle if necessary. See
4478+ * comments for fs/reiser4/carry.h:carry_track_type
4479+ */
4480+static int
4481+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4482+{
4483+ int result;
4484+ carry_track_type tracking;
4485+ znode *node;
4486+
4487+ tracking = doing->track_type;
4488+ node = op->u.insert.d->coord->node;
4489+
4490+ if (tracking == CARRY_TRACK_NODE ||
4491+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4492+ /* inserting or pasting into node different from
4493+ original. Update lock handle supplied by caller. */
4494+ assert("nikita-1417", doing->tracked != NULL);
4495+ done_lh(doing->tracked);
4496+ init_lh(doing->tracked);
4497+ result = longterm_lock_znode(doing->tracked, node,
4498+ ZNODE_WRITE_LOCK,
4499+ ZNODE_LOCK_HIPRI);
4500+ } else
4501+ result = 0;
4502+ return result;
4503+}
4504+
4505+/* This is the insertion policy function. It shifts data to the left and right
4506+ neighbors of the insertion coord and allocates new nodes until there is
4507+ enough free space to complete @op.
4508+
4509+ See comments in the body.
4510+
4511+ Assumes that the node format favors insertions at the right end of the node,
4512+ as node40 does.
4513+
4514+ See carry_flow() for details on flow insertion
4515+*/
4516+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4517+ carry_level * doing /* current carry queue */ ,
4518+ carry_level * todo /* carry queue on the parent level */ )
4519+{
4520+ znode *node;
4521+ int result;
4522+ int not_enough_space;
4523+ int blk_alloc;
4524+ znode *orig_node;
4525+ __u32 flags;
4526+
4527+ coord_t *coord;
4528+
4529+ assert("nikita-890", op != NULL);
4530+ assert("nikita-891", todo != NULL);
4531+ assert("nikita-892",
4532+ op->op == COP_INSERT ||
4533+ op->op == COP_PASTE || op->op == COP_EXTENT);
4534+ assert("nikita-1607",
4535+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4536+
4537+ flags = op->u.insert.flags;
4538+
4539+ /* NOTE check that new node can only be allocated after checking left
4540+ * and right neighbors. This is necessary for proper work of
4541+ * find_{left,right}_neighbor(). */
4542+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4543+ flags & COPI_DONT_SHIFT_LEFT));
4544+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4545+ flags & COPI_DONT_SHIFT_RIGHT));
4546+
4547+ coord = op->u.insert.d->coord;
4548+ orig_node = node = coord->node;
4549+
4550+ assert("nikita-908", node != NULL);
4551+ assert("nikita-909", node_plugin_by_node(node) != NULL);
4552+
4553+ result = 0;
4554+ /* If there is not enough space in a node, try to shift something to
4555+ the left neighbor. This is a bit tricky, as locking to the left is
4556+ low priority. This is handled by restart logic in carry().
4557+ */
4558+ not_enough_space = free_space_shortage(node, op);
4559+ if (not_enough_space <= 0)
4560+ /* it is possible that carry was called when there actually
4561+ was enough space in the node. For example, when inserting
4562+ leftmost item so that delimiting keys have to be updated.
4563+ */
4564+ return make_space_tail(op, doing, orig_node);
4565+ if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4566+ carry_node *left;
4567+ /* make note in statistics of an attempt to move
4568+ something into the left neighbor */
4569+ left = find_left_neighbor(op, doing);
4570+ if (unlikely(IS_ERR(left))) {
4571+ if (PTR_ERR(left) == -E_REPEAT)
4572+ return -E_REPEAT;
4573+ else {
4574+ /* some error other than restart request
4575+ occurred. This shouldn't happen. Issue a
4576+ warning and continue as if left neighbor
4577+ weren't existing.
4578+ */
4579+ warning("nikita-924",
4580+ "Error accessing left neighbor: %li",
4581+ PTR_ERR(left));
4582+ }
4583+ } else if (left != NULL) {
4584+
4585+ /* shift everything possible on the left of and
4586+ including insertion coord into the left neighbor */
4587+ result = carry_shift_data(LEFT_SIDE, coord,
4588+ reiser4_carry_real(left),
4589+ doing, todo,
4590+ flags & COPI_GO_LEFT);
4591+
4592+ /* reget node from coord: shift_left() might move
4593+ insertion coord to the left neighbor */
4594+ node = sync_op(op, left);
4595+
4596+ not_enough_space = free_space_shortage(node, op);
4597+ /* There is not enough free space in @node, but
4598+ maybe there is enough free space in
4599+ @left. Various balancing decisions are valid here.
4600+ The same holds for shifting to the right.
4601+ */
4602+ }
4603+ }
4604+ /* If there still is not enough space, shift to the right */
4605+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4606+ carry_node *right;
4607+
4608+ right = find_right_neighbor(op, doing);
4609+ if (IS_ERR(right)) {
4610+ warning("nikita-1065",
4611+ "Error accessing right neighbor: %li",
4612+ PTR_ERR(right));
4613+ } else if (right != NULL) {
4614+ /* node containing insertion point, and its right
4615+ neighbor node are write locked by now.
4616+
4617+ shift everything possible on the right of but
4618+ excluding insertion coord into the right neighbor
4619+ */
4620+ result = carry_shift_data(RIGHT_SIDE, coord,
4621+ reiser4_carry_real(right),
4622+ doing, todo,
4623+ flags & COPI_GO_RIGHT);
4624+ /* reget node from coord: shift_right() might move
4625+ insertion coord to the right neighbor */
4626+ node = sync_op(op, right);
4627+ not_enough_space = free_space_shortage(node, op);
4628+ }
4629+ }
4630+ /* If there is still not enough space, allocate new node(s).
4631+
4632+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4633+ the carry operation flags (currently this is needed during flush
4634+ only).
4635+ */
4636+ for (blk_alloc = 0;
4637+ not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4638+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4639+ carry_node *fresh; /* new node we are allocating */
4640+ coord_t coord_shadow; /* remembered insertion point before
4641+ * shifting data into new node */
4642+ carry_node *node_shadow; /* remembered insertion node before
4643+ * shifting */
4644+ unsigned int gointo; /* whether insertion point should move
4645+ * into newly allocated node */
4646+
4647+ /* allocate new node on the right of @node. Znode and disk
4648+ fake block number for new node are allocated.
4649+
4650+ add_new_znode() posts carry operation COP_INSERT with
4651+ COPT_CHILD option to the parent level to add
4652+ pointer to newly created node to its parent.
4653+
4654+ Subtle point: if several new nodes are required to complete
4655+ insertion operation at this level, they will be inserted
4656+ into their parents in the order of creation, which means
4657+ that @node will be valid "cookie" at the time of insertion.
4658+
4659+ */
4660+ fresh = add_new_znode(node, op->node, doing, todo);
4661+ if (IS_ERR(fresh))
4662+ return PTR_ERR(fresh);
4663+
4664+ /* Try to shift into new node. */
4665+ result = lock_carry_node(doing, fresh);
4666+ zput(reiser4_carry_real(fresh));
4667+ if (result != 0) {
4668+ warning("nikita-947",
4669+ "Cannot lock new node: %i", result);
4670+ return result;
4671+ }
4672+
4673+ /* both nodes are write locked by now.
4674+
4675+ shift everything possible on the right of and
4676+ including insertion coord into the right neighbor.
4677+ */
4678+ coord_dup(&coord_shadow, op->u.insert.d->coord);
4679+ node_shadow = op->node;
4680+ /* move insertion point into newly created node if:
4681+
4682+ . insertion point is rightmost in the source node, or
4683+ . this is not the first node we are allocating in a row.
4684+ */
4685+ gointo =
4686+ (blk_alloc > 0) ||
4687+ coord_is_after_rightmost(op->u.insert.d->coord);
4688+
4689+ if (gointo &&
4690+ op->op == COP_PASTE &&
4691+ coord_is_existing_item(op->u.insert.d->coord) &&
4692+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4693+ /* paste into solid (atomic) item, which can contain
4694+ only one unit, so we need to shift it right, where
4695+ the insertion point is supposed to be */
4696+
4697+ assert("edward-1444", op->u.insert.d->data->iplug ==
4698+ item_plugin_by_id(STATIC_STAT_DATA_ID));
4699+ assert("edward-1445",
4700+ op->u.insert.d->data->length >
4701+ node_plugin_by_node(coord->node)->free_space
4702+ (coord->node));
4703+
4704+ op->u.insert.d->coord->between = BEFORE_UNIT;
4705+ }
4706+
4707+ result = carry_shift_data(RIGHT_SIDE, coord,
4708+ reiser4_carry_real(fresh),
4709+ doing, todo, gointo);
4710+ /* if insertion point was actually moved into new node,
4711+ update carry node pointer in operation. */
4712+ node = sync_op(op, fresh);
4713+ not_enough_space = free_space_shortage(node, op);
4714+ if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4715+ /* there is not enough free space in the new node. Shift
4716+ insertion point back to the @shadow_node so that
4717+ next new node would be inserted between
4718+ @shadow_node and @fresh.
4719+ */
4720+ coord_normalize(&coord_shadow);
4721+ coord_dup(coord, &coord_shadow);
4722+ node = coord->node;
4723+ op->node = node_shadow;
4724+ if (1 || (flags & COPI_STEP_BACK)) {
4725+ /* still not enough space?! Maybe there is
4726+ enough space in the source node (i.e., node
4727+ data are moved from) now.
4728+ */
4729+ not_enough_space =
4730+ free_space_shortage(node, op);
4731+ }
4732+ }
4733+ }
4734+ if (not_enough_space > 0) {
4735+ if (!(flags & COPI_DONT_ALLOCATE))
4736+ warning("nikita-948", "Cannot insert new item");
4737+ result = -E_NODE_FULL;
4738+ }
4739+ assert("nikita-1622", ergo(result == 0,
4740+ reiser4_carry_real(op->node) == coord->node));
4741+ assert("nikita-2616", coord == op->u.insert.d->coord);
4742+ if (result == 0)
4743+ result = make_space_tail(op, doing, orig_node);
4744+ return result;
4745+}
4746+
4747+/* insert_paste_common() - common part of insert and paste operations
4748+
4749+ This function performs common part of COP_INSERT and COP_PASTE.
4750+
4751+ There are two ways in which insertion/paste can be requested:
4752+
4753+ . by directly supplying reiser4_item_data. In this case, op ->
4754+ u.insert.type is set to COPT_ITEM_DATA.
4755+
4756+ . by supplying pointer to the child which is to be inserted into parent. In this
4757+ case op -> u.insert.type == COPT_CHILD.
4758+
4759+ . by supplying key of new item/unit. This is currently only used during
4760+ extent insertion
4761+
4762+ This is required, because when new node is allocated we don't know at what
4763+ position pointer to it is to be stored in the parent. Actually, we don't
4764+ even know what its parent will be, because parent can be re-balanced
4765+ concurrently and new node re-parented, and because parent can be full and
4766+ pointer to the new node will go into some other node.
4767+
4768+ insert_paste_common() resolves pointer to child node into position in the
4769+ parent by calling find_new_child_coord(), that fills
4770+ reiser4_item_data. After this, insertion/paste proceeds uniformly.
4771+
4772+ Another complication is with finding free space during pasting. It may
4773+ happen that while shifting items to the neighbors and newly allocated
4774+ nodes, insertion coord can no longer be in the item we wanted to paste
4775+ into. At this point, paste becomes (morphs) into insert. Moreover free
4776+ space analysis has to be repeated, because amount of space required for
4777+ insertion is different from that of paste (item header overhead, etc).
4778+
4779+ This function "unifies" different insertion modes (by resolving child
4780+ pointer or key into insertion coord), and then calls make_space() to free
4781+ enough space in the node by shifting data to the left and right and by
4782+ allocating new nodes if necessary. Carry operation knows amount of space
4783+ required for its completion. After enough free space is obtained, caller of
4784+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4785+ by calling item plugin method.
4786+
4787+*/
4788+static int insert_paste_common(carry_op * op /* carry operation being
4789+ * performed */ ,
4790+ carry_level * doing /* current carry level */ ,
4791+ carry_level * todo /* next carry level */ ,
4792+ carry_insert_data * cdata /* pointer to
4793+ * cdata */ ,
4794+ coord_t * coord /* insertion/paste coord */ ,
4795+ reiser4_item_data * data /* data to be
4796+ * inserted/pasted */ )
4797+{
4798+ assert("nikita-981", op != NULL);
4799+ assert("nikita-980", todo != NULL);
4800+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4801+ || (op->op == COP_EXTENT));
4802+
4803+ if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4804+ /* nothing to do. Fall through to make_space(). */
4805+ ;
4806+ } else if (op->u.insert.type == COPT_KEY) {
4807+ node_search_result intra_node;
4808+ znode *node;
4809+ /* Problem with doing batching at the lowest level, is that
4810+ operations here are given by coords where modification is
4811+ to be performed, and one modification can invalidate coords
4812+ of all following operations.
4813+
4814+ So, we are implementing yet another type for operation that
4815+ will use (the only) "locator" stable across shifting of
4816+ data between nodes, etc.: key (COPT_KEY).
4817+
4818+ This clause resolves key to the coord in the node.
4819+
4820+ But node can change also. Probably some pieces have to be
4821+ added to the lock_carry_node(), to lock node by its key.
4822+
4823+ */
4824+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4825+ if you need something else. */
4826+ op->u.insert.d->coord = coord;
4827+ node = reiser4_carry_real(op->node);
4828+ intra_node = node_plugin_by_node(node)->lookup
4829+ (node, op->u.insert.d->key, FIND_EXACT,
4830+ op->u.insert.d->coord);
4831+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4832+ warning("nikita-1715", "Intra node lookup failure: %i",
4833+ intra_node);
4834+ return intra_node;
4835+ }
4836+ } else if (op->u.insert.type == COPT_CHILD) {
4837+ /* if we are asked to insert pointer to the child into
4838+ internal node, first convert pointer to the child into
4839+ coord within parent node.
4840+ */
4841+ znode *child;
4842+ int result;
4843+
4844+ op->u.insert.d = cdata;
4845+ op->u.insert.d->coord = coord;
4846+ op->u.insert.d->data = data;
4847+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4848+ result = find_new_child_coord(op);
4849+ child = reiser4_carry_real(op->u.insert.child);
4850+ if (result != NS_NOT_FOUND) {
4851+ warning("nikita-993",
4852+ "Cannot find a place for child pointer: %i",
4853+ result);
4854+ return result;
4855+ }
4856+ /* This only happens when we did multiple insertions at
4857+ the previous level, trying to insert a single item, and
4858+ it so happened that insertion of pointers to all new
4859+ nodes before this one already caused the parent node to
4860+ split (maybe several times).
4861+
4862+ I am going to come up with better solution.
4863+
4864+ You are not expected to understand this.
4865+ -- v6root/usr/sys/ken/slp.c
4866+
4867+ Basically, what happens here is the following: carry came
4868+ to the parent level and is about to insert internal item
4869+ pointing to the child node that it just inserted in the
4870+ level below. Position where internal item is to be inserted
4871+ was found by find_new_child_coord() above, but node of the
4872+ current carry operation (that is, parent node of child
4873+ inserted on the previous level), was determined earlier in
4874+ the lock_carry_level/lock_carry_node. It could so happen
4875+ that other carry operations already performed on the parent
4876+ level already split parent node, so that insertion point
4877+ moved into another node. Handle this by creating new carry
4878+ node for insertion point if necessary.
4879+ */
4880+ if (reiser4_carry_real(op->node) !=
4881+ op->u.insert.d->coord->node) {
4882+ pool_ordering direction;
4883+ znode *z1;
4884+ znode *z2;
4885+ reiser4_key k1;
4886+ reiser4_key k2;
4887+
4888+ /*
4889+ * determine in what direction insertion point
4890+ * moved. Do this by comparing delimiting keys.
4891+ */
4892+ z1 = op->u.insert.d->coord->node;
4893+ z2 = reiser4_carry_real(op->node);
4894+ if (keyle(leftmost_key_in_node(z1, &k1),
4895+ leftmost_key_in_node(z2, &k2)))
4896+ /* insertion point moved to the left */
4897+ direction = POOLO_BEFORE;
4898+ else
4899+ /* insertion point moved to the right */
4900+ direction = POOLO_AFTER;
4901+
4902+ op->node = reiser4_add_carry_skip(doing,
4903+ direction, op->node);
4904+ if (IS_ERR(op->node))
4905+ return PTR_ERR(op->node);
4906+ op->node->node = op->u.insert.d->coord->node;
4907+ op->node->free = 1;
4908+ result = lock_carry_node(doing, op->node);
4909+ if (result != 0)
4910+ return result;
4911+ }
4912+
4913+ /*
4914+ * set up key of an item being inserted: we are inserting an
4915+ * internal item and its key is (by the very definition of a
4916+ * search tree) the leftmost key in the child node.
4917+ */
4918+ write_lock_dk(znode_get_tree(child));
4919+ op->u.insert.d->key = leftmost_key_in_node(child,
4920+ znode_get_ld_key(child));
4921+ write_unlock_dk(znode_get_tree(child));
4922+ op->u.insert.d->data->arg = op->u.insert.brother;
4923+ } else {
4924+ assert("vs-243", op->u.insert.d->coord != NULL);
4925+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4926+ }
4927+
4928+ /* find free space. */
4929+ return make_space(op, doing, todo);
4930+}
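The mode unification described above can be illustrated outside the kernel: every request variant is first resolved to a position, after which insertion proceeds uniformly. The sketch below is a minimal userspace model with hypothetical names (insert_req, resolve_pos); it is not the reiser4 API:

#include <stddef.h>

enum req_type { REQ_POS, REQ_KEY };

struct insert_req {
	enum req_type type;
	size_t pos;	/* valid for REQ_POS: coord supplied directly */
	int key;	/* valid for REQ_KEY: locator stable across shifts */
};

/* resolve a request to an insertion index in sorted keys[0..n) */
static size_t resolve_pos(const struct insert_req *req,
			  const int *keys, size_t n)
{
	size_t lo = 0, hi = n;

	if (req->type == REQ_POS)
		return req->pos;
	while (lo < hi) {		/* lower_bound lookup by key */
		size_t mid = lo + (hi - lo) / 2;

		if (keys[mid] < req->key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;			/* from here on, all modes look alike */
}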
4931+
4932+/* handle carry COP_INSERT operation.
4933+
4934+ Insert new item into node. New item can be given in one of two ways:
4935+
4936+ - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4937+ only applicable at the leaf/twig level.
4938+
4939+ - by passing a pointer to the child node which is to be inserted by this
4940+ operation.
4941+
4942+*/
4943+static int carry_insert(carry_op * op /* operation to perform */ ,
4944+ carry_level * doing /* queue of operations @op
4945+ * is part of */ ,
4946+ carry_level * todo /* queue where new operations
4947+ * are accumulated */ )
4948+{
4949+ znode *node;
4950+ carry_insert_data cdata;
4951+ coord_t coord;
4952+ reiser4_item_data data;
4953+ carry_plugin_info info;
4954+ int result;
4955+
4956+ assert("nikita-1036", op != NULL);
4957+ assert("nikita-1037", todo != NULL);
4958+ assert("nikita-1038", op->op == COP_INSERT);
4959+
4960+ coord_init_zero(&coord);
4961+
4962+ /* perform common functionality of insert and paste. */
4963+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
4964+ if (result != 0)
4965+ return result;
4966+
4967+ node = op->u.insert.d->coord->node;
4968+ assert("nikita-1039", node != NULL);
4969+ assert("nikita-1040", node_plugin_by_node(node) != NULL);
4970+
4971+ assert("nikita-949",
4972+ space_needed_for_op(node, op) <= znode_free_space(node));
4973+
4974+ /* ask node layout to create new item. */
4975+ info.doing = doing;
4976+ info.todo = todo;
4977+ result = node_plugin_by_node(node)->create_item
4978+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
4979+ &info);
4980+ doing->restartable = 0;
4981+ znode_make_dirty(node);
4982+
4983+ return result;
4984+}
4985+
4986+/*
4987+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
4988+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree
4989+ * by slicing into multiple items.
4990+ */
4991+
4992+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
4993+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
4994+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
4995+
4996+static size_t item_data_overhead(carry_op * op)
4997+{
4998+ if (flow_insert_data(op)->iplug->b.estimate == NULL)
4999+ return 0;
5000+ return (flow_insert_data(op)->iplug->b.
5001+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5002+ flow_insert_data(op)->length);
5003+}
5004+
5005+/* FIXME-VS: this is called several times during one make_space_for_flow_insertion
5006+ and it will always return the same result. Some optimization could be made
5007+ by calculating this value once at the beginning and passing it around. That
5008+ would reduce some flexibility in future changes.
5009+*/
5010+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5011+static size_t flow_insertion_overhead(carry_op * op)
5012+{
5013+ znode *node;
5014+ size_t insertion_overhead;
5015+
5016+ node = flow_insert_point(op)->node;
5017+ insertion_overhead = 0;
5018+ if (node->nplug->item_overhead &&
5019+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5020+ flow_insert_data(op)))
5021+ insertion_overhead =
5022+ node->nplug->item_overhead(node, NULL) +
5023+ item_data_overhead(op);
5024+ return insertion_overhead;
5025+}
5026+
5027+/* how many bytes of the flow fit into the node */
5028+static int what_can_fit_into_node(carry_op * op)
5029+{
5030+ size_t free, overhead;
5031+
5032+ overhead = flow_insertion_overhead(op);
5033+ free = znode_free_space(flow_insert_point(op)->node);
5034+ if (free <= overhead)
5035+ return 0;
5036+ free -= overhead;
5037+ /* FIXME: flow->length is loff_t only to avoid overflow in case of an expanding truncate */
5038+ if (free < op->u.insert_flow.flow->length)
5039+ return free;
5040+ return (int)op->u.insert_flow.flow->length;
5041+}
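The free-space arithmetic above is easy to check in isolation. A minimal sketch with hypothetical numbers and names (fits_into_node is illustrative, not the kernel function):

#include <assert.h>
#include <stddef.h>

/* bytes of a flow that fit once insertion overhead is accounted for */
static size_t fits_into_node(size_t free_space, size_t overhead,
			     size_t flow_len)
{
	if (free_space <= overhead)
		return 0;
	free_space -= overhead;
	return free_space < flow_len ? free_space : flow_len;
}

static void example(void)
{
	assert(fits_into_node(4096, 24, 10000) == 4072);	/* partial fit */
	assert(fits_into_node(4096, 24, 100) == 100);		/* whole flow fits */
	assert(fits_into_node(16, 24, 100) == 0);		/* overhead eats it */
}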
5042+
5043+/* in make_space_for_flow_insertion we need to check either whether the whole
5044+ flow fits into a node or whether a minimal fraction of it does */
5045+static int enough_space_for_whole_flow(carry_op * op)
5046+{
5047+ return (unsigned)what_can_fit_into_node(op) ==
5048+ op->u.insert_flow.flow->length;
5049+}
5050+
5051+#define MIN_FLOW_FRACTION 1
5052+static int enough_space_for_min_flow_fraction(carry_op * op)
5053+{
5054+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5055+
5056+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5057+}
5058+
5059+/* this returns 0 if the left neighbor was obtained successfully, everything
5060+ up to and including the insertion point was shifted into it, and the left
5061+ neighbor still has enough free space for a minimal fraction of the flow */
5062+static int
5063+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5064+{
5065+ carry_node *left;
5066+ znode *orig;
5067+
5068+ left = find_left_neighbor(op, doing);
5069+ if (unlikely(IS_ERR(left))) {
5070+ warning("vs-899",
5071+ "make_space_by_shift_left: "
5072+ "error accessing left neighbor: %li", PTR_ERR(left));
5073+ return 1;
5074+ }
5075+ if (left == NULL)
5076+ /* left neighbor either does not exist or is unformatted
5077+ node */
5078+ return 1;
5079+
5080+ orig = flow_insert_point(op)->node;
5081+ /* try to shift content of node @orig from its head up to and
5082+ including the insertion point into the left neighbor */
5083+ carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5084+ reiser4_carry_real(left), doing, todo,
5085+ 1 /* including insert point */);
5086+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5087+ /* insertion point did not move */
5088+ return 1;
5089+ }
5090+
5091+ /* insertion point is set after last item in the node */
5092+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5093+
5094+ if (!enough_space_for_min_flow_fraction(op)) {
5095+ /* insertion point node does not have enough free space to put
5096+ even minimal portion of flow into it, therefore, move
5097+ insertion point back to orig node (before first item) */
5098+ coord_init_before_first_item(flow_insert_point(op), orig);
5099+ return 1;
5100+ }
5101+
5102+ /* part of flow is to be written to the end of node */
5103+ op->node = left;
5104+ return 0;
5105+}
5106+
5107+/* this returns 0 if the right neighbor was obtained successfully, everything
5108+ to the right of the insertion point was shifted to it, and the node got
5109+ enough free space for a minimal fraction of the flow */
5110+static int
5111+make_space_by_shift_right(carry_op * op, carry_level * doing,
5112+ carry_level * todo)
5113+{
5114+ carry_node *right;
5115+
5116+ right = find_right_neighbor(op, doing);
5117+ if (unlikely(IS_ERR(right))) {
5118+ warning("nikita-1065", "shift_right_excluding_insert_point: "
5119+ "error accessing right neighbor: %li", PTR_ERR(right));
5120+ return 1;
5121+ }
5122+ if (right) {
5123+ /* shift everything possible on the right of but excluding
5124+ insertion coord into the right neighbor */
5125+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5126+ reiser4_carry_real(right), doing, todo,
5127+ 0 /* not including insert point */);
5128+ } else {
5129+ /* right neighbor either does not exist or is unformatted
5130+ node */
5131+ ;
5132+ }
5133+ if (coord_is_after_rightmost(flow_insert_point(op))) {
5134+ if (enough_space_for_min_flow_fraction(op)) {
5135+ /* part of flow is to be written to the end of node */
5136+ return 0;
5137+ }
5138+ }
5139+
5140+ /* new node is to be added if insert point node did not get enough
5141+ space for whole flow */
5142+ return 1;
5143+}
5144+
5145+/* this returns 0 when the insert coord is set at the node end and a fraction
5146+ of the flow fits into that node */
5147+static int
5148+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5149+{
5150+ int result;
5151+ znode *node;
5152+ carry_node *new;
5153+
5154+ node = flow_insert_point(op)->node;
5155+
5156+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5157+ return RETERR(-E_NODE_FULL);
5158+ /* add new node after insert point node */
5159+ new = add_new_znode(node, op->node, doing, todo);
5160+ if (unlikely(IS_ERR(new))) {
5161+ return PTR_ERR(new);
5162+ }
5163+ result = lock_carry_node(doing, new);
5164+ zput(reiser4_carry_real(new));
5165+ if (unlikely(result)) {
5166+ return result;
5167+ }
5168+ op->u.insert_flow.new_nodes++;
5169+ if (!coord_is_after_rightmost(flow_insert_point(op))) {
5170+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5171+ reiser4_carry_real(new), doing, todo,
5172+ 0 /* not including insert point */);
5173+ assert("vs-901",
5174+ coord_is_after_rightmost(flow_insert_point(op)));
5175+
5176+ if (enough_space_for_min_flow_fraction(op)) {
5177+ return 0;
5178+ }
5179+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5180+ return RETERR(-E_NODE_FULL);
5181+
5182+ /* add one more new node */
5183+ new = add_new_znode(node, op->node, doing, todo);
5184+ if (unlikely(IS_ERR(new))) {
5185+ return PTR_ERR(new);
5186+ }
5187+ result = lock_carry_node(doing, new);
5188+ zput(reiser4_carry_real(new));
5189+ if (unlikely(result)) {
5190+ return result;
5191+ }
5192+ op->u.insert_flow.new_nodes++;
5193+ }
5194+
5195+ /* move insertion point to new node */
5196+ coord_init_before_first_item(flow_insert_point(op),
5197+ reiser4_carry_real(new));
5198+ op->node = new;
5199+ return 0;
5200+}
5201+
5202+static int
5203+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5204+ carry_level * todo)
5205+{
5206+ __u32 flags = op->u.insert_flow.flags;
5207+
5208+ if (enough_space_for_whole_flow(op)) {
5209+ /* whole flow fits into insert point node */
5210+ return 0;
5211+ }
5212+
5213+ if (!(flags & COPI_DONT_SHIFT_LEFT)
5214+ && (make_space_by_shift_left(op, doing, todo) == 0)) {
5215+ /* insert point is shifted to left neighbor of original insert
5216+ point node and is set after last unit in that node. It has
5217+ enough space to fit at least minimal fraction of flow. */
5218+ return 0;
5219+ }
5220+
5221+ if (enough_space_for_whole_flow(op)) {
5222+ /* whole flow fits into insert point node */
5223+ return 0;
5224+ }
5225+
5226+ if (!(flags & COPI_DONT_SHIFT_RIGHT)
5227+ && (make_space_by_shift_right(op, doing, todo) == 0)) {
5228+ /* insert point is still set to the same node, but there is
5229+ nothing to the right of insert point. */
5230+ return 0;
5231+ }
5232+
5233+ if (enough_space_for_whole_flow(op)) {
5234+ /* whole flow fits into insert point node */
5235+ return 0;
5236+ }
5237+
5238+ return make_space_by_new_nodes(op, doing, todo);
5239+}
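The function above reads as a three-step fallback ladder: try to shift left, then right, then allocate new nodes, re-checking after each step whether the whole flow now fits. Restated compactly as a sketch, with hypothetical stub names standing in for the real checks and shifts:

static int whole_flow_fits(void) { return 0; }		/* hypothetical stubs */
static int shift_left(void) { return 1; }		/* standing in for the */
static int shift_right(void) { return 1; }		/* real checks/shifts; */
static int allocate_new_nodes(void) { return 0; }	/* 0 means success */

static int make_space_sketch(int may_shift_left, int may_shift_right)
{
	if (whole_flow_fits())
		return 0;
	if (may_shift_left && shift_left() == 0)
		return 0;
	if (whole_flow_fits())		/* re-check after each shift */
		return 0;
	if (may_shift_right && shift_right() == 0)
		return 0;
	if (whole_flow_fits())
		return 0;
	return allocate_new_nodes();	/* last resort */
}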
5240+
5241+/* implements COP_INSERT_FLOW operation */
5242+static int
5243+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5244+{
5245+ int result;
5246+ flow_t *f;
5247+ coord_t *insert_point;
5248+ node_plugin *nplug;
5249+ carry_plugin_info info;
5250+ znode *orig_node;
5251+ lock_handle *orig_lh;
5252+
5253+ f = op->u.insert_flow.flow;
5254+ result = 0;
5255+
5256+ /* carry system needs this to work */
5257+ info.doing = doing;
5258+ info.todo = todo;
5259+
5260+ orig_node = flow_insert_point(op)->node;
5261+ orig_lh = doing->tracked;
5262+
5263+ while (f->length) {
5264+ result = make_space_for_flow_insertion(op, doing, todo);
5265+ if (result)
5266+ break;
5267+
5268+ insert_point = flow_insert_point(op);
5269+ nplug = node_plugin_by_node(insert_point->node);
5270+
5271+ /* compose item data for insertion/pasting */
5272+ flow_insert_data(op)->data = f->data;
5273+ flow_insert_data(op)->length = what_can_fit_into_node(op);
5274+
5275+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5276+ /* insert point is set to an item of the file we are writing to and we have to append to it */
5277+ assert("vs-903", insert_point->between == AFTER_UNIT);
5278+ nplug->change_item_size(insert_point,
5279+ flow_insert_data(op)->length);
5280+ flow_insert_data(op)->iplug->b.paste(insert_point,
5281+ flow_insert_data
5282+ (op), &info);
5283+ } else {
5284+ /* new item must be inserted */
5285+ pos_in_node_t new_pos;
5286+ flow_insert_data(op)->length += item_data_overhead(op);
5287+
5288+ /* FIXME-VS: this is because node40_create_item changes
5289+ insert_point for obscure reasons */
5290+ switch (insert_point->between) {
5291+ case AFTER_ITEM:
5292+ new_pos = insert_point->item_pos + 1;
5293+ break;
5294+ case EMPTY_NODE:
5295+ new_pos = 0;
5296+ break;
5297+ case BEFORE_ITEM:
5298+ assert("vs-905", insert_point->item_pos == 0);
5299+ new_pos = 0;
5300+ break;
5301+ default:
5302+ impossible("vs-906",
5303+ "carry_insert_flow: invalid coord");
5304+ new_pos = 0;
5305+ break;
5306+ }
5307+
5308+ nplug->create_item(insert_point, &f->key,
5309+ flow_insert_data(op), &info);
5310+ coord_set_item_pos(insert_point, new_pos);
5311+ }
5312+ coord_init_after_item_end(insert_point);
5313+ doing->restartable = 0;
5314+ znode_make_dirty(insert_point->node);
5315+
5316+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5317+ }
5318+
5319+ if (orig_node != flow_insert_point(op)->node) {
5320+ /* move lock to new insert point */
5321+ done_lh(orig_lh);
5322+ init_lh(orig_lh);
5323+ result =
5324+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5325+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5326+ }
5327+
5328+ return result;
5329+}
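carry_insert_flow() above slices a byte stream across as many nodes as needed: make space, paste what fits, advance the flow, repeat. The loop skeleton, modelled in userspace with a fixed per-node capacity instead of real free-space accounting (all names hypothetical):

#include <string.h>
#include <stddef.h>

#define SK_NODE_CAP 4096	/* stand-in for znode_free_space() */

static void insert_flow_sketch(char dst[][SK_NODE_CAP], size_t ndst,
			       const char *src, size_t len)
{
	size_t node = 0;

	while (len > 0 && node < ndst) {
		size_t chunk = len < SK_NODE_CAP ? len : SK_NODE_CAP;

		memcpy(dst[node], src, chunk);	/* "paste" into the node */
		src += chunk;			/* move_flow_forward() */
		len -= chunk;
		node++;				/* "make space": next node */
	}
}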
5330+
5331+/* implements COP_DELETE operation
5332+
5333+ Remove pointer to @op -> u.delete.child from its parent.
5334+
5335+ This function also handles killing of the tree root if the last pointer
5336+ was removed from it. This is complicated by our handling of the "twig"
5337+ level: the root at the twig level is never killed.
5338+
5339+*/
5340+static int carry_delete(carry_op * op /* operation to be performed */ ,
5341+ carry_level * doing UNUSED_ARG /* current carry
5342+ * level */ ,
5343+ carry_level * todo /* next carry level */ )
5344+{
5345+ int result;
5346+ coord_t coord;
5347+ coord_t coord2;
5348+ znode *parent;
5349+ znode *child;
5350+ carry_plugin_info info;
5351+ reiser4_tree *tree;
5352+
5353+ /*
5354+ * This operation is called to delete internal item pointing to the
5355+ * child node that was removed by carry from the tree on the previous
5356+ * tree level.
5357+ */
5358+
5359+ assert("nikita-893", op != NULL);
5360+ assert("nikita-894", todo != NULL);
5361+ assert("nikita-895", op->op == COP_DELETE);
5362+
5363+ coord_init_zero(&coord);
5364+ coord_init_zero(&coord2);
5365+
5366+ parent = reiser4_carry_real(op->node);
5367+ child = op->u.delete.child ?
5368+ reiser4_carry_real(op->u.delete.child) : op->node->node;
5369+ tree = znode_get_tree(child);
5370+ read_lock_tree(tree);
5371+
5372+ /*
5373+ * @parent was determined when carry entered parent level
5374+ * (lock_carry_level/lock_carry_node). Since then, actual parent of
5375+ * @child node could change due to other carry operations performed on
5376+ * the parent level. Check for this.
5377+ */
5378+
5379+ if (znode_parent(child) != parent) {
5380+ /* NOTE-NIKITA add stat counter for this. */
5381+ parent = znode_parent(child);
5382+ assert("nikita-2581", find_carry_node(doing, parent));
5383+ }
5384+ read_unlock_tree(tree);
5385+
5386+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5387+
5388+ /* Twig level horrors: tree should be of height at least 2. So, last
5389+ pointer from the root at twig level is preserved even if child is
5390+ empty. This is ugly, but so it was architected.
5391+ */
5392+
5393+ if (znode_is_root(parent) &&
5394+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5395+ node_num_items(parent) == 1) {
5396+ /* Delimiting key manipulations. */
5397+ write_lock_dk(tree);
5398+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5399+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5400+ ZF_SET(child, JNODE_DKSET);
5401+ write_unlock_dk(tree);
5402+
5403+ /* @child escaped imminent death! */
5404+ ZF_CLR(child, JNODE_HEARD_BANSHEE);
5405+ return 0;
5406+ }
5407+
5408+ /* convert child pointer to the coord_t */
5409+ result = find_child_ptr(parent, child, &coord);
5410+ if (result != NS_FOUND) {
5411+ warning("nikita-994", "Cannot find child pointer: %i", result);
5412+ print_coord_content("coord", &coord);
5413+ return result;
5414+ }
5415+
5416+ coord_dup(&coord2, &coord);
5417+ info.doing = doing;
5418+ info.todo = todo;
5419+ {
5420+ /*
5421+ * Actually kill internal item: prepare structure with
5422+ * arguments for ->cut_and_kill() method...
5423+ */
5424+
5425+ struct carry_kill_data kdata;
5426+ kdata.params.from = &coord;
5427+ kdata.params.to = &coord2;
5428+ kdata.params.from_key = NULL;
5429+ kdata.params.to_key = NULL;
5430+ kdata.params.smallest_removed = NULL;
5431+ kdata.params.truncate = 1;
5432+ kdata.flags = op->u.delete.flags;
5433+ kdata.inode = NULL;
5434+ kdata.left = NULL;
5435+ kdata.right = NULL;
5436+ kdata.buf = NULL;
5437+ /* ... and call it. */
5438+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5439+ &info);
5440+ }
5441+ doing->restartable = 0;
5442+
5443+ /* check whether root should be killed violently */
5444+ if (znode_is_root(parent) &&
5445+ /* don't kill roots at and lower than twig level */
5446+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5447+ node_num_items(parent) == 1) {
5448+ result = reiser4_kill_tree_root(coord.node);
5449+ }
5450+
5451+ return result < 0 ? : 0;
5452+}
5453+
5454+/* implements COP_CUT operation
5455+
5456+ Cuts part or whole content of node.
5457+
5458+*/
5459+static int carry_cut(carry_op * op /* operation to be performed */ ,
5460+ carry_level * doing /* current carry level */ ,
5461+ carry_level * todo /* next carry level */ )
5462+{
5463+ int result;
5464+ carry_plugin_info info;
5465+ node_plugin *nplug;
5466+
5467+ assert("nikita-896", op != NULL);
5468+ assert("nikita-897", todo != NULL);
5469+ assert("nikita-898", op->op == COP_CUT);
5470+
5471+ info.doing = doing;
5472+ info.todo = todo;
5473+
5474+ nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5475+ if (op->u.cut_or_kill.is_cut)
5476+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5477+ else
5478+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5479+
5480+ doing->restartable = 0;
5481+ return result < 0 ? : 0;
5482+}
5483+
5484+/* helper function for carry_paste(): returns true if @op can be continued as
5485+ paste */
5486+static int
5487+can_paste(coord_t * icoord, const reiser4_key * key,
5488+ const reiser4_item_data * data)
5489+{
5490+ coord_t circa;
5491+ item_plugin *new_iplug;
5492+ item_plugin *old_iplug;
5493+ int result = 0; /* to keep gcc shut */
5494+
5495+ assert("", icoord->between != AT_UNIT);
5496+
5497+ /* obviously, one cannot paste when node is empty---there is nothing
5498+ to paste into. */
5499+ if (node_is_empty(icoord->node))
5500+ return 0;
5501+ /* if insertion point is at the middle of the item, then paste */
5502+ if (!coord_is_between_items(icoord))
5503+ return 1;
5504+ coord_dup(&circa, icoord);
5505+ circa.between = AT_UNIT;
5506+
5507+ old_iplug = item_plugin_by_coord(&circa);
5508+ new_iplug = data->iplug;
5509+
5510+ /* check whether we can paste to the item @icoord is "at" when we
5511+ ignore ->between field */
5512+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5513+ result = 1;
5514+ } else if (icoord->between == BEFORE_UNIT
5515+ || icoord->between == BEFORE_ITEM) {
5516+ /* otherwise, try to glue to the item at the left, if any */
5517+ coord_dup(&circa, icoord);
5518+ if (coord_set_to_left(&circa)) {
5519+ result = 0;
5520+ coord_init_before_item(icoord);
5521+ } else {
5522+ old_iplug = item_plugin_by_coord(&circa);
5523+ result = (old_iplug == new_iplug)
5524+ && item_can_contain_key(icoord, key, data);
5525+ if (result) {
5526+ coord_dup(icoord, &circa);
5527+ icoord->between = AFTER_UNIT;
5528+ }
5529+ }
5530+ } else if (icoord->between == AFTER_UNIT
5531+ || icoord->between == AFTER_ITEM) {
5532+ coord_dup(&circa, icoord);
5533+ /* otherwise, try to glue to the item at the right, if any */
5534+ if (coord_set_to_right(&circa)) {
5535+ result = 0;
5536+ coord_init_after_item(icoord);
5537+ } else {
5538+ int (*cck) (const coord_t *, const reiser4_key *,
5539+ const reiser4_item_data *);
5540+
5541+ old_iplug = item_plugin_by_coord(&circa);
5542+
5543+ cck = old_iplug->b.can_contain_key;
5544+ if (cck == NULL)
5545+ /* item doesn't define ->can_contain_key
5546+ method? So it is not expandable. */
5547+ result = 0;
5548+ else {
5549+ result = (old_iplug == new_iplug)
5550+ && cck(&circa /*icoord */ , key, data);
5551+ if (result) {
5552+ coord_dup(icoord, &circa);
5553+ icoord->between = BEFORE_UNIT;
5554+ }
5555+ }
5556+ }
5557+ } else
5558+ impossible("nikita-2513", "Nothing works");
5559+ if (result) {
5560+ if (icoord->between == BEFORE_ITEM) {
5561+ assert("vs-912", icoord->unit_pos == 0);
5562+ icoord->between = BEFORE_UNIT;
5563+ } else if (icoord->between == AFTER_ITEM) {
5564+ coord_init_after_item_end(icoord);
5565+ }
5566+ }
5567+ return result;
5568+}
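The gluing rules of can_paste() in miniature: a paste is possible only into an item that uses the same plugin and whose key range can absorb the new key; otherwise the items on either side of the gap are tried before falling back to a plain insert. A flat sketch with hypothetical structures, not the kernel coord machinery:

#include <stddef.h>

struct mini_item {
	int plug;	/* item plugin id */
	int lo, hi;	/* key range the item can absorb */
};

static int mini_can_paste(const struct mini_item *it, int plug, int key)
{
	return it != NULL && it->plug == plug && it->lo <= key && key <= it->hi;
}

/* try the item left of the gap, then the item right of it */
static const struct mini_item *
paste_target(const struct mini_item *left, const struct mini_item *right,
	     int plug, int key)
{
	if (mini_can_paste(left, plug, key))
		return left;
	if (mini_can_paste(right, plug, key))
		return right;
	return NULL;	/* no paste possible: fall back to insert */
}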
5569+
5570+/* implements COP_PASTE operation
5571+
5572+ Paste data into existing item. This is complicated by the fact that after
5573+ we shifted something to the left or right neighbors trying to free some
5574+ space, item we were supposed to paste into can be in different node than
5575+ insertion coord. If so, we are no longer doing paste, but insert. See
5576+ comments in insert_paste_common().
5577+
5578+*/
5579+static int carry_paste(carry_op * op /* operation to be performed */ ,
5580+ carry_level * doing UNUSED_ARG /* current carry
5581+ * level */ ,
5582+ carry_level * todo /* next carry level */ )
5583+{
5584+ znode *node;
5585+ carry_insert_data cdata;
5586+ coord_t dcoord;
5587+ reiser4_item_data data;
5588+ int result;
5589+ int real_size;
5590+ item_plugin *iplug;
5591+ carry_plugin_info info;
5592+ coord_t *coord;
5593+
5594+ assert("nikita-982", op != NULL);
5595+ assert("nikita-983", todo != NULL);
5596+ assert("nikita-984", op->op == COP_PASTE);
5597+
5598+ coord_init_zero(&dcoord);
5599+
5600+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5601+ if (result != 0)
5602+ return result;
5603+
5604+ coord = op->u.insert.d->coord;
5605+
5606+ /* handle case when op -> u.insert.coord doesn't point to the item
5607+ of required type. restart as insert. */
5608+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5609+ op->op = COP_INSERT;
5610+ op->u.insert.type = COPT_PASTE_RESTARTED;
5611+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5612+
5613+ return result;
5614+ }
5615+
5616+ node = coord->node;
5617+ iplug = item_plugin_by_coord(coord);
5618+ assert("nikita-992", iplug != NULL);
5619+
5620+ assert("nikita-985", node != NULL);
5621+ assert("nikita-986", node_plugin_by_node(node) != NULL);
5622+
5623+ assert("nikita-987",
5624+ space_needed_for_op(node, op) <= znode_free_space(node));
5625+
5626+ assert("nikita-1286", coord_is_existing_item(coord));
5627+
5628+ /*
5629+ * if item is expanded as a result of this operation, we should first
5630+ * change item size, than call ->b.paste item method. If item is
5631+ * shrunk, it should be done other way around: first call ->b.paste
5632+ * method, then reduce item size.
5633+ */
5634+
5635+ real_size = space_needed_for_op(node, op);
5636+ if (real_size > 0)
5637+ node->nplug->change_item_size(coord, real_size);
5638+
5639+ doing->restartable = 0;
5640+ info.doing = doing;
5641+ info.todo = todo;
5642+
5643+ result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5644+
5645+ if (real_size < 0)
5646+ node->nplug->change_item_size(coord, real_size);
5647+
5648+ /* if we pasted at the beginning of the item, update item's key. */
5649+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5650+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5651+
5652+ znode_make_dirty(node);
5653+ return result;
5654+}
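The grow-before/shrink-after rule in the comment above guarantees that the item is never smaller than the data it holds at any intermediate point. A sketch with hypothetical helpers, not the node plugin interface:

static void change_item_size(int delta) { (void)delta; }	/* hypothetical */
static void do_paste(void) { }					/* stubs */

static void paste_with_resize(int delta)
{
	if (delta > 0)
		change_item_size(delta);	/* grow first, ... */
	do_paste();				/* ... then move the bytes */
	if (delta < 0)
		change_item_size(delta);	/* shrink only afterwards */
}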
5655+
5656+/* handle carry COP_EXTENT operation. */
5657+static int carry_extent(carry_op * op /* operation to perform */ ,
5658+ carry_level * doing /* queue of operations @op
5659+ * is part of */ ,
5660+ carry_level * todo /* queue where new operations
5661+ * are accumulated */ )
5662+{
5663+ znode *node;
5664+ carry_insert_data cdata;
5665+ coord_t coord;
5666+ reiser4_item_data data;
5667+ carry_op *delete_dummy;
5668+ carry_op *insert_extent;
5669+ int result;
5670+ carry_plugin_info info;
5671+
5672+ assert("nikita-1751", op != NULL);
5673+ assert("nikita-1752", todo != NULL);
5674+ assert("nikita-1753", op->op == COP_EXTENT);
5675+
5676+ /* extent insertion overview:
5677+
5678+ extents live on the TWIG LEVEL, which is level one above the leaf
5679+ one. This complicates extent insertion logic somewhat: it may
5680+ happen (and is going to happen all the time) that in logical key
5681+ ordering extent has to be placed between items I1 and I2, located
5682+ at the leaf level, but I1 and I2 are in the same formatted leaf
5683+ node N1. To insert extent one has to
5684+
5685+ (1) reach node N1 and shift data between N1, its neighbors and
5686+ possibly newly allocated nodes until I1 and I2 fall into different
5687+ nodes. Since I1 and I2 are still neighboring items in logical key
5688+ order, they will necessarily be the utmost items in their respective
5689+ nodes.
5690+
5691+ (2) After this, the new extent item is inserted into a node on the twig
5692+ level.
5693+
5694+ Fortunately this process can reuse almost all code from standard
5695+ insertion procedure (viz. make_space() and insert_paste_common()),
5696+ due to the following observation: make_space() only shifts data up
5697+ to and excluding or including insertion point. It never
5698+ "over-moves" through insertion point. Thus, one can use
5699+ make_space() to perform step (1). All that is required for this is to
5700+ instruct free_space_shortage() to keep make_space() shifting data
5701+ until insertion point is at the node border.
5702+
5703+ */
5704+
5705+ /* perform common functionality of insert and paste. */
5706+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5707+ if (result != 0)
5708+ return result;
5709+
5710+ node = op->u.extent.d->coord->node;
5711+ assert("nikita-1754", node != NULL);
5712+ assert("nikita-1755", node_plugin_by_node(node) != NULL);
5713+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5714+
5715+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5716+ extent fits between items. */
5717+
5718+ info.doing = doing;
5719+ info.todo = todo;
5720+
5721+ /* there is another complication due to placement of extents on the
5722+ twig level: extents are "rigid" in the sense that the key-range
5723+ occupied by an extent cannot grow indefinitely to the right as it
5724+ can for formatted leaf nodes. Because of this, when search finds two
5725+ adjacent extents on the twig level, it has to "drill" to the leaf
5726+ level, creating a new node. Here we are removing this node.
5727+ */
5728+ if (node_is_empty(node)) {
5729+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5730+ if (IS_ERR(delete_dummy))
5731+ return PTR_ERR(delete_dummy);
5732+ delete_dummy->u.delete.child = NULL;
5733+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5734+ ZF_SET(node, JNODE_HEARD_BANSHEE);
5735+ }
5736+
5737+ /* proceed with inserting extent item into parent. We are definitely
5738+ inserting rather than pasting if we get that far. */
5739+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5740+ if (IS_ERR(insert_extent))
5741+ /* @delete_dummy will be automatically destroyed on the level
5742+ exiting */
5743+ return PTR_ERR(insert_extent);
5744+ /* NOTE-NIKITA insertion by key is simplest option here. Another
5745+ possibility is to insert on the left or right of already existing
5746+ item.
5747+ */
5748+ insert_extent->u.insert.type = COPT_KEY;
5749+ insert_extent->u.insert.d = op->u.extent.d;
5750+ assert("nikita-1719", op->u.extent.d->key != NULL);
5751+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5752+ insert_extent->u.insert.flags =
5753+ znode_get_tree(node)->carry.new_extent_flags;
5754+
5755+ /*
5756+ * if carry was asked to track lock handle we should actually track
5757+ * lock handle on the twig node rather than on the leaf where
5758+ * operation was started from. Transfer tracked lock handle.
5759+ */
5760+ if (doing->track_type) {
5761+ assert("nikita-3242", doing->tracked != NULL);
5762+ assert("nikita-3244", todo->tracked == NULL);
5763+ todo->tracked = doing->tracked;
5764+ todo->track_type = CARRY_TRACK_NODE;
5765+ doing->tracked = NULL;
5766+ doing->track_type = 0;
5767+ }
5768+
5769+ return 0;
5770+}
5771+
5772+/* update key in @parent between pointers to @left and @right.
5773+
5774+ Find coords of @left and @right and update delimiting key between them.
5775+ This is helper function called by carry_update(). Finds position of
5776+ internal item involved. Updates item key. Updates delimiting keys of child
5777+ nodes involved.
5778+*/
5779+static int update_delimiting_key(znode * parent /* node key is updated
5780+ * in */ ,
5781+ znode * left /* child of @parent */ ,
5782+ znode * right /* child of @parent */ ,
5783+ carry_level * doing /* current carry
5784+ * level */ ,
5785+ carry_level * todo /* parent carry
5786+ * level */ ,
5787+ const char **error_msg /* place to
5788+ * store error
5789+ * message */ )
5790+{
5791+ coord_t left_pos;
5792+ coord_t right_pos;
5793+ int result;
5794+ reiser4_key ldkey;
5795+ carry_plugin_info info;
5796+
5797+ assert("nikita-1177", right != NULL);
5798+ /* find position of the right child in the parent */
5799+ result = find_child_ptr(parent, right, &right_pos);
5800+ if (result != NS_FOUND) {
5801+ *error_msg = "Cannot find position of right child";
5802+ return result;
5803+ }
5804+
5805+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5806+ /* find position of the left child in a parent */
5807+ result = find_child_ptr(parent, left, &left_pos);
5808+ if (result != NS_FOUND) {
5809+ *error_msg = "Cannot find position of left child";
5810+ return result;
5811+ }
5812+ assert("nikita-1355", left_pos.node != NULL);
5813+ } else
5814+ left_pos.node = NULL;
5815+
5816+ /* check that they are separated by exactly one key and are basically
5817+ sane */
5818+ if (REISER4_DEBUG) {
5819+ if ((left_pos.node != NULL)
5820+ && !coord_is_existing_unit(&left_pos)) {
5821+ *error_msg = "Left child is bastard";
5822+ return RETERR(-EIO);
5823+ }
5824+ if (!coord_is_existing_unit(&right_pos)) {
5825+ *error_msg = "Right child is bastard";
5826+ return RETERR(-EIO);
5827+ }
5828+ if (left_pos.node != NULL &&
5829+ !coord_are_neighbors(&left_pos, &right_pos)) {
5830+ *error_msg = "Children are not direct siblings";
5831+ return RETERR(-EIO);
5832+ }
5833+ }
5834+ *error_msg = NULL;
5835+
5836+ info.doing = doing;
5837+ info.todo = todo;
5838+
5839+ /*
5840+ * If child node is not empty, new key of internal item is a key of
5841+ * leftmost item in the child node. If the child is empty, take its
5842+ * right delimiting key as a new key of the internal item. Precise key
5843+ * in the latter case is not important per se, because the child (and
5844+ * the internal item) are going to be killed shortly anyway, but we
5845+ * have to preserve correct order of keys in the parent node.
5846+ */
5847+
5848+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5849+ leftmost_key_in_node(right, &ldkey);
5850+ else {
5851+ read_lock_dk(znode_get_tree(parent));
5852+ ldkey = *znode_get_rd_key(right);
5853+ read_unlock_dk(znode_get_tree(parent));
5854+ }
5855+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5856+ doing->restartable = 0;
5857+ znode_make_dirty(parent);
5858+ return 0;
5859+}
5860+
5861+/* implements COP_UPDATE operation
5862+
5863+ Update delimiting keys.
5864+
5865+*/
5866+static int carry_update(carry_op * op /* operation to be performed */ ,
5867+ carry_level * doing /* current carry level */ ,
5868+ carry_level * todo /* next carry level */ )
5869+{
5870+ int result;
5871+ carry_node *missing UNUSED_ARG;
5872+ znode *left;
5873+ znode *right;
5874+ carry_node *lchild;
5875+ carry_node *rchild;
5876+ const char *error_msg;
5877+ reiser4_tree *tree;
5878+
5879+ /*
5880+ * This operation is called to update the key of an internal item. This is
5881+ * necessary when carry shifted or cut data on the child
5882+ * level. Arguments of this operation are:
5883+ *
5884+ * @right --- child node. Operation should update key of internal
5885+ * item pointing to @right.
5886+ *
5887+ * @left --- left neighbor of @right. This parameter is optional.
5888+ */
5889+
5890+ assert("nikita-902", op != NULL);
5891+ assert("nikita-903", todo != NULL);
5892+ assert("nikita-904", op->op == COP_UPDATE);
5893+
5894+ lchild = op->u.update.left;
5895+ rchild = op->node;
5896+
5897+ if (lchild != NULL) {
5898+ assert("nikita-1001", lchild->parent);
5899+ assert("nikita-1003", !lchild->left);
5900+ left = reiser4_carry_real(lchild);
5901+ } else
5902+ left = NULL;
5903+
5904+ tree = znode_get_tree(rchild->node);
5905+ read_lock_tree(tree);
5906+ right = znode_parent(rchild->node);
5907+ read_unlock_tree(tree);
5908+
5909+ if (right != NULL) {
5910+ result = update_delimiting_key(right,
5911+ lchild ? lchild->node : NULL,
5912+ rchild->node,
5913+ doing, todo, &error_msg);
5914+ } else {
5915+ error_msg = "Cannot find node to update key in";
5916+ result = RETERR(-EIO);
5917+ }
5918+ /* operation will be reposted to the next level by the
5919+ ->update_item_key() method of node plugin, if necessary. */
5920+
5921+ if (result != 0) {
5922+ warning("nikita-999", "Error updating delimiting key: %s (%i)",
5923+ error_msg ? : "", result);
5924+ }
5925+ return result;
5926+}
5927+
5928+/* move items from @node during carry */
5929+static int carry_shift_data(sideof side /* in what direction to move data */ ,
5930+ coord_t * insert_coord /* coord where new item
5931+ * is to be inserted */ ,
5932+ znode * node /* node which data are moved from */ ,
5933+ carry_level * doing /* active carry queue */ ,
5934+ carry_level * todo /* carry queue where new
5935+ * operations are to be put
5936+ * in */ ,
5937+ unsigned int including_insert_coord_p /* true if
5938+ * @insertion_coord
5939+ * can be moved */ )
5940+{
5941+ int result;
5942+ znode *source;
5943+ carry_plugin_info info;
5944+ node_plugin *nplug;
5945+
5946+ source = insert_coord->node;
5947+
5948+ info.doing = doing;
5949+ info.todo = todo;
5950+
5951+ nplug = node_plugin_by_node(node);
5952+ result = nplug->shift(insert_coord, node,
5953+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5954+ (int)including_insert_coord_p, &info);
5955+ /* the only error the ->shift() method of the node plugin can return is
5956+ -ENOMEM due to carry node/operation allocation. */
5957+ assert("nikita-915", result >= 0 || result == -ENOMEM);
5958+ if (result > 0) {
5959+ /*
5960+ * if some number of bytes was actually shifted, mark nodes
5961+ * dirty, and carry level as non-restartable.
5962+ */
5963+ doing->restartable = 0;
5964+ znode_make_dirty(source);
5965+ znode_make_dirty(node);
5966+ }
5967+
5968+ assert("nikita-2077", coord_check(insert_coord));
5969+ return 0;
5970+}
5971+
5972+typedef carry_node *(*carry_iterator) (carry_node * node);
5973+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
5974+ carry_iterator iterator);
5975+
5976+static carry_node *pool_level_list_prev(carry_node *node)
5977+{
5978+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
5979+}
5980+
5981+/* look for the left neighbor of given carry node in a carry queue.
5982+
5983+ This is used by find_left_neighbor(), but I am not sure that this
5984+ really gives any advantage. More statistics required.
5985+
5986+*/
5987+carry_node *find_left_carry(carry_node * node /* node to find left neighbor
5988+ * of */ ,
5989+ carry_level * level /* level to scan */ )
5990+{
5991+ return find_dir_carry(node, level,
5992+ (carry_iterator) pool_level_list_prev);
5993+}
5994+
5995+static carry_node *pool_level_list_next(carry_node *node)
5996+{
5997+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
5998+}
5999+
6000+/* look for the right neighbor of given carry node in a
6001+ carry queue.
6002+
6003+ This is used by find_right_neighbor(), but I am not sure that this
6004+ really gives any advantage. More statistics required.
6005+
6006+*/
6007+carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6008+ * of */ ,
6009+ carry_level * level /* level to scan */ )
6010+{
6011+ return find_dir_carry(node, level,
6012+ (carry_iterator) pool_level_list_next);
6013+}
6014+
6015+/* look for the left or right neighbor of given carry node in a carry
6016+ queue.
6017+
6018+ Helper function used by find_{left|right}_carry().
6019+*/
6020+static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6021+ * from */ ,
6022+ carry_level * level /* level to scan */ ,
6023+ carry_iterator iterator /* operation to
6024+ * move to the next
6025+ * node */ )
6026+{
6027+ carry_node *neighbor;
6028+
6029+ assert("nikita-1059", node != NULL);
6030+ assert("nikita-1060", level != NULL);
6031+
6032+ /* scan the list of carry nodes dir-ward, skipping all
6033+ carry nodes referencing the same znode. */
6034+ neighbor = node;
6035+ while (1) {
6036+ neighbor = iterator(neighbor);
6037+ if (carry_node_end(level, neighbor))
6038+ /* list head is reached */
6039+ return NULL;
6040+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6041+ return neighbor;
6042+ }
6043+}
6044+
6045+/*
6046+ * Memory reservation estimation.
6047+ *
6048+ * Carry process proceeds through tree levels upwards. Carry assumes that it
6049+ * takes the tree in a consistent state (e.g., that search tree invariants hold),
6050+ * and leaves the tree consistent after it finishes. This means that when some
6051+ * error occurs carry cannot simply return if there are pending carry
6052+ * operations. Generic solution for this problem is carry-undo either as
6053+ * transaction manager feature (requiring checkpoints and isolation), or
6054+ * through some carry specific mechanism.
6055+ *
6056+ * Our current approach is to panic if carry hits an error while tree is
6057+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6058+ * this "memory reservation" mechanism was added.
6059+ *
6060+ * Memory reservation is implemented by perthread-pages.diff patch from
6061+ * core-patches. Its API is defined in <linux/gfp.h>
6062+ *
6063+ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6064+ * void perthread_pages_release(int nrpages);
6065+ * int perthread_pages_count(void);
6066+ *
6067+ * carry estimates its worst case memory requirements at the entry, reserves
6068+ * enough memory, and releases unused pages before returning.
6069+ *
6070+ * Code below estimates worst case memory requirements for a given carry
6071+ * queue. This is done by summing worst case memory requirements for each
6072+ * operation in the queue.
6073+ *
6074+ */
6075+
6076+/*
6077+ * Memory requirements of many operations depend on the tree
6078+ * height. For example, item insertion requires new node to be inserted at
6079+ * each tree level in the worst case. What tree height should be used for
6080+ * estimation? Current tree height is wrong, because tree height can change
6081+ * between the time when estimation was done and the time when operation is
6082+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6083+ * is also not desirable, because it would lead to huge over-estimation
6084+ * all the time. A plausible solution is "capped tree height": if the current tree
6085+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6086+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6087+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6088+ * to be increased even more during short interval of time.
6089+ */
6090+#define TREE_HEIGHT_CAP (5)
6091+
6092+/* return capped tree height for the @tree. See comment above. */
6093+static int cap_tree_height(reiser4_tree * tree)
6094+{
6095+ return max_t(int, tree->height, TREE_HEIGHT_CAP);
6096+}
6097+
6098+/* return capped tree height for the current tree. */
6099+static int capped_height(void)
6100+{
6101+ return cap_tree_height(current_tree);
6102+}
6103+
6104+/* return number of pages required to store given number of bytes */
6105+static int bytes_to_pages(int bytes)
6106+{
6107+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6108+}
6109+
6110+/* how many pages are required to allocate znodes during item insertion. */
6111+static int carry_estimate_znodes(void)
6112+{
6113+ /*
6114+ * Note that we have a problem here: there is no way to
6115+ * reserve pages specifically for the given slab. This means that
6116+ * these pages can be hijacked for some other end.
6117+ */
6118+
6119+ /* in the worst case we need 3 new znodes on each tree level */
6120+ return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6121+}
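A worked example of the estimate above, assuming a 4 KiB page, a hypothetical sizeof(znode) of 200 bytes, and a current tree height of 3 (so the capped height is the TREE_HEIGHT_CAP of 5): 5 levels * 3 znodes * 200 bytes = 3000 bytes, which rounds up to one page. A self-contained sketch, not the kernel macros:

#include <assert.h>

#define SK_PAGE_SHIFT 12		/* assume 4 KiB pages */
#define SK_TREE_HEIGHT_CAP 5

static int sk_bytes_to_pages(int bytes)
{
	return (bytes + (1 << SK_PAGE_SHIFT) - 1) >> SK_PAGE_SHIFT;
}

static int sk_capped_height(int height)
{
	return height > SK_TREE_HEIGHT_CAP ? height : SK_TREE_HEIGHT_CAP;
}

static void sk_example(void)
{
	assert(sk_capped_height(3) == 5);	/* short trees capped up */
	assert(sk_capped_height(7) == 7);	/* tall trees kept as is */
	/* 5 levels * 3 znodes * 200 bytes = 3000 bytes -> 1 page */
	assert(sk_bytes_to_pages(sk_capped_height(3) * 3 * 200) == 1);
}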
6122+
6123+/*
6124+ * how many pages are required to load bitmaps. One bitmap per level.
6125+ */
6126+static int carry_estimate_bitmaps(void)
6127+{
6128+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6129+ int bytes;
6130+
6131+ bytes = capped_height() * (0 + /* bnode should be added, but it is private to
6132+ * bitmap.c, skip for now. */
6133+ 2 * sizeof(jnode)); /* working and commit jnodes */
6134+ return bytes_to_pages(bytes) + 2; /* and their contents */
6135+ } else
6136+ /* bitmaps were pre-loaded during mount */
6137+ return 0;
6138+}
6139+
6140+/* worst case item insertion memory requirements */
6141+static int carry_estimate_insert(carry_op * op, carry_level * level)
6142+{
6143+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6144+ capped_height() + /* new block on each level */
6145+ 1 + /* and possibly extra new block at the leaf level */
6146+ 3; /* loading of leaves into memory */
6147+}
6148+
6149+/* worst case item deletion memory requirements */
6150+static int carry_estimate_delete(carry_op * op, carry_level * level)
6151+{
6152+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6153+ 3; /* loading of leaves into memory */
6154+}
6155+
6156+/* worst case tree cut memory requirements */
6157+static int carry_estimate_cut(carry_op * op, carry_level * level)
6158+{
6159+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6160+ 3; /* loading of leaves into memory */
6161+}
6162+
6163+/* worst case memory requirements of pasting into item */
6164+static int carry_estimate_paste(carry_op * op, carry_level * level)
6165+{
6166+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6167+ capped_height() + /* new block on each level */
6168+ 1 + /* and possibly extra new block at the leaf level */
6169+ 3; /* loading of leaves into memory */
6170+}
6171+
6172+/* worst case memory requirements of extent insertion */
6173+static int carry_estimate_extent(carry_op * op, carry_level * level)
6174+{
6175+ return carry_estimate_insert(op, level) + /* insert extent */
6176+ carry_estimate_delete(op, level); /* kill leaf */
6177+}
6178+
6179+/* worst case memory requirements of key update */
6180+static int carry_estimate_update(carry_op * op, carry_level * level)
6181+{
6182+ return 0;
6183+}
6184+
6185+/* worst case memory requirements of flow insertion */
6186+static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6187+{
6188+ int newnodes;
6189+
6190+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6191+ CARRY_FLOW_NEW_NODES_LIMIT);
6192+ /*
6193+ * roughly estimate insert_flow as a sequence of insertions.
6194+ */
6195+ return newnodes * carry_estimate_insert(op, level);
6196+}
6197+
6198+/* This is dispatch table for carry operations. It can be trivially
6199+ abstracted into useful plugin: tunable balancing policy is a good
6200+ thing. */
6201+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6202+ [COP_INSERT] = {
6203+ .handler = carry_insert,
6204+ .estimate = carry_estimate_insert}
6205+ ,
6206+ [COP_DELETE] = {
6207+ .handler = carry_delete,
6208+ .estimate = carry_estimate_delete}
6209+ ,
6210+ [COP_CUT] = {
6211+ .handler = carry_cut,
6212+ .estimate = carry_estimate_cut}
6213+ ,
6214+ [COP_PASTE] = {
6215+ .handler = carry_paste,
6216+ .estimate = carry_estimate_paste}
6217+ ,
6218+ [COP_EXTENT] = {
6219+ .handler = carry_extent,
6220+ .estimate = carry_estimate_extent}
6221+ ,
6222+ [COP_UPDATE] = {
6223+ .handler = carry_update,
6224+ .estimate = carry_estimate_update}
6225+ ,
6226+ [COP_INSERT_FLOW] = {
6227+ .handler = carry_insert_flow,
6228+ .estimate = carry_estimate_insert_flow}
6229+};
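How a dispatch table like the one above is typically consumed: the caller indexes by opcode, sums ->estimate over the whole queue to reserve memory up front, then runs ->handler on each element. A freestanding sketch with hypothetical types, not the carry queue itself:

#include <stddef.h>

struct sk_op { int opcode; };

struct sk_handler {
	int (*handler)(struct sk_op *op);
	int (*estimate)(struct sk_op *op);
};

static int sk_run_queue(const struct sk_handler *table,
			struct sk_op *ops, size_t n)
{
	size_t i;
	int pages = 0, ret = 0;

	for (i = 0; i < n; i++)		/* reserve worst case up front */
		pages += table[ops[i].opcode].estimate(&ops[i]);
	(void)pages;	/* would go to perthread_pages_reserve() per the text */
	for (i = 0; i < n && ret == 0; i++)
		ret = table[ops[i].opcode].handler(&ops[i]);
	return ret;
}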
6230+
6231+/* Make Linus happy.
6232+ Local variables:
6233+ c-indentation-style: "K&R"
6234+ mode-name: "LC"
6235+ c-basic-offset: 8
6236+ tab-width: 8
6237+ fill-column: 120
6238+ scroll-step: 1
6239+ End:
6240+*/
6241diff -urN linux-2.6.22.orig/fs/reiser4/carry_ops.h linux-2.6.22/fs/reiser4/carry_ops.h
6242--- linux-2.6.22.orig/fs/reiser4/carry_ops.h 1970-01-01 03:00:00.000000000 +0300
6243+++ linux-2.6.22/fs/reiser4/carry_ops.h 2007-07-29 00:25:34.828684053 +0400
6244@@ -0,0 +1,42 @@
6245+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6246+
6247+/* implementation of carry operations. See carry_ops.c for details. */
6248+
6249+#if !defined( __CARRY_OPS_H__ )
6250+#define __CARRY_OPS_H__
6251+
6252+#include "forward.h"
6253+#include "znode.h"
6254+#include "carry.h"
6255+
6256+/* carry operation handlers */
6257+typedef struct carry_op_handler {
6258+ /* perform operation */
6259+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6260+ /* estimate memory requirements for @op */
6261+ int (*estimate) (carry_op * op, carry_level * level);
6262+} carry_op_handler;
6263+
6264+/* This is dispatch table for carry operations. It can be trivially
6265+ abstracted into useful plugin: tunable balancing policy is a good
6266+ thing. */
6267+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6268+
6269+unsigned int space_needed(const znode * node, const coord_t * coord,
6270+ const reiser4_item_data * data, int inserting);
6271+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6272+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6273+
6274+/* __CARRY_OPS_H__ */
6275+#endif
6276+
6277+/* Make Linus happy.
6278+ Local variables:
6279+ c-indentation-style: "K&R"
6280+ mode-name: "LC"
6281+ c-basic-offset: 8
6282+ tab-width: 8
6283+ fill-column: 120
6284+ scroll-step: 1
6285+ End:
6286+*/
6287diff -urN linux-2.6.22.orig/fs/reiser4/context.c linux-2.6.22/fs/reiser4/context.c
6288--- linux-2.6.22.orig/fs/reiser4/context.c 1970-01-01 03:00:00.000000000 +0300
6289+++ linux-2.6.22/fs/reiser4/context.c 2007-07-29 00:25:34.832685088 +0400
6290@@ -0,0 +1,288 @@
6291+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6292+
6293+/* Manipulation of reiser4_context */
6294+
6295+/*
6296+ * global context used during system call. Variable of this type is allocated
6297+ * on the stack at the beginning of the reiser4 part of the system call and
6298+ * pointer to it is stored in the current->fs_context. This allows us to avoid
6299+ * passing pointer to current transaction and current lockstack (both in
6300+ * one-to-one mapping with threads) all over the call chain.
6301+ *
6302+ * It's kind of like those global variables the prof used to tell you not to
6303+ * use in CS1, except thread specific.;-) Nikita, this was a good idea.
6304+ *
6305+ * In some situations it is desirable to have ability to enter reiser4_context
6306+ * more than once for the same thread (nested contexts). For example, there
6307+ * are some functions that can be called either directly from VFS/VM or from
6308+ * already active reiser4 context (->writepage, for example).
6309+ *
6310+ * In such situations "child" context acts like dummy: all activity is
6311+ * actually performed in the top level context, and get_current_context()
6312+ * always returns top level context.
6313+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6314+ * nested anyway.
6315+ *
6316+ * Note that there is an important difference between reiser4 uses
6317+ * ->fs_context and the way other file systems use it. Other file systems
6318+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6319+ * (this is why ->fs_context was initially called ->journal_info). This means
6320+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6321+ * to the file system, they assume that some transaction is already underway,
6322+ * and usually bail out, because starting nested transaction would most likely
6323+ * lead to the deadlock. This gives false positives with reiser4, because we
6324+ * set ->fs_context before starting transaction.
6325+ */
6326+
6327+#include "debug.h"
6328+#include "super.h"
6329+#include "context.h"
6330+
6331+#include <linux/writeback.h> /* balance_dirty_pages() */
6332+#include <linux/hardirq.h>
6333+
6334+static void _reiser4_init_context(reiser4_context * context,
6335+ struct super_block *super)
6336+{
6337+ memset(context, 0, sizeof(*context));
6338+
6339+ context->super = super;
6340+ context->magic = context_magic;
6341+ context->outer = current->journal_info;
6342+ current->journal_info = (void *)context;
6343+ context->nr_children = 0;
6344+ context->gfp_mask = GFP_KERNEL;
6345+
6346+ init_lock_stack(&context->stack);
6347+
6348+ reiser4_txn_begin(context);
6349+
6350+ /* initialize head of tap list */
6351+ INIT_LIST_HEAD(&context->taps);
6352+#if REISER4_DEBUG
6353+ context->task = current;
6354+#endif
6355+ grab_space_enable();
6356+}
6357+
6358+/* initialize context and bind it to the current thread
6359+
6360+ This function should be called at the beginning of reiser4 part of
6361+ syscall.
6362+*/
6363+reiser4_context * reiser4_init_context(struct super_block * super)
6364+{
6365+ reiser4_context *context;
6366+
6367+ assert("nikita-2662", !in_interrupt() && !in_irq());
6368+ assert("nikita-3357", super != NULL);
6369+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6370+
6371+ context = get_current_context_check();
6372+ if (context && context->super == super) {
6373+ context = (reiser4_context *) current->journal_info;
6374+ context->nr_children++;
6375+ return context;
6376+ }
6377+
6378+ context = kmalloc(sizeof(*context), GFP_KERNEL);
6379+ if (context == NULL)
6380+ return ERR_PTR(RETERR(-ENOMEM));
6381+
6382+ _reiser4_init_context(context, super);
6383+ return context;
6384+}
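The nested-context rule above amounts to reference counting on a per-thread singleton: re-entry bumps a counter and returns the existing context, and only the outermost exit tears it down. A userspace sketch with hypothetical names, where GCC's __thread models the per-thread current->journal_info slot:

#include <stdlib.h>

struct sk_ctx { int nr_children; };

static __thread struct sk_ctx *sk_current;

static struct sk_ctx *sk_enter(void)
{
	if (sk_current) {		/* nested call: reuse top level */
		sk_current->nr_children++;
		return sk_current;
	}
	sk_current = calloc(1, sizeof(*sk_current));
	return sk_current;
}

static void sk_exit(struct sk_ctx *ctx)
{
	if (ctx->nr_children) {		/* child context: just unnest */
		ctx->nr_children--;
		return;
	}
	free(ctx);			/* top level: real teardown */
	sk_current = NULL;
}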
6385+
6386+/* this is used in scan_mgr which is called with spinlock held and in
6387+ reiser4_fill_super magic */
6388+void init_stack_context(reiser4_context *context, struct super_block *super)
6389+{
6390+ assert("nikita-2662", !in_interrupt() && !in_irq());
6391+ assert("nikita-3357", super != NULL);
6392+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6393+ assert("vs-12", !is_in_reiser4_context());
6394+
6395+ _reiser4_init_context(context, super);
6396+ context->on_stack = 1;
6397+ return;
6398+}
6399+
6400+/* cast lock stack embedded into reiser4 context up to its container */
6401+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6402+{
6403+ return container_of(owner, reiser4_context, stack);
6404+}
6405+
6406+/* true if there is already _any_ reiser4 context for the current thread */
6407+int is_in_reiser4_context(void)
6408+{
6409+ reiser4_context *ctx;
6410+
6411+ ctx = current->journal_info;
6412+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6413+}
6414+
6415+/*
6416+ * call balance dirty pages for the current context.
6417+ *
6418+ * A file system is expected to call balance_dirty_pages_ratelimited() whenever
6419+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6420+ * write---this covers the vast majority of all dirty traffic), but we cannot do
6421+ * this immediately when a formatted node is dirtied, because a long-term lock is
6422+ * usually held at that time. To work around this, dirtying a formatted node
6423+ * simply increases the ->nr_marked_dirty counter in the current reiser4
6424+ * context. When we are about to leave this context,
6425+ * balance_dirty_pages_ratelimited() is called, if necessary.
6426+ *
6427+ * This introduces another problem: sometimes we do not want to run
6428+ * balance_dirty_pages_ratelimited() when leaving a context, for example
6429+ * because some important lock (like ->i_mutex on the parent directory) is
6430+ * held. To achieve this, ->nobalance flag can be set in the current context.
6431+ */
6432+static void balance_dirty_pages_at(reiser4_context *context)
6433+{
6434+ reiser4_super_info_data *sbinfo = get_super_private(context->super);
6435+
6436+ /*
6437+ * call balance_dirty_pages_ratelimited() to process formatted nodes
6438+ * dirtied during this system call. Do that only if we are not in mount
6439+ * and there were nodes dirtied in this context and we are not in
6440+ * writepage (to avoid deadlock) and not in pdflush
6441+ */
6442+ if (sbinfo != NULL && sbinfo->fake != NULL &&
6443+ context->nr_marked_dirty != 0 &&
6444+ !(current->flags & PF_MEMALLOC) &&
6445+ !current_is_pdflush())
6446+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6447+}
6448+
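The accounting side of the scheme described above is just a counter increment
at the dirtying site; a sketch only, since the real increment lives in the
jnode dirtying path, which is not part of this hunk:

    /* sketch: a formatted node is dirtied while a long-term lock is
     * held, so only record the fact; balancing happens on context exit */
    static void note_formatted_node_dirtied(void)
    {
            get_current_context()->nr_marked_dirty++;
    }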
6449+/* release resources associated with context.
6450+
6451+ This function should be called at the end of "session" with reiser4,
6452+ typically just before leaving reiser4 driver back to VFS.
6453+
6454+ This is a good place to put some debugging consistency checks, like that
6455+ the thread has released all locks, closed the transcrash, etc.
6456+
6457+*/
6458+static void reiser4_done_context(reiser4_context * context /* context being released */ )
6459+{
6460+ assert("nikita-860", context != NULL);
6461+ assert("nikita-859", context->magic == context_magic);
6462+ assert("vs-646", (reiser4_context *) current->journal_info == context);
6463+ assert("zam-686", !in_interrupt() && !in_irq());
6464+
6465+ /* only do anything when leaving top-level reiser4 context. All nested
6466+ * contexts are just dummies. */
6467+ if (context->nr_children == 0) {
6468+ assert("jmacd-673", context->trans == NULL);
6469+ assert("jmacd-1002", lock_stack_isclean(&context->stack));
6470+ assert("nikita-1936", reiser4_no_counters_are_held());
6471+ assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6472+ assert("zam-1004", ergo(get_super_private(context->super),
6473+ get_super_private(context->super)->delete_mutex_owner !=
6474+ current));
6475+
6476+ /* release all grabbed but as yet unused blocks */
6477+ if (context->grabbed_blocks != 0)
6478+ all_grabbed2free();
6479+
6480+ /*
6481+ * synchronize against longterm_unlock_znode():
6482+ * wake_up_requestor() wakes up requestors without holding
6483+ * zlock (otherwise they will immediately bump into that lock
6484+ * after wake up on another CPU). To work around (rare)
6485+ * situation where requestor has been woken up asynchronously
6486+ * and managed to run until completion (and destroy its
6487+ * context and lock stack) before wake_up_requestor() called
6488+ * wake_up() on it, wake_up_requestor() synchronizes on the lock
6489+ * stack spin lock. It has actually been observed that the spin
6490+ * lock _was_ locked at this point, because
6491+ * wake_up_requestor() took an interrupt.
6492+ */
6493+ spin_lock_stack(&context->stack);
6494+ spin_unlock_stack(&context->stack);
6495+
6496+ assert("zam-684", context->nr_children == 0);
6497+ /* restore original ->fs_context value */
6498+ current->journal_info = context->outer;
6499+ if (context->on_stack == 0)
6500+ kfree(context);
6501+ } else {
6502+ context->nr_children--;
6503+#if REISER4_DEBUG
6504+ assert("zam-685", context->nr_children >= 0);
6505+#endif
6506+ }
6507+}
6508+
6509+/*
6510+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6511+ * transaction. Call done_context() to do context related book-keeping.
6512+ */
6513+void reiser4_exit_context(reiser4_context * context)
6514+{
6515+ assert("nikita-3021", reiser4_schedulable());
6516+
6517+ if (context->nr_children == 0) {
6518+ if (!context->nobalance) {
6519+ reiser4_txn_restart(context);
6520+ balance_dirty_pages_at(context);
6521+ }
6522+
6523+ /* if the filesystem is mounted with -o sync or -o dirsync - commit the
6524+ transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
6525+ committing on exit_context when an inode semaphore is held, and
6526+ to have ktxnmgrd do the commit instead, for better
6527+ concurrent filesystem access. But whoever mounts with -o
6528+ sync cares more about reliability than about
6529+ performance. So, for now we have this simple mount -o sync
6530+ support. */
6531+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6532+ txn_atom *atom;
6533+
6534+ atom = get_current_atom_locked_nocheck();
6535+ if (atom) {
6536+ atom->flags |= ATOM_FORCE_COMMIT;
6537+ context->trans->flags &= ~TXNH_DONT_COMMIT;
6538+ spin_unlock_atom(atom);
6539+ }
6540+ }
6541+ reiser4_txn_end(context);
6542+ }
6543+ reiser4_done_context(context);
6544+}
6545+
6546+void reiser4_ctx_gfp_mask_set(void)
6547+{
6548+ reiser4_context *ctx;
6549+
6550+ ctx = get_current_context();
6551+ if (ctx->entd == 0 &&
6552+ list_empty(&ctx->stack.locks) &&
6553+ ctx->trans->atom == NULL)
6554+ ctx->gfp_mask = GFP_KERNEL;
6555+ else
6556+ ctx->gfp_mask = GFP_NOFS;
6557+}
6558+
6559+void reiser4_ctx_gfp_mask_force (gfp_t mask)
6560+{
6561+ reiser4_context *ctx;
6562+ ctx = get_current_context();
6563+
6564+ assert("edward-1454", ctx != NULL);
6565+
6566+ ctx->gfp_mask = mask;
6567+}
6568+
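Allocation sites consult the per-context mask instead of hard-coding
GFP_KERNEL, so that allocations made with an atom open or znodes locked
automatically become GFP_NOFS. A sketch using reiser4_ctx_gfp_mask_get() from
context.h (alloc_scratch is a hypothetical helper):

    /* sketch: allocate with whatever mask is safe for this context */
    static void *alloc_scratch(size_t size)
    {
            return kmalloc(size, reiser4_ctx_gfp_mask_get());
    }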
6569+/*
6570+ * Local variables:
6571+ * c-indentation-style: "K&R"
6572+ * mode-name: "LC"
6573+ * c-basic-offset: 8
6574+ * tab-width: 8
6575+ * fill-column: 120
6576+ * scroll-step: 1
6577+ * End:
6578+ */
6579diff -urN linux-2.6.22.orig/fs/reiser4/context.h linux-2.6.22/fs/reiser4/context.h
6580--- linux-2.6.22.orig/fs/reiser4/context.h 1970-01-01 03:00:00.000000000 +0300
6581+++ linux-2.6.22/fs/reiser4/context.h 2007-07-29 00:25:34.832685088 +0400
6582@@ -0,0 +1,228 @@
6583+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6584+ * reiser4/README */
6585+
6586+/* Reiser4 context. See context.c for details. */
6587+
6588+#if !defined( __REISER4_CONTEXT_H__ )
6589+#define __REISER4_CONTEXT_H__
6590+
6591+#include "forward.h"
6592+#include "debug.h"
6593+#include "dformat.h"
6594+#include "tap.h"
6595+#include "lock.h"
6596+
6597+#include <linux/types.h> /* for __u?? */
6598+#include <linux/fs.h> /* for struct super_block */
6599+#include <linux/spinlock.h>
6600+#include <linux/sched.h> /* for struct task_struct */
6601+
6602+/* reiser4 per-thread context */
6603+struct reiser4_context {
6604+ /* magic constant. For identification of reiser4 contexts. */
6605+ __u32 magic;
6606+
6607+ /* current lock stack. See lock.[ch]. This is where list of all
6608+ locks taken by current thread is kept. This is also used in
6609+ deadlock detection. */
6610+ lock_stack stack;
6611+
6612+ /* current transcrash. */
6613+ txn_handle *trans;
6614+ /* transaction handle embedded into reiser4_context. ->trans points
6615+ * here by default. */
6616+ txn_handle trans_in_ctx;
6617+
6618+ /* super block we are working with. To get the current tree
6619+ use &get_super_private (reiser4_get_current_sb ())->tree. */
6620+ struct super_block *super;
6621+
6622+ /* parent fs activation */
6623+ struct fs_activation *outer;
6624+
6625+ /* per-thread grabbed (for further allocation) blocks counter */
6626+ reiser4_block_nr grabbed_blocks;
6627+
6628+ /* list of taps currently monitored. See tap.c */
6629+ struct list_head taps;
6630+
6631+ /* grabbing space is enabled */
6632+ unsigned int grab_enabled:1;
6633+ /* should be set when we are writing dirty nodes to disk in jnode_flush or
6634+ * reiser4_write_logs() */
6635+ unsigned int writeout_mode:1;
6636+ /* true, if current thread is an ent thread */
6637+ unsigned int entd:1;
6638+ /* true, if balance_dirty_pages() should not be run when leaving this
6639+ * context. This is used to avoid a lengthy balance_dirty_pages()
6640+ * operation when holding some important resource, like directory
6641+ * ->i_mutex */
6642+ unsigned int nobalance:1;
6643+
6644+ /* this bit is used on reiser4_done_context to decide whether context is
6645+ kmalloc-ed and has to be kfree-ed */
6646+ unsigned int on_stack:1;
6647+
6648+ /* count non-trivial jnode_set_dirty() calls */
6649+ unsigned long nr_marked_dirty;
6650+
6651+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6652+ * reiser4_writepages for each dirty inode. Reiser4_writepages
6653+ * captures pages. When the number of pages captured in one
6654+ * reiser4_sync_inodes call reaches some threshold, some atoms get
6655+ * flushed */
6656+ int nr_captured;
6657+ int nr_children; /* number of child contexts */
6658+#if REISER4_DEBUG
6659+ /* debugging information about reiser4 locks held by the current
6660+ * thread */
6661+ reiser4_lock_cnt_info locks;
6662+ struct task_struct *task; /* so we can easily find owner of the stack */
6663+
6664+ /*
6665+ * disk space grabbing debugging support
6666+ */
6667+ /* how many disk blocks were grabbed by the first call to
6668+ * reiser4_grab_space() in this context */
6669+ reiser4_block_nr grabbed_initially;
6670+
6671+ /* list of all threads doing flush currently */
6672+ struct list_head flushers_link;
6673+ /* information about last error encountered by reiser4 */
6674+ err_site err;
6675+#endif
6676+ void *vp;
6677+ gfp_t gfp_mask;
6678+};
6679+
6680+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6681+
6682+/* Debugging helps. */
6683+#if REISER4_DEBUG
6684+extern void print_contexts(void);
6685+#endif
6686+
6687+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6688+#define current_blocksize reiser4_get_current_sb()->s_blocksize
6689+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6690+
6691+extern reiser4_context *reiser4_init_context(struct super_block *);
6692+extern void init_stack_context(reiser4_context *, struct super_block *);
6693+extern void reiser4_exit_context(reiser4_context *);
6694+
6695+/* magic constant we store in a reiser4_context allocated on the stack. Used to
6696+ catch accesses to stale or uninitialized contexts. */
6697+#define context_magic ((__u32) 0x4b1b5d0b)
6698+
6699+extern int is_in_reiser4_context(void);
6700+
6701+/*
6702+ * return reiser4_context for the thread @tsk
6703+ */
6704+static inline reiser4_context *get_context(const struct task_struct *tsk)
6705+{
6706+ assert("vs-1682",
6707+ ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6708+ return (reiser4_context *) tsk->journal_info;
6709+}
6710+
6711+/*
6712+ * return reiser4 context of the current thread, or NULL if there is none.
6713+ */
6714+static inline reiser4_context *get_current_context_check(void)
6715+{
6716+ if (is_in_reiser4_context())
6717+ return get_context(current);
6718+ else
6719+ return NULL;
6720+}
6721+
6722+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
6723+
6724+/* return context associated with current thread */
6725+static inline reiser4_context *get_current_context(void)
6726+{
6727+ return get_context(current);
6728+}
6729+
6730+static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6731+{
6732+ reiser4_context *ctx;
6733+
6734+ ctx = get_current_context_check();
6735+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6736+}
6737+
6738+void reiser4_ctx_gfp_mask_set(void);
6739+void reiser4_ctx_gfp_mask_force (gfp_t mask);
6740+
6741+/*
6742+ * true if current thread is in the write-out mode. Thread enters write-out
6743+ * mode during jnode_flush and reiser4_write_logs().
6744+ */
6745+static inline int is_writeout_mode(void)
6746+{
6747+ return get_current_context()->writeout_mode;
6748+}
6749+
6750+/*
6751+ * enter write-out mode
6752+ */
6753+static inline void writeout_mode_enable(void)
6754+{
6755+ assert("zam-941", !get_current_context()->writeout_mode);
6756+ get_current_context()->writeout_mode = 1;
6757+}
6758+
6759+/*
6760+ * leave write-out mode
6761+ */
6762+static inline void writeout_mode_disable(void)
6763+{
6764+ assert("zam-942", get_current_context()->writeout_mode);
6765+ get_current_context()->writeout_mode = 0;
6766+}
6767+
6768+static inline void grab_space_enable(void)
6769+{
6770+ get_current_context()->grab_enabled = 1;
6771+}
6772+
6773+static inline void grab_space_disable(void)
6774+{
6775+ get_current_context()->grab_enabled = 0;
6776+}
6777+
6778+static inline void grab_space_set_enabled(int enabled)
6779+{
6780+ get_current_context()->grab_enabled = enabled;
6781+}
6782+
6783+static inline int is_grab_enabled(reiser4_context * ctx)
6784+{
6785+ return ctx->grab_enabled;
6786+}
6787+
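A sketch of how the grab helpers above are meant to bracket a reservation.
reiser4_grab_space() and its flags argument are assumptions here; the
reservation code itself is not part of this hunk:

    /* sketch: enable grabbing only around the reservation itself */
    static int reserve_blocks(reiser4_block_nr estimate)
    {
            int result;

            grab_space_enable();
            result = reiser4_grab_space(estimate, 0 /* flags: assumed */);
            grab_space_disable();
            return result;
    }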
6788+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6789+ * flush would be performed when it is closed. This is necessary when handle
6790+ * has to be closed under some coarse semaphore, like i_mutex of
6791+ * directory. Commit will be performed by ktxnmgrd. */
6792+static inline void context_set_commit_async(reiser4_context * context)
6793+{
6794+ context->nobalance = 1;
6795+ context->trans->flags |= TXNH_DONT_COMMIT;
6796+}
6797+
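A directory operation that has to close its transaction handle while still
holding ->i_mutex would use this roughly as follows (a sketch; the directory
modification itself is elided):

    /* sketch: defer the commit to ktxnmgrd while i_mutex is held */
    static int reiser4_dir_modify(struct inode *dir)
    {
            reiser4_context *ctx;

            ctx = reiser4_init_context(dir->i_sb);
            if (IS_ERR(ctx))
                    return PTR_ERR(ctx);
            /* ... modify the directory under dir->i_mutex ... */
            context_set_commit_async(ctx);
            reiser4_exit_context(ctx);      /* no balancing, no commit here */
            return 0;
    }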
6798+/* __REISER4_CONTEXT_H__ */
6799+#endif
6800+
6801+/* Make Linus happy.
6802+ Local variables:
6803+ c-indentation-style: "K&R"
6804+ mode-name: "LC"
6805+ c-basic-offset: 8
6806+ tab-width: 8
6807+ fill-column: 120
6808+ scroll-step: 1
6809+ End:
6810+*/
6811diff -urN linux-2.6.22.orig/fs/reiser4/coord.c linux-2.6.22/fs/reiser4/coord.c
6812--- linux-2.6.22.orig/fs/reiser4/coord.c 1970-01-01 03:00:00.000000000 +0300
6813+++ linux-2.6.22/fs/reiser4/coord.c 2007-07-29 00:25:34.832685088 +0400
6814@@ -0,0 +1,935 @@
6815+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6816+
6817+#include "forward.h"
6818+#include "debug.h"
6819+#include "dformat.h"
6820+#include "tree.h"
6821+#include "plugin/item/item.h"
6822+#include "znode.h"
6823+#include "coord.h"
6824+
6825+/* Internal constructor. */
6826+static inline void
6827+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6828+ pos_in_node_t unit_pos, between_enum between)
6829+{
6830+ coord->node = (znode *) node;
6831+ coord_set_item_pos(coord, item_pos);
6832+ coord->unit_pos = unit_pos;
6833+ coord->between = between;
6834+ ON_DEBUG(coord->plug_v = 0);
6835+ ON_DEBUG(coord->body_v = 0);
6836+
6837+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6838+}
6839+
6840+/* after node content has been shifted, a coord that was previously set
6841+ properly may become invalid; try to "normalize" it. */
6842+void coord_normalize(coord_t * coord)
6843+{
6844+ znode *node;
6845+
6846+ node = coord->node;
6847+ assert("vs-683", node);
6848+
6849+ coord_clear_iplug(coord);
6850+
6851+ if (node_is_empty(node)) {
6852+ coord_init_first_unit(coord, node);
6853+ } else if ((coord->between == AFTER_ITEM)
6854+ || (coord->between == AFTER_UNIT)) {
6855+ return;
6856+ } else if (coord->item_pos == coord_num_items(coord)
6857+ && coord->between == BEFORE_ITEM) {
6858+ coord_dec_item_pos(coord);
6859+ coord->between = AFTER_ITEM;
6860+ } else if (coord->unit_pos == coord_num_units(coord)
6861+ && coord->between == BEFORE_UNIT) {
6862+ coord->unit_pos--;
6863+ coord->between = AFTER_UNIT;
6864+ } else if (coord->item_pos == coord_num_items(coord)
6865+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6866+ coord_dec_item_pos(coord);
6867+ coord->unit_pos = 0;
6868+ coord->between = AFTER_ITEM;
6869+ }
6870+}
6871+
6872+/* Copy a coordinate. */
6873+void coord_dup(coord_t * coord, const coord_t * old_coord)
6874+{
6875+ assert("jmacd-9800", coord_check(old_coord));
6876+ coord_dup_nocheck(coord, old_coord);
6877+}
6878+
6879+/* Copy a coordinate without check. Useful when old_coord->node is not
6880+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6881+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6882+{
6883+ coord->node = old_coord->node;
6884+ coord_set_item_pos(coord, old_coord->item_pos);
6885+ coord->unit_pos = old_coord->unit_pos;
6886+ coord->between = old_coord->between;
6887+ coord->iplugid = old_coord->iplugid;
6888+ ON_DEBUG(coord->plug_v = old_coord->plug_v);
6889+ ON_DEBUG(coord->body_v = old_coord->body_v);
6890+}
6891+
6892+/* Initialize an invalid coordinate. */
6893+void coord_init_invalid(coord_t * coord, const znode * node)
6894+{
6895+ coord_init_values(coord, node, 0, 0, INVALID_COORD);
6896+}
6897+
6898+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6899+{
6900+ coord_init_values(coord, node, 0, 0, AT_UNIT);
6901+}
6902+
6903+/* Initialize a coordinate to point at the first unit of the first item. If the node is
6904+ empty, it is positioned at the EMPTY_NODE. */
6905+void coord_init_first_unit(coord_t * coord, const znode * node)
6906+{
6907+ int is_empty = node_is_empty(node);
6908+
6909+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6910+
6911+ assert("jmacd-9801", coord_check(coord));
6912+}
6913+
6914+/* Initialize a coordinate to point at the last unit of the last item. If the node is
6915+ empty, it is positioned at the EMPTY_NODE. */
6916+void coord_init_last_unit(coord_t * coord, const znode * node)
6917+{
6918+ int is_empty = node_is_empty(node);
6919+
6920+ coord_init_values(coord, node,
6921+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6922+ (is_empty ? EMPTY_NODE : AT_UNIT));
6923+ if (!is_empty)
6924+ coord->unit_pos = coord_last_unit_pos(coord);
6925+ assert("jmacd-9802", coord_check(coord));
6926+}
6927+
6928+/* Initialize a coordinate to before the first item. If the node is empty, it is
6929+ positioned at the EMPTY_NODE. */
6930+void coord_init_before_first_item(coord_t * coord, const znode * node)
6931+{
6932+ int is_empty = node_is_empty(node);
6933+
6934+ coord_init_values(coord, node, 0, 0,
6935+ (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6936+
6937+ assert("jmacd-9803", coord_check(coord));
6938+}
6939+
6940+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
6941+ at the EMPTY_NODE. */
6942+void coord_init_after_last_item(coord_t * coord, const znode * node)
6943+{
6944+ int is_empty = node_is_empty(node);
6945+
6946+ coord_init_values(coord, node,
6947+ (is_empty ? 0 : node_num_items(node) - 1), 0,
6948+ (is_empty ? EMPTY_NODE : AFTER_ITEM));
6949+
6950+ assert("jmacd-9804", coord_check(coord));
6951+}
6952+
6953+/* Initialize a coordinate to after last unit in the item. Coord must be set
6954+ already to existing item */
6955+void coord_init_after_item_end(coord_t * coord)
6956+{
6957+ coord->between = AFTER_UNIT;
6958+ coord->unit_pos = coord_last_unit_pos(coord);
6959+}
6960+
6961+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
6962+void coord_init_before_item(coord_t * coord)
6963+{
6964+ coord->unit_pos = 0;
6965+ coord->between = BEFORE_ITEM;
6966+}
6967+
6968+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
6969+void coord_init_after_item(coord_t * coord)
6970+{
6971+ coord->unit_pos = 0;
6972+ coord->between = AFTER_ITEM;
6973+}
6974+
6975+/* Initialize a coordinate with zeros. Used in places where init_coord was
6976+ used and it was not clear how it was actually being initialized. */
6977+void coord_init_zero(coord_t * coord)
6978+{
6979+ memset(coord, 0, sizeof(*coord));
6980+}
6981+
6982+/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
6983+unsigned coord_num_units(const coord_t * coord)
6984+{
6985+ assert("jmacd-9806", coord_is_existing_item(coord));
6986+
6987+ return item_plugin_by_coord(coord)->b.nr_units(coord);
6988+}
6989+
6990+/* Returns true if the coord was initialized by coord_init_invalid(). */
6991+/* Audited by: green(2002.06.15) */
6992+int coord_is_invalid(const coord_t * coord)
6993+{
6994+ return coord->between == INVALID_COORD;
6995+}
6996+
6997+/* Returns true if the coordinate is positioned at an existing item, not before or after
6998+ an item. It may be placed at, before, or after any unit within the item, whether
6999+ existing or not. */
7000+int coord_is_existing_item(const coord_t * coord)
7001+{
7002+ switch (coord->between) {
7003+ case EMPTY_NODE:
7004+ case BEFORE_ITEM:
7005+ case AFTER_ITEM:
7006+ case INVALID_COORD:
7007+ return 0;
7008+
7009+ case BEFORE_UNIT:
7010+ case AT_UNIT:
7011+ case AFTER_UNIT:
7012+ return coord->item_pos < coord_num_items(coord);
7013+ }
7014+
7015+ impossible("jmacd-9900", "unreachable coord: %p", coord);
7016+ return 0;
7017+}
7018+
7019+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7020+ unit. */
7021+/* Audited by: green(2002.06.15) */
7022+int coord_is_existing_unit(const coord_t * coord)
7023+{
7024+ switch (coord->between) {
7025+ case EMPTY_NODE:
7026+ case BEFORE_UNIT:
7027+ case AFTER_UNIT:
7028+ case BEFORE_ITEM:
7029+ case AFTER_ITEM:
7030+ case INVALID_COORD:
7031+ return 0;
7032+
7033+ case AT_UNIT:
7034+ return (coord->item_pos < coord_num_items(coord)
7035+ && coord->unit_pos < coord_num_units(coord));
7036+ }
7037+
7038+ impossible("jmacd-9902", "unreachable");
7039+ return 0;
7040+}
7041+
7042+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7043+ true for empty nodes nor coordinates positioned before the first item. */
7044+/* Audited by: green(2002.06.15) */
7045+int coord_is_leftmost_unit(const coord_t * coord)
7046+{
7047+ return (coord->between == AT_UNIT && coord->item_pos == 0
7048+ && coord->unit_pos == 0);
7049+}
7050+
7051+#if REISER4_DEBUG
7052+/* For assertions only, checks for a valid coordinate. */
7053+int coord_check(const coord_t * coord)
7054+{
7055+ if (coord->node == NULL) {
7056+ return 0;
7057+ }
7058+ if (znode_above_root(coord->node))
7059+ return 1;
7060+
7061+ switch (coord->between) {
7062+ default:
7063+ case INVALID_COORD:
7064+ return 0;
7065+ case EMPTY_NODE:
7066+ if (!node_is_empty(coord->node)) {
7067+ return 0;
7068+ }
7069+ return coord->item_pos == 0 && coord->unit_pos == 0;
7070+
7071+ case BEFORE_UNIT:
7072+ case AFTER_UNIT:
7073+ if (node_is_empty(coord->node) && (coord->item_pos == 0)
7074+ && (coord->unit_pos == 0))
7075+ return 1;
7076+ case AT_UNIT:
7077+ break;
7078+ case AFTER_ITEM:
7079+ case BEFORE_ITEM:
7080+ /* before/after item should not set unit_pos. */
7081+ if (coord->unit_pos != 0) {
7082+ return 0;
7083+ }
7084+ break;
7085+ }
7086+
7087+ if (coord->item_pos >= node_num_items(coord->node)) {
7088+ return 0;
7089+ }
7090+
7091+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7092+ between is set either AFTER_ITEM or BEFORE_ITEM */
7093+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7094+ return 1;
7095+
7096+ if (coord_is_iplug_set(coord) &&
7097+ coord->unit_pos >
7098+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7099+ return 0;
7100+ }
7101+ return 1;
7102+}
7103+#endif
7104+
7105+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7106+ Returns 1 if the new position does not exist. */
7107+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7108+{
7109+ /* If the node is invalid, leave it. */
7110+ if (coord->between == INVALID_COORD) {
7111+ return 1;
7112+ }
7113+
7114+ /* If the node is empty, set it appropriately. */
7115+ if (items == 0) {
7116+ coord->between = EMPTY_NODE;
7117+ coord_set_item_pos(coord, 0);
7118+ coord->unit_pos = 0;
7119+ return 1;
7120+ }
7121+
7122+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7123+ if (coord->between == EMPTY_NODE) {
7124+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7125+ coord_set_item_pos(coord, 0);
7126+ coord->unit_pos = 0;
7127+ return 0;
7128+ }
7129+
7130+ /* If the item_pos is out-of-range, set it appropriately. */
7131+ if (coord->item_pos >= items) {
7132+ coord->between = AFTER_ITEM;
7133+ coord_set_item_pos(coord, items - 1);
7134+ coord->unit_pos = 0;
7135+ /* If is_next, return 1 (can't go any further). */
7136+ return is_next;
7137+ }
7138+
7139+ return 0;
7140+}
7141+
7142+/* Advances the coordinate by one unit to the right. If empty, no change. If
7143+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7144+ existing unit. */
7145+int coord_next_unit(coord_t * coord)
7146+{
7147+ unsigned items = coord_num_items(coord);
7148+
7149+ if (coord_adjust_items(coord, items, 1) == 1) {
7150+ return 1;
7151+ }
7152+
7153+ switch (coord->between) {
7154+ case BEFORE_UNIT:
7155+ /* Now it is positioned at the same unit. */
7156+ coord->between = AT_UNIT;
7157+ return 0;
7158+
7159+ case AFTER_UNIT:
7160+ case AT_UNIT:
7161+ /* If it was at or after a unit and there are more units in this item,
7162+ advance to the next one. */
7163+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7164+ coord->unit_pos += 1;
7165+ coord->between = AT_UNIT;
7166+ return 0;
7167+ }
7168+
7169+ /* Otherwise, it is crossing an item boundary and treated as if it was
7170+ after the current item. */
7171+ coord->between = AFTER_ITEM;
7172+ coord->unit_pos = 0;
7173+ /* FALLTHROUGH */
7174+
7175+ case AFTER_ITEM:
7176+ /* Check for end-of-node. */
7177+ if (coord->item_pos == items - 1) {
7178+ return 1;
7179+ }
7180+
7181+ coord_inc_item_pos(coord);
7182+ coord->unit_pos = 0;
7183+ coord->between = AT_UNIT;
7184+ return 0;
7185+
7186+ case BEFORE_ITEM:
7187+ /* The adjust_items checks ensure that we are valid here. */
7188+ coord->unit_pos = 0;
7189+ coord->between = AT_UNIT;
7190+ return 0;
7191+
7192+ case INVALID_COORD:
7193+ case EMPTY_NODE:
7194+ /* Handled in coord_adjust_items(). */
7195+ break;
7196+ }
7197+
7198+ impossible("jmacd-9902", "unreachable");
7199+ return 0;
7200+}
7201+
7202+/* Advances the coordinate by one item to the right. If empty, no change. If
7203+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7204+ an existing item. */
7205+int coord_next_item(coord_t * coord)
7206+{
7207+ unsigned items = coord_num_items(coord);
7208+
7209+ if (coord_adjust_items(coord, items, 1) == 1) {
7210+ return 1;
7211+ }
7212+
7213+ switch (coord->between) {
7214+ case AFTER_UNIT:
7215+ case AT_UNIT:
7216+ case BEFORE_UNIT:
7217+ case AFTER_ITEM:
7218+ /* Check for end-of-node. */
7219+ if (coord->item_pos == items - 1) {
7220+ coord->between = AFTER_ITEM;
7221+ coord->unit_pos = 0;
7222+ coord_clear_iplug(coord);
7223+ return 1;
7224+ }
7225+
7226+ /* Anywhere in an item, go to the next one. */
7227+ coord->between = AT_UNIT;
7228+ coord_inc_item_pos(coord);
7229+ coord->unit_pos = 0;
7230+ return 0;
7231+
7232+ case BEFORE_ITEM:
7233+ /* The out-of-range check ensures that we are valid here. */
7234+ coord->unit_pos = 0;
7235+ coord->between = AT_UNIT;
7236+ return 0;
7237+ case INVALID_COORD:
7238+ case EMPTY_NODE:
7239+ /* Handled in coord_adjust_items(). */
7240+ break;
7241+ }
7242+
7243+ impossible("jmacd-9903", "unreachable");
7244+ return 0;
7245+}
7246+
7247+/* Advances the coordinate by one unit to the left. If empty, no change. If
7248+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7249+ is an existing unit. */
7250+int coord_prev_unit(coord_t * coord)
7251+{
7252+ unsigned items = coord_num_items(coord);
7253+
7254+ if (coord_adjust_items(coord, items, 0) == 1) {
7255+ return 1;
7256+ }
7257+
7258+ switch (coord->between) {
7259+ case AT_UNIT:
7260+ case BEFORE_UNIT:
7261+ if (coord->unit_pos > 0) {
7262+ coord->unit_pos -= 1;
7263+ coord->between = AT_UNIT;
7264+ return 0;
7265+ }
7266+
7267+ if (coord->item_pos == 0) {
7268+ coord->between = BEFORE_ITEM;
7269+ return 1;
7270+ }
7271+
7272+ coord_dec_item_pos(coord);
7273+ coord->unit_pos = coord_last_unit_pos(coord);
7274+ coord->between = AT_UNIT;
7275+ return 0;
7276+
7277+ case AFTER_UNIT:
7278+ /* What if unit_pos is out-of-range? */
7279+ assert("jmacd-5442",
7280+ coord->unit_pos <= coord_last_unit_pos(coord));
7281+ coord->between = AT_UNIT;
7282+ return 0;
7283+
7284+ case BEFORE_ITEM:
7285+ if (coord->item_pos == 0) {
7286+ return 1;
7287+ }
7288+
7289+ coord_dec_item_pos(coord);
7290+ /* FALLTHROUGH */
7291+
7292+ case AFTER_ITEM:
7293+ coord->between = AT_UNIT;
7294+ coord->unit_pos = coord_last_unit_pos(coord);
7295+ return 0;
7296+
7297+ case INVALID_COORD:
7298+ case EMPTY_NODE:
7299+ break;
7300+ }
7301+
7302+ impossible("jmacd-9904", "unreachable");
7303+ return 0;
7304+}
7305+
7306+/* Advances the coordinate by one item to the left. If empty, no change. If
7307+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7308+ is an existing item. */
7309+int coord_prev_item(coord_t * coord)
7310+{
7311+ unsigned items = coord_num_items(coord);
7312+
7313+ if (coord_adjust_items(coord, items, 0) == 1) {
7314+ return 1;
7315+ }
7316+
7317+ switch (coord->between) {
7318+ case AT_UNIT:
7319+ case AFTER_UNIT:
7320+ case BEFORE_UNIT:
7321+ case BEFORE_ITEM:
7322+
7323+ if (coord->item_pos == 0) {
7324+ coord->between = BEFORE_ITEM;
7325+ coord->unit_pos = 0;
7326+ return 1;
7327+ }
7328+
7329+ coord_dec_item_pos(coord);
7330+ coord->unit_pos = 0;
7331+ coord->between = AT_UNIT;
7332+ return 0;
7333+
7334+ case AFTER_ITEM:
7335+ coord->between = AT_UNIT;
7336+ coord->unit_pos = 0;
7337+ return 0;
7338+
7339+ case INVALID_COORD:
7340+ case EMPTY_NODE:
7341+ break;
7342+ }
7343+
7344+ impossible("jmacd-9905", "unreachable");
7345+ return 0;
7346+}
7347+
7348+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7349+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7350+{
7351+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7352+ if (dir == LEFT_SIDE) {
7353+ coord_init_first_unit(coord, node);
7354+ } else {
7355+ coord_init_last_unit(coord, node);
7356+ }
7357+}
7358+
7359+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7360+ argument. */
7361+/* Audited by: green(2002.06.15) */
7362+int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7363+{
7364+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7365+ if (dir == LEFT_SIDE) {
7366+ return coord_is_before_leftmost(coord);
7367+ } else {
7368+ return coord_is_after_rightmost(coord);
7369+ }
7370+}
7371+
7372+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7373+/* Audited by: green(2002.06.15) */
7374+int coord_sideof_unit(coord_t * coord, sideof dir)
7375+{
7376+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7377+ if (dir == LEFT_SIDE) {
7378+ return coord_prev_unit(coord);
7379+ } else {
7380+ return coord_next_unit(coord);
7381+ }
7382+}
7383+
7384+#if REISER4_DEBUG
44254afd
MT
7385+int coords_equal(const coord_t * c1, const coord_t * c2)
7386+{
7387+ assert("nikita-2840", c1 != NULL);
7388+ assert("nikita-2841", c2 != NULL);
7389+
7390+ return
7391+ c1->node == c2->node &&
7392+ c1->item_pos == c2->item_pos &&
7393+ c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7394+}
7395+#endif /* REISER4_DEBUG */
7396+
7397+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7398+ return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7399+/* Audited by: green(2002.06.15) */
7400+coord_wrt_node coord_wrt(const coord_t * coord)
7401+{
7402+ if (coord_is_before_leftmost(coord)) {
7403+ return COORD_ON_THE_LEFT;
7404+ }
7405+
7406+ if (coord_is_after_rightmost(coord)) {
7407+ return COORD_ON_THE_RIGHT;
7408+ }
7409+
7410+ return COORD_INSIDE;
7411+}
7412+
7413+/* Returns true if the coordinate is positioned after the last item or after the last unit
7414+ of the last item or it is an empty node. */
7415+/* Audited by: green(2002.06.15) */
7416+int coord_is_after_rightmost(const coord_t * coord)
7417+{
7418+ assert("jmacd-7313", coord_check(coord));
7419+
7420+ switch (coord->between) {
7421+ case INVALID_COORD:
7422+ case AT_UNIT:
7423+ case BEFORE_UNIT:
7424+ case BEFORE_ITEM:
7425+ return 0;
7426+
7427+ case EMPTY_NODE:
7428+ return 1;
7429+
7430+ case AFTER_ITEM:
7431+ return (coord->item_pos == node_num_items(coord->node) - 1);
7432+
7433+ case AFTER_UNIT:
7434+ return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7435+ coord->unit_pos == coord_last_unit_pos(coord));
7436+ }
7437+
7438+ impossible("jmacd-9908", "unreachable");
7439+ return 0;
7440+}
7441+
7442+/* Returns true if the coordinate is positioned before the first item or it is an empty
7443+ node. */
7444+int coord_is_before_leftmost(const coord_t * coord)
7445+{
7446+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7447+ necessary to check if coord is set before leftmost
7448+ assert ("jmacd-7313", coord_check (coord)); */
7449+ switch (coord->between) {
7450+ case INVALID_COORD:
7451+ case AT_UNIT:
7452+ case AFTER_ITEM:
7453+ case AFTER_UNIT:
7454+ return 0;
7455+
7456+ case EMPTY_NODE:
7457+ return 1;
7458+
7459+ case BEFORE_ITEM:
7460+ case BEFORE_UNIT:
7461+ return (coord->item_pos == 0) && (coord->unit_pos == 0);
7462+ }
7463+
7464+ impossible("jmacd-9908", "unreachable");
7465+ return 0;
7466+}
7467+
7468+/* Returns true if the coordinate is positioned after an item, before an item, after the
7469+ last unit of an item, before the first unit of an item, or at an empty node. */
7470+/* Audited by: green(2002.06.15) */
7471+int coord_is_between_items(const coord_t * coord)
7472+{
7473+ assert("jmacd-7313", coord_check(coord));
7474+
7475+ switch (coord->between) {
7476+ case INVALID_COORD:
7477+ case AT_UNIT:
7478+ return 0;
7479+
7480+ case AFTER_ITEM:
7481+ case BEFORE_ITEM:
7482+ case EMPTY_NODE:
7483+ return 1;
7484+
7485+ case BEFORE_UNIT:
7486+ return coord->unit_pos == 0;
7487+
7488+ case AFTER_UNIT:
7489+ return coord->unit_pos == coord_last_unit_pos(coord);
7490+ }
7491+
7492+ impossible("jmacd-9908", "unreachable");
7493+ return 0;
7494+}
7495+
7496+#if REISER4_DEBUG
7497+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7498+ before-after or item boundaries. */
7499+int coord_are_neighbors(coord_t * c1, coord_t * c2)
7500+{
7501+ coord_t *left;
7502+ coord_t *right;
7503+
7504+ assert("nikita-1241", c1 != NULL);
7505+ assert("nikita-1242", c2 != NULL);
7506+ assert("nikita-1243", c1->node == c2->node);
7507+ assert("nikita-1244", coord_is_existing_unit(c1));
7508+ assert("nikita-1245", coord_is_existing_unit(c2));
7509+
7510+ left = right = NULL;
7511+ switch (coord_compare(c1, c2)) {
7512+ case COORD_CMP_ON_LEFT:
7513+ left = c1;
7514+ right = c2;
7515+ break;
7516+ case COORD_CMP_ON_RIGHT:
7517+ left = c2;
7518+ right = c1;
7519+ break;
7520+ case COORD_CMP_SAME:
7521+ return 0;
7522+ default:
7523+ wrong_return_value("nikita-1246", "compare_coords()");
7524+ }
7525+ assert("vs-731", left && right);
7526+ if (left->item_pos == right->item_pos) {
7527+ return left->unit_pos + 1 == right->unit_pos;
7528+ } else if (left->item_pos + 1 == right->item_pos) {
7529+ return (left->unit_pos == coord_last_unit_pos(left))
7530+ && (right->unit_pos == 0);
7531+ } else {
7532+ return 0;
7533+ }
7534+}
7535+#endif /* REISER4_DEBUG */
7536+
7537+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7538+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7539+/* Audited by: green(2002.06.15) */
7540+coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7541+{
7542+ assert("vs-209", c1->node == c2->node);
7543+ assert("vs-194", coord_is_existing_unit(c1)
7544+ && coord_is_existing_unit(c2));
7545+
7546+ if (c1->item_pos > c2->item_pos)
7547+ return COORD_CMP_ON_RIGHT;
7548+ if (c1->item_pos < c2->item_pos)
7549+ return COORD_CMP_ON_LEFT;
7550+ if (c1->unit_pos > c2->unit_pos)
7551+ return COORD_CMP_ON_RIGHT;
7552+ if (c1->unit_pos < c2->unit_pos)
7553+ return COORD_CMP_ON_LEFT;
7554+ return COORD_CMP_SAME;
7555+}
7556+
7557+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
7558+ non-zero if there is no position to the right. */
7559+int coord_set_to_right(coord_t * coord)
7560+{
7561+ unsigned items = coord_num_items(coord);
7562+
7563+ if (coord_adjust_items(coord, items, 1) == 1) {
7564+ return 1;
7565+ }
7566+
7567+ switch (coord->between) {
7568+ case AT_UNIT:
7569+ return 0;
7570+
7571+ case BEFORE_ITEM:
7572+ case BEFORE_UNIT:
7573+ coord->between = AT_UNIT;
7574+ return 0;
7575+
7576+ case AFTER_UNIT:
7577+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7578+ coord->unit_pos += 1;
7579+ coord->between = AT_UNIT;
7580+ return 0;
7581+ } else {
7582+
7583+ coord->unit_pos = 0;
7584+
7585+ if (coord->item_pos == items - 1) {
7586+ coord->between = AFTER_ITEM;
7587+ return 1;
7588+ }
7589+
7590+ coord_inc_item_pos(coord);
7591+ coord->between = AT_UNIT;
7592+ return 0;
7593+ }
7594+
7595+ case AFTER_ITEM:
7596+ if (coord->item_pos == items - 1) {
7597+ return 1;
7598+ }
7599+
7600+ coord_inc_item_pos(coord);
7601+ coord->unit_pos = 0;
7602+ coord->between = AT_UNIT;
7603+ return 0;
7604+
7605+ case EMPTY_NODE:
7606+ return 1;
7607+
7608+ case INVALID_COORD:
7609+ break;
7610+ }
7611+
7612+ impossible("jmacd-9920", "unreachable");
7613+ return 0;
7614+}
7615+
7616+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
7617+ non-zero if there is no position to the left. */
7618+int coord_set_to_left(coord_t * coord)
7619+{
7620+ unsigned items = coord_num_items(coord);
7621+
7622+ if (coord_adjust_items(coord, items, 0) == 1) {
7623+ return 1;
7624+ }
7625+
7626+ switch (coord->between) {
7627+ case AT_UNIT:
7628+ return 0;
7629+
7630+ case AFTER_UNIT:
7631+ coord->between = AT_UNIT;
7632+ return 0;
7633+
7634+ case AFTER_ITEM:
7635+ coord->between = AT_UNIT;
7636+ coord->unit_pos = coord_last_unit_pos(coord);
7637+ return 0;
7638+
7639+ case BEFORE_UNIT:
7640+ if (coord->unit_pos > 0) {
7641+ coord->unit_pos -= 1;
7642+ coord->between = AT_UNIT;
7643+ return 0;
7644+ } else {
7645+
7646+ if (coord->item_pos == 0) {
7647+ coord->between = BEFORE_ITEM;
7648+ return 1;
7649+ }
7650+
7651+ coord->unit_pos = coord_last_unit_pos(coord);
7652+ coord_dec_item_pos(coord);
7653+ coord->between = AT_UNIT;
7654+ return 0;
7655+ }
7656+
7657+ case BEFORE_ITEM:
7658+ if (coord->item_pos == 0) {
7659+ return 1;
7660+ }
7661+
7662+ coord_dec_item_pos(coord);
7663+ coord->unit_pos = coord_last_unit_pos(coord);
7664+ coord->between = AT_UNIT;
7665+ return 0;
7666+
7667+ case EMPTY_NODE:
7668+ return 1;
7669+
7670+ case INVALID_COORD:
7671+ break;
7672+ }
7673+
7674+ impossible("jmacd-9920", "unreachable");
7675+ return 0;
7676+}
7677+
7678+static const char *coord_tween_tostring(between_enum n)
7679+{
7680+ switch (n) {
7681+ case BEFORE_UNIT:
7682+ return "before unit";
7683+ case BEFORE_ITEM:
7684+ return "before item";
7685+ case AT_UNIT:
7686+ return "at unit";
7687+ case AFTER_UNIT:
7688+ return "after unit";
7689+ case AFTER_ITEM:
7690+ return "after item";
7691+ case EMPTY_NODE:
7692+ return "empty node";
7693+ case INVALID_COORD:
7694+ return "invalid";
7695+ default:
7696+ {
7697+ static char buf[30];
7698+
7699+ sprintf(buf, "unknown: %i", n);
7700+ return buf;
7701+ }
7702+ }
7703+}
7704+
7705+void print_coord(const char *mes, const coord_t * coord, int node)
7706+{
7707+ if (coord == NULL) {
7708+ printk("%s: null\n", mes);
7709+ return;
7710+ }
7711+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7712+ mes, coord->item_pos, coord->unit_pos,
7713+ coord_tween_tostring(coord->between), coord->iplugid);
7714+}
7715+
7716+int
7717+item_utmost_child_real_block(const coord_t * coord, sideof side,
7718+ reiser4_block_nr * blk)
7719+{
7720+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7721+ side,
7722+ blk);
7723+}
7724+
7725+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7726+{
7727+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7728+}
7729+
7730+/* @count bytes of flow @f got written, update correspondingly f->length,
7731+ f->data and f->key */
7732+void move_flow_forward(flow_t * f, unsigned count)
7733+{
7734+ if (f->data)
7735+ f->data += count;
7736+ f->length -= count;
7737+ set_key_offset(&f->key, get_key_offset(&f->key) + count);
7738+}
7739+
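A write path consumes a flow by alternating a store step with
move_flow_forward(). A sketch, with write_chunk() standing in for the real
item-level store (hypothetical):

    /* sketch: slice a flow into stored chunks until it is exhausted */
    static int consume_flow(flow_t *f)
    {
            while (f->length > 0) {
                    int written = write_chunk(f->data, f->length);  /* hypothetical */

                    if (written < 0)
                            return written;
                    move_flow_forward(f, written);
            }
            return 0;
    }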
7740+/*
7741+ Local variables:
7742+ c-indentation-style: "K&R"
7743+ mode-name: "LC"
7744+ c-basic-offset: 8
7745+ tab-width: 8
7746+ fill-column: 120
7747+ scroll-step: 1
7748+ End:
7749+*/
7750diff -urN linux-2.6.22.orig/fs/reiser4/coord.h linux-2.6.22/fs/reiser4/coord.h
7751--- linux-2.6.22.orig/fs/reiser4/coord.h 1970-01-01 03:00:00.000000000 +0300
7752+++ linux-2.6.22/fs/reiser4/coord.h 2007-07-29 00:25:34.832685088 +0400
7753@@ -0,0 +1,389 @@
7754+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7755+
7756+/* Coords */
7757+
7758+#if !defined( __REISER4_COORD_H__ )
7759+#define __REISER4_COORD_H__
7760+
7761+#include "forward.h"
7762+#include "debug.h"
7763+#include "dformat.h"
7764+#include "key.h"
7765+
7766+/* insertions happen between coords in the tree, so we need some means
7767+ of specifying the sense of betweenness. */
7768+typedef enum {
7769+ BEFORE_UNIT, /* Note: init_coord depends on this value being zero. */
7770+ AT_UNIT,
7771+ AFTER_UNIT,
7772+ BEFORE_ITEM,
7773+ AFTER_ITEM,
7774+ INVALID_COORD,
7775+ EMPTY_NODE,
7776+} between_enum;
7777+
7778+/* location of coord w.r.t. its node */
7779+typedef enum {
7780+ COORD_ON_THE_LEFT = -1,
7781+ COORD_ON_THE_RIGHT = +1,
7782+ COORD_INSIDE = 0
7783+} coord_wrt_node;
7784+
7785+typedef enum {
7786+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7787+} coord_cmp;
7788+
7789+struct coord {
7790+ /* node in a tree */
7791+ /* 0 */ znode *node;
7792+
7793+ /* position of item within node */
7794+ /* 4 */ pos_in_node_t item_pos;
7795+ /* position of unit within item */
7796+ /* 6 */ pos_in_node_t unit_pos;
7797+ /* optimization: plugin of item is stored in coord_t. Until this was
7798+ implemented, item_plugin_by_coord() was a major CPU consumer. ->iplugid
7799+ is invalidated (set to 0xff) on each modification of ->item_pos,
7800+ and all such modifications are funneled through coord_*_item_pos()
7801+ functions below.
7802+ */
7803+ /* 8 */ char iplugid;
7804+ /* position of coord w.r.t. to neighboring items and/or units.
7805+ Values are taken from &between_enum above.
7806+ */
7807+ /* 9 */ char between;
7808+ /* padding. It will be added by the compiler anyway to conform to the
7809+ * C language alignment requirements. We keep it here to be on the
7810+ * safe side and to have a clear picture of the memory layout of this
7811+ * structure. */
7812+ /* 10 */ __u16 pad;
7813+ /* 12 */ int offset;
7814+#if REISER4_DEBUG
7815+ unsigned long plug_v;
7816+ unsigned long body_v;
7817+#endif
7818+};
7819+
7820+#define INVALID_PLUGID ((char)((1 << 8) - 1))
7821+#define INVALID_OFFSET -1
7822+
7823+static inline void coord_clear_iplug(coord_t * coord)
7824+{
7825+ assert("nikita-2835", coord != NULL);
7826+ coord->iplugid = INVALID_PLUGID;
7827+ coord->offset = INVALID_OFFSET;
7828+}
7829+
7830+static inline int coord_is_iplug_set(const coord_t * coord)
7831+{
7832+ assert("nikita-2836", coord != NULL);
7833+ return coord->iplugid != INVALID_PLUGID;
7834+}
7835+
7836+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7837+{
7838+ assert("nikita-2478", coord != NULL);
7839+ coord->item_pos = pos;
7840+ coord_clear_iplug(coord);
7841+}
7842+
7843+static inline void coord_dec_item_pos(coord_t * coord)
7844+{
7845+ assert("nikita-2480", coord != NULL);
7846+ --coord->item_pos;
7847+ coord_clear_iplug(coord);
7848+}
7849+
7850+static inline void coord_inc_item_pos(coord_t * coord)
7851+{
7852+ assert("nikita-2481", coord != NULL);
7853+ ++coord->item_pos;
7854+ coord_clear_iplug(coord);
7855+}
7856+
7857+static inline void coord_add_item_pos(coord_t * coord, int delta)
7858+{
7859+ assert("nikita-2482", coord != NULL);
7860+ coord->item_pos += delta;
7861+ coord_clear_iplug(coord);
7862+}
7863+
7864+static inline void coord_invalid_item_pos(coord_t * coord)
7865+{
7866+ assert("nikita-2832", coord != NULL);
7867+ coord->item_pos = (unsigned short)~0;
7868+ coord_clear_iplug(coord);
7869+}
7870+
7871+/* Reverse a direction. */
7872+static inline sideof sideof_reverse(sideof side)
7873+{
7874+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7875+}
7876+
7877+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7878+
7879+ "first" and "last"
7880+ "next" and "prev"
7881+ "before" and "after"
7882+ "leftmost" and "rightmost"
7883+
7884+ But I think the chosen names are decent the way they are.
7885+*/
7886+
7887+/* COORD INITIALIZERS */
7888+
7889+/* Initialize an invalid coordinate. */
7890+extern void coord_init_invalid(coord_t * coord, const znode * node);
7891+
7892+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7893+
7894+/* Initialize a coordinate to point at the first unit of the first item. If the node is
7895+ empty, it is positioned at the EMPTY_NODE. */
7896+extern void coord_init_first_unit(coord_t * coord, const znode * node);
7897+
7898+/* Initialize a coordinate to point at the last unit of the last item. If the node is
7899+ empty, it is positioned at the EMPTY_NODE. */
7900+extern void coord_init_last_unit(coord_t * coord, const znode * node);
7901+
7902+/* Initialize a coordinate to before the first item. If the node is empty, it is
7903+ positioned at the EMPTY_NODE. */
7904+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7905+
7906+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7907+ at the EMPTY_NODE. */
7908+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7909+
7910+/* Initialize a coordinate to after last unit in the item. Coord must be set
7911+ already to existing item */
7912+void coord_init_after_item_end(coord_t * coord);
7913+
7914+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7915+void coord_init_before_item(coord_t *);
7916+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7917+void coord_init_after_item(coord_t *);
7918+
7919+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7920+extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7921+ sideof dir);
7922+
7923+/* Initialize a coordinate with zeros. Used in places where init_coord was
7924+ used and it was not clear how it was actually being initialized.
7925+ FIXME-VS: added by vs (2002, june, 8) */
7926+extern void coord_init_zero(coord_t * coord);
7927+
7928+/* COORD METHODS */
7929+
7930+/* after node content has been shifted, a coord that was previously set
7931+ properly may become invalid; try to "normalize" it. */
7932+void coord_normalize(coord_t * coord);
7933+
7934+/* Copy a coordinate. */
7935+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7936+
7937+/* Copy a coordinate without check. */
7938+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7939+
7940+unsigned coord_num_units(const coord_t * coord);
7941+
7942+/* Return the last valid unit number at the present item (i.e.,
7943+ coord_num_units() - 1). */
7944+static inline unsigned coord_last_unit_pos(const coord_t * coord)
7945+{
7946+ return coord_num_units(coord) - 1;
7947+}
7948+
7949+#if REISER4_DEBUG
7950+/* For assertions only, checks for a valid coordinate. */
7951+extern int coord_check(const coord_t * coord);
7952+
7953+extern unsigned long znode_times_locked(const znode * z);
7954+
7955+static inline void coord_update_v(coord_t * coord)
7956+{
7957+ coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7958+}
7959+#endif
7960+
7961+extern int coords_equal(const coord_t * c1, const coord_t * c2);
7962+
7963+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
7964+
7965+/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
7966+ return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
7967+extern coord_wrt_node coord_wrt(const coord_t * coord);
7968+
7969+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7970+ before-after or item boundaries. */
7971+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
7972+
7973+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7974+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
7975+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
7976+
7977+/* COORD PREDICATES */
7978+
7979+/* Returns true if the coord was initialized by coord_init_invalid(). */
7980+extern int coord_is_invalid(const coord_t * coord);
7981+
7982+/* Returns true if the coordinate is positioned at an existing item, not before or after
7983+ an item. It may be placed at, before, or after any unit within the item, whether
7984+ existing or not. If this is true you can call methods of the item plugin. */
7985+extern int coord_is_existing_item(const coord_t * coord);
7986+
7987+/* Returns true if the coordinate is positioned after an item, before an item, after the
7988+ last unit of an item, before the first unit of an item, or at an empty node. */
7989+extern int coord_is_between_items(const coord_t * coord);
7990+
7991+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7992+ unit. */
7993+extern int coord_is_existing_unit(const coord_t * coord);
7994+
7995+/* Returns true if the coordinate is positioned at an empty node. */
7996+extern int coord_is_empty(const coord_t * coord);
7997+
7998+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7999+ true for empty nodes nor coordinates positioned before the first item. */
8000+extern int coord_is_leftmost_unit(const coord_t * coord);
8001+
8002+/* Returns true if the coordinate is positioned after the last item or after the last unit
8003+ of the last item or it is an empty node. */
8004+extern int coord_is_after_rightmost(const coord_t * coord);
8005+
8006+/* Returns true if the coordinate is positioned before the first item or it is an empty
8007+ node. */
8008+extern int coord_is_before_leftmost(const coord_t * coord);
8009+
8010+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8011+ argument. */
8012+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8013+
8014+/* COORD MODIFIERS */
8015+
8016+/* Advances the coordinate by one unit to the right. If empty, no change. If
8017+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8018+ an existing unit. */
8019+extern int coord_next_unit(coord_t * coord);
8020+
8021+/* Advances the coordinate by one item to the right. If empty, no change. If
8022+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8023+ an existing item. */
8024+extern int coord_next_item(coord_t * coord);
8025+
8026+/* Advances the coordinate by one unit to the left. If empty, no change. If
8027+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8028+ is an existing unit. */
8029+extern int coord_prev_unit(coord_t * coord);
8030+
8031+/* Advances the coordinate by one item to the left. If empty, no change. If
8032+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8033+ is an existing item. */
8034+extern int coord_prev_item(coord_t * coord);
8035+
8036+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8037+ non-zero if there is no position to the right. */
8038+extern int coord_set_to_right(coord_t * coord);
8039+
8040+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8041+ non-zero if there is no position to the left. */
8042+extern int coord_set_to_left(coord_t * coord);
8043+
8044+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8045+ and non-zero if the unit did not exist. */
8046+extern int coord_set_after_unit(coord_t * coord);
8047+
8048+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8049+extern int coord_sideof_unit(coord_t * coord, sideof dir);
8050+
8051+/* iterate over all units in @node */
8052+#define for_all_units( coord, node ) \
8053+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8054+ coord_next_unit( coord ) == 0 ; )
8055+
8056+/* iterate over all items in @node */
8057+#define for_all_items( coord, node ) \
8058+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8059+ coord_next_item( coord ) == 0 ; )
8060+
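The iterators above expand to an init-plus-advance loop; a usage sketch
(count_units_in is a hypothetical caller):

    /* sketch: count every unit stored in @node */
    static int count_units_in(znode *node)
    {
            coord_t coord;
            int units = 0;

            for_all_units(&coord, node)
                    units++;
            return units;
    }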
8061+/* COORD/ITEM METHODS */
8062+
8063+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8064+ reiser4_block_nr * blk);
8065+extern int item_utmost_child(const coord_t * coord, sideof side,
8066+ jnode ** child);
8067+
8068+/* a flow is a sequence of bytes being written to or read from the tree. The
8069+ tree will slice the flow into items while storing it into nodes, but all of
8070+ that is hidden from anything outside the tree. */
8071+
8072+struct flow {
8073+ reiser4_key key; /* key of start of flow's sequence of bytes */
8074+ loff_t length; /* length of flow's sequence of bytes */
8075+ char *data; /* start of flow's sequence of bytes */
8076+ int user; /* if 1 data is user space, 0 - kernel space */
8077+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8078+};
8079+
8080+void move_flow_forward(flow_t * f, unsigned count);
8081+
8082+/* &reiser4_item_data - description of data to be inserted or pasted
8083+
8084+ Q: articulate the reasons for the difference between this and flow.
8085+
8086+ A: Besides flows, we insert other things into the tree: stat data,
8087+ directory entries, etc. To insert them into the tree one has to provide
8088+ this structure. If one is going to insert a flow, one can use insert_flow,
8089+ where this structure does not have to be created
8090+*/
8091+struct reiser4_item_data {
8092+ /* actual data to be inserted. If NULL, ->create_item() will not
8093+ do xmemcpy itself, leaving this up to the caller. This can
8094+ save some amount of unnecessary memory copying, for example,
8095+ during insertion of stat data.
8096+
8097+ */
8098+ char *data;
8099+ /* 1 if 'char * data' contains pointer to user space and 0 if it is
8100+ kernel space */
8101+ int user;
8102+ /* amount of data we are going to insert or paste */
8103+ int length;
8104+ /* "Arg" is opaque data that is passed down to the
8105+ ->create_item() method of node layout, which in turn
8106+ hands it to the ->create_hook() of item being created. This
8107+ arg is currently used by:
8108+
8109+ . ->create_hook() of internal item
8110+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8111+ . ->paste() method of directory item.
8112+ . ->create_hook() of extent item
8113+
8114+ For an internal item, this is the left "brother" of the new node
8115+ being inserted; it is used to add the new node into the sibling
8116+ list after the pointer to it has been inserted into the parent.
8117+
8118+ While ->arg does look like a somewhat unnecessary complication,
8119+ it actually saves a lot of headache in many places, because
8120+ all data necessary to insert or paste new data into tree are
8121+ collected in one place, and this eliminates a lot of extra
8122+ argument passing and storing everywhere.
8123+
8124+ */
8125+ void *arg;
8126+ /* plugin of item we are inserting */
8127+ item_plugin *iplug;
8128+};
8129+
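Filling the structure for a simple kernel-space insertion looks roughly like
the sketch below; the item plugin is taken as given, since plugin lookup is
outside this hunk:

    /* sketch: describe @len bytes at @buf for insertion into the tree */
    static void fill_item_data(struct reiser4_item_data *data, char *buf,
                               int len, item_plugin *iplug)
    {
            data->data = buf;
            data->user = 0;         /* kernel space */
            data->length = len;
            data->arg = NULL;       /* no ->create_hook() argument */
            data->iplug = iplug;
    }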
8130+/* __REISER4_COORD_H__ */
8131+#endif
8132+
8133+/* Make Linus happy.
8134+ Local variables:
8135+ c-indentation-style: "K&R"
8136+ mode-name: "LC"
8137+ c-basic-offset: 8
8138+ tab-width: 8
8139+ fill-column: 120
8140+ scroll-step: 1
8141+ End:
8142+*/
8143diff -urN linux-2.6.22.orig/fs/reiser4/debug.c linux-2.6.22/fs/reiser4/debug.c
8144--- linux-2.6.22.orig/fs/reiser4/debug.c 1970-01-01 03:00:00.000000000 +0300
8145+++ linux-2.6.22/fs/reiser4/debug.c 2007-07-29 00:25:34.836686123 +0400
8146@@ -0,0 +1,308 @@
8147+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8148+ * reiser4/README */
8149+
8150+/* Debugging facilities. */
8151+
8152+/*
8153+ * This file contains generic debugging functions used by reiser4. Roughly
8154+ * the following:
8155+ *
8156+ * panicking: reiser4_do_panic(), reiser4_print_prefix().
8157+ *
8158+ * locking:
8159+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8160+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8161+ *
8162+ * error code monitoring (see comment before RETERR macro):
8163+ * reiser4_return_err(), reiser4_report_err().
8164+ *
8165+ * stack back-tracing: fill_backtrace()
8166+ *
8167+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8168+ * reiser4_debugtrap().
8169+ *
8170+ */
8171+
8172+#include "reiser4.h"
8173+#include "context.h"
8174+#include "super.h"
8175+#include "txnmgr.h"
8176+#include "znode.h"
8177+
8178+#include <linux/sysfs.h>
8179+#include <linux/slab.h>
8180+#include <linux/types.h>
8181+#include <linux/fs.h>
8182+#include <linux/spinlock.h>
8183+#include <linux/kallsyms.h>
8184+#include <linux/vmalloc.h>
8185+#include <linux/ctype.h>
8186+#include <linux/sysctl.h>
8187+#include <linux/hardirq.h>
8188+
8189+#if 0
8190+#if REISER4_DEBUG
8191+static void reiser4_report_err(void);
8192+#else
8193+#define reiser4_report_err() noop
8194+#endif
8195+#endif /* 0 */
8196+
8197+/*
8198+ * global buffer where message given to reiser4_panic is formatted.
8199+ */
8200+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8201+
8202+/*
8203+ * lock protecting consistency of panic_buf under concurrent panics
8204+ */
8205+static DEFINE_SPINLOCK(panic_guard);
8206+
8207+/* Your best friend. Call it on each occasion. This is called by
8208+ fs/reiser4/debug.h:reiser4_panic(). */
8209+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8210+{
8211+ static int in_panic = 0;
8212+ va_list args;
8213+
8214+ /*
8215+ * check for recursive panic.
8216+ */
8217+ if (in_panic == 0) {
8218+ in_panic = 1;
8219+
8220+ spin_lock(&panic_guard);
8221+ va_start(args, format);
8222+ vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8223+ va_end(args);
8224+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8225+ spin_unlock(&panic_guard);
8226+
8227+ /*
8228+ * if kernel debugger is configured---drop in. Early dropping
8229+ * into kgdb is not always convenient, because panic message
8230+ * is not yet printed most of the time. But:
8231+ *
8232+ * (1) message can be extracted from printk_buf[]
8233+ * (declared static inside of printk()), and
8234+ *
8235+ * (2) sometimes serial/kgdb combo dies while printing
8236+ * long panic message, so it's more prudent to break into
8237+ * debugger earlier.
8238+ *
8239+ */
8240+ DEBUGON(1);
8241+ }
8242+ /* to make gcc happy about noreturn attribute */
8243+ panic("%s", panic_buf);
8244+}
8245+
8246+#if 0
8247+void
8248+reiser4_print_prefix(const char *level, int reperr, const char *mid,
8249+ const char *function, const char *file, int lineno)
8250+{
8251+ const char *comm;
8252+ int pid;
8253+
8254+ if (unlikely(in_interrupt() || in_irq())) {
8255+ comm = "interrupt";
8256+ pid = 0;
8257+ } else {
8258+ comm = current->comm;
8259+ pid = current->pid;
8260+ }
8261+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8262+ level, comm, pid, function, file, lineno, mid);
8263+ if (reperr)
8264+ reiser4_report_err();
8265+}
8266+#endif /* 0 */
8267+
8268+/* Preemption point: this should be called periodically during long running
8269+ operations (carry, allocate, and squeeze are best examples) */
8270+int reiser4_preempt_point(void)
8271+{
8272+ assert("nikita-3008", reiser4_schedulable());
8273+ cond_resched();
8274+ return signal_pending(current);
8275+}
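A hedged sketch of the call pattern described above (the loop and its helpers are hypothetical):

	/* illustrative only: long-running operations poll for pending signals */
	while (have_more_work()) {		/* hypothetical condition */
		do_one_step();			/* hypothetical unit of work */
		if (reiser4_preempt_point())
			return RETERR(-EINTR);	/* a signal is pending */
	}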
8276+
8277+#if REISER4_DEBUG
8278+/* Debugging aid: return struct where information about locks taken by current
8279+ thread is accumulated. This can be used to formulate lock ordering
8280+ constraints and various assertions.
8281+
8282+*/
8283+reiser4_lock_cnt_info *reiser4_lock_counters(void)
8284+{
8285+ reiser4_context *ctx = get_current_context();
8286+ assert("jmacd-1123", ctx != NULL);
8287+ return &ctx->locks;
8288+}
8289+
8290+/*
8291+ * print human readable information about locks held by the reiser4 context.
8292+ */
8293+static void print_lock_counters(const char *prefix,
8294+ const reiser4_lock_cnt_info * info)
8295+{
8296+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8297+ "jload: %i, "
8298+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8299+ "ktxnmgrd: %i, fq: %i\n"
8300+ "inode: %i, "
8301+ "cbk_cache: %i (r:%i,w%i), "
8302+ "eflush: %i, "
8303+ "zlock: %i,\n"
8304+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8305+ "d: %i, x: %i, t: %i\n", prefix,
8306+ info->spin_locked_jnode,
8307+ info->rw_locked_tree, info->read_locked_tree,
8308+ info->write_locked_tree,
8309+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8310+ info->spin_locked_jload,
8311+ info->spin_locked_txnh,
8312+ info->spin_locked_atom, info->spin_locked_stack,
8313+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8314+ info->spin_locked_fq,
8315+ info->spin_locked_inode,
8316+ info->rw_locked_cbk_cache,
8317+ info->read_locked_cbk_cache,
8318+ info->write_locked_cbk_cache,
8319+ info->spin_locked_super_eflush,
8320+ info->spin_locked_zlock,
8321+ info->spin_locked,
8322+ info->long_term_locked_znode,
8323+ info->inode_sem_r, info->inode_sem_w,
8324+ info->d_refs, info->x_refs, info->t_refs);
8325+}
8326+
8327+/* check that no spinlocks are held */
8328+int reiser4_schedulable(void)
8329+{
8330+ if (get_current_context_check() != NULL) {
8331+ if (!LOCK_CNT_NIL(spin_locked)) {
8332+ print_lock_counters("in atomic", reiser4_lock_counters());
8333+ return 0;
8334+ }
8335+ }
8336+ might_sleep();
8337+ return 1;
8338+}
8339+/*
8340+ * return true, iff no locks are held.
8341+ */
8342+int reiser4_no_counters_are_held(void)
8343+{
8344+ reiser4_lock_cnt_info *counters;
8345+
8346+ counters = reiser4_lock_counters();
8347+ return
8348+ (counters->spin_locked_zlock == 0) &&
8349+ (counters->spin_locked_jnode == 0) &&
8350+ (counters->rw_locked_tree == 0) &&
8351+ (counters->read_locked_tree == 0) &&
8352+ (counters->write_locked_tree == 0) &&
8353+ (counters->rw_locked_dk == 0) &&
8354+ (counters->read_locked_dk == 0) &&
8355+ (counters->write_locked_dk == 0) &&
8356+ (counters->spin_locked_txnh == 0) &&
8357+ (counters->spin_locked_atom == 0) &&
8358+ (counters->spin_locked_stack == 0) &&
8359+ (counters->spin_locked_txnmgr == 0) &&
8360+ (counters->spin_locked_inode == 0) &&
8361+ (counters->spin_locked == 0) &&
8362+ (counters->long_term_locked_znode == 0) &&
8363+ (counters->inode_sem_r == 0) &&
8364+ (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8365+}
8366+
8367+/*
8368+ * return true, iff transaction commit can be done under locks held by the
8369+ * current thread.
8370+ */
8371+int reiser4_commit_check_locks(void)
8372+{
8373+ reiser4_lock_cnt_info *counters;
8374+ int inode_sem_r;
8375+ int inode_sem_w;
8376+ int result;
8377+
8378+ /*
8379+ * inode's read/write semaphore is the only reiser4 lock that can be
8380+ * held during commit.
8381+ */
8382+
8383+ counters = reiser4_lock_counters();
8384+ inode_sem_r = counters->inode_sem_r;
8385+ inode_sem_w = counters->inode_sem_w;
8386+
8387+ counters->inode_sem_r = counters->inode_sem_w = 0;
8388+ result = reiser4_no_counters_are_held();
8389+ counters->inode_sem_r = inode_sem_r;
8390+ counters->inode_sem_w = inode_sem_w;
8391+ return result;
8392+}
8393+
8394+/*
8395+ * fill "error site" in the current reiser4 context. See comment before RETERR
8396+ * macro for more details.
8397+ */
8398+void reiser4_return_err(int code, const char *file, int line)
8399+{
8400+ if (code < 0 && is_in_reiser4_context()) {
8401+ reiser4_context *ctx = get_current_context();
8402+
8403+ if (ctx != NULL) {
8404+ ctx->err.code = code;
8405+ ctx->err.file = file;
8406+ ctx->err.line = line;
8407+ }
8408+ }
8409+}
8410+
8411+#if 0
8412+/*
8413+ * report error information recorded by reiser4_return_err().
8414+ */
8415+static void reiser4_report_err(void)
8416+{
8417+ reiser4_context *ctx = get_current_context_check();
8418+
8419+ if (ctx != NULL) {
8420+ if (ctx->err.code != 0) {
8421+ printk("code: %i at %s:%i\n",
8422+ ctx->err.code, ctx->err.file, ctx->err.line);
8423+ }
8424+ }
8425+}
8426+#endif /* 0 */
8427+
8428+#endif /* REISER4_DEBUG */
8429+
8430+#if KERNEL_DEBUGGER
8431+
8432+/*
8433+ * this functions just drops into kernel debugger. It is a convenient place to
8434+ * put breakpoint in.
8435+ */
8436+void reiser4_debugtrap(void)
8437+{
8438+ /* do nothing. Put break point here. */
8439+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8440+ extern void breakpoint(void);
8441+ breakpoint();
8442+#endif
8443+}
8444+#endif
8445+
8446+/* Make Linus happy.
8447+ Local variables:
8448+ c-indentation-style: "K&R"
8449+ mode-name: "LC"
8450+ c-basic-offset: 8
8451+ tab-width: 8
8452+ fill-column: 120
8453+ End:
8454+*/
8455diff -urN linux-2.6.22.orig/fs/reiser4/debug.h linux-2.6.22/fs/reiser4/debug.h
8456--- linux-2.6.22.orig/fs/reiser4/debug.h 1970-01-01 03:00:00.000000000 +0300
8457+++ linux-2.6.22/fs/reiser4/debug.h 2007-07-29 00:25:34.836686123 +0400
8458@@ -0,0 +1,350 @@
8459+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8460+
8461+/* Declarations of debug macros. */
8462+
8463+#if !defined( __FS_REISER4_DEBUG_H__ )
8464+#define __FS_REISER4_DEBUG_H__
8465+
8466+#include "forward.h"
8467+#include "reiser4.h"
8468+
8469+/* generic function to produce formatted output, decorating it with
8470+ whatever standard prefixes/postfixes we want. "Fun" is a function
8471+ that will be actually called, can be printk, panic etc.
8472+ This is for use by other debugging macros, not by users. */
8473+#define DCALL(lev, fun, reperr, label, format, ...) \
8474+({ \
8475+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8476+ current->comm, current->pid, __FUNCTION__, \
8477+ __FILE__, __LINE__, label, ## __VA_ARGS__); \
8478+})
8479+
8480+/*
8481+ * cause kernel to crash
8482+ */
8483+#define reiser4_panic(mid, format, ...) \
8484+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8485+
8486+/* print message with indication of current process, file, line and
8487+ function */
8488+#define reiser4_log(label, format, ...) \
8489+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8490+
8491+/* Assertion checked during compilation.
8492+ If "cond" is false (0) we get a duplicate case label in the switch.
8493+ Use this to check something like the famous
8494+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8495+ in 3.x journal.c. If the cassertion fails you get a compiler error,
8496+ so no "maintainer-id".
8497+*/
8498+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
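A quick usage sketch, checking a fixed-layout type from dformat.h later in this patch:

	/* compile-time check: an on-disk d16 must be exactly two bytes */
	cassert(sizeof(d16) == 2);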
8499+
8500+#define noop do {;} while(0)
8501+
8502+#if REISER4_DEBUG
8503+/* version of info that only actually prints anything when _d_ebugging
8504+ is on */
8505+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8506+/* macro to catch logical errors. Put it into `default' clause of
8507+ switch() statement. */
8508+#define impossible(label, format, ...) \
8509+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8510+/* assert assures that @cond is true. If it is not, reiser4_panic() is
8511+ called. Use this for checking logical consistency and _never_ call
8512+ this to check correctness of external data: disk blocks and user input. */
8513+#define assert(label, cond) \
8514+({ \
8515+ /* call_on_each_assert(); */ \
8516+ if (cond) { \
8517+ /* put negated check to avoid using !(cond) that would lose \
8518+ * warnings for things like assert(a = b); */ \
8519+ ; \
8520+ } else { \
8521+ DEBUGON(1); \
8522+ reiser4_panic(label, "assertion failed: %s", #cond); \
8523+ } \
8524+})
8525+
8526+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8527+#define check_me( label, expr ) assert( label, ( expr ) )
8528+
8529+#define ON_DEBUG( exp ) exp
8530+
8531+extern int reiser4_schedulable(void);
8532+extern void call_on_each_assert(void);
8533+
8534+#else
8535+
8536+#define dinfo( format, args... ) noop
8537+#define impossible( label, format, args... ) noop
8538+#define assert( label, cond ) noop
8539+#define check_me( label, expr ) ( ( void ) ( expr ) )
8540+#define ON_DEBUG( exp )
8541+#define reiser4_schedulable() might_sleep()
8542+
8543+/* REISER4_DEBUG */
8544+#endif
8545+
8546+#if REISER4_DEBUG
8547+/* per-thread information about lock acquired by this thread. Used by lock
8548+ * ordering checking in spin_macros.h */
8549+typedef struct reiser4_lock_cnt_info {
8550+ int rw_locked_tree;
8551+ int read_locked_tree;
8552+ int write_locked_tree;
8553+
8554+ int rw_locked_dk;
8555+ int read_locked_dk;
8556+ int write_locked_dk;
8557+
8558+ int rw_locked_cbk_cache;
8559+ int read_locked_cbk_cache;
8560+ int write_locked_cbk_cache;
8561+
8562+ int spin_locked_zlock;
8563+ int spin_locked_jnode;
8564+ int spin_locked_jload;
8565+ int spin_locked_txnh;
8566+ int spin_locked_atom;
8567+ int spin_locked_stack;
8568+ int spin_locked_txnmgr;
8569+ int spin_locked_ktxnmgrd;
8570+ int spin_locked_fq;
8571+ int spin_locked_inode;
8572+ int spin_locked_super_eflush;
8573+ int spin_locked;
8574+ int long_term_locked_znode;
8575+
8576+ int inode_sem_r;
8577+ int inode_sem_w;
8578+
8579+ int d_refs;
8580+ int x_refs;
8581+ int t_refs;
8582+} reiser4_lock_cnt_info;
8583+
8584+extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8585+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8586+
8587+/* increment lock-counter @counter, if present */
8588+#define LOCK_CNT_INC(counter) \
8589+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8590+
8591+/* decrement lock-counter @counter, if present */
8592+#define LOCK_CNT_DEC(counter) \
8593+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8594+
8595+/* check that lock-counter is zero. This is for use in assertions */
8596+#define LOCK_CNT_NIL(counter) \
8597+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8598+
8599+/* check that lock-counter is greater than zero. This is for use in
8600+ * assertions */
8601+#define LOCK_CNT_GTZ(counter) \
8602+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8603+#define LOCK_CNT_LT(counter,n) \
8604+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
8605+
8606+#else /* REISER4_DEBUG */
8607+
8608+/* no-op versions on the above */
8609+
8610+typedef struct reiser4_lock_cnt_info {
8611+} reiser4_lock_cnt_info;
8612+
8613+#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8614+#define LOCK_CNT_INC(counter) noop
8615+#define LOCK_CNT_DEC(counter) noop
8616+#define LOCK_CNT_NIL(counter) (1)
8617+#define LOCK_CNT_GTZ(counter) (1)
8618+#define LOCK_CNT_LT(counter,n) (1)
8619+
8620+#endif /* REISER4_DEBUG */
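These counters are meant to be consumed by assertions; a minimal sketch of typical checks (the "doc-*" labels are hypothetical):

	/* illustrative only: no spinlocks held, at most one long-term lock */
	assert("doc-1", LOCK_CNT_NIL(spin_locked));
	assert("doc-2", LOCK_CNT_LT(long_term_locked_znode, 2));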
8621+
8622+#define assert_spin_not_locked(lock) BUG_ON(0)
8623+#define assert_rw_write_locked(lock) BUG_ON(0)
8624+#define assert_rw_read_locked(lock) BUG_ON(0)
8625+#define assert_rw_locked(lock) BUG_ON(0)
8626+#define assert_rw_not_write_locked(lock) BUG_ON(0)
8627+#define assert_rw_not_read_locked(lock) BUG_ON(0)
8628+#define assert_rw_not_locked(lock) BUG_ON(0)
8629+
8630+/* flags controlling debugging behavior. Are set through debug_flags=N mount
8631+ option. */
8632+typedef enum {
8633+ /* print a lot of information during panic. When this is on all jnodes
8634+ * are listed. This can be *very* large output. Usually you don't want
8635+ * this. Especially over serial line. */
8636+ REISER4_VERBOSE_PANIC = 0x00000001,
8637+ /* print a lot of information during umount */
8638+ REISER4_VERBOSE_UMOUNT = 0x00000002,
8639+ /* print gathered statistics on umount */
8640+ REISER4_STATS_ON_UMOUNT = 0x00000004,
8641+ /* check node consistency */
8642+ REISER4_CHECK_NODE = 0x00000008
8643+} reiser4_debug_flags;
8644+
8645+extern int is_in_reiser4_context(void);
8646+
8647+/*
8648+ * evaluate expression @e only if with reiser4 context
8649+ */
8650+#define ON_CONTEXT(e) do { \
8651+ if(is_in_reiser4_context()) { \
8652+ e; \
8653+ } } while(0)
8654+
8655+/*
8656+ * evaluate expression @e only when within reiser4_context and debugging is
8657+ * on.
8658+ */
8659+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8660+
8661+/*
8662+ * complain about unexpected function result and crash. Used in "default"
8663+ * branches of switch statements and alike to assert that invalid results are
8664+ * not silently ignored.
8665+ */
8666+#define wrong_return_value( label, function ) \
8667+ impossible( label, "wrong return value from " function )
8668+
8669+/* Issue different types of reiser4 messages to the console */
8670+#define warning( label, format, ... ) \
8671+ DCALL( KERN_WARNING, \
8672+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8673+#define notice( label, format, ... ) \
8674+ DCALL( KERN_NOTICE, \
8675+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8676+
8677+/* mark not yet implemented functionality */
8678+#define not_yet( label, format, ... ) \
8679+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8680+
8681+extern void reiser4_do_panic(const char *format, ...)
8682+ __attribute__ ((noreturn, format(printf, 1, 2)));
8683+
8684+extern int reiser4_preempt_point(void);
8685+extern void reiser4_print_stats(void);
8686+
8687+#if REISER4_DEBUG
8688+extern int reiser4_no_counters_are_held(void);
8689+extern int reiser4_commit_check_locks(void);
8690+#else
8691+#define reiser4_no_counters_are_held() (1)
8692+#define reiser4_commit_check_locks() (1)
8693+#endif
8694+
8695+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8696+#define IS_POW(i) \
8697+({ \
8698+ typeof(i) __i; \
8699+ \
8700+ __i = (i); \
8701+ !(__i & (__i - 1)); \
8702+})
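For instance, IS_POW can rate-limit a warning to the 1st, 2nd, 4th, 8th... occurrence (a sketch; the failure counter is hypothetical):

	/* illustrative only: complain with exponential back-off */
	if (IS_POW(++failures))
		warning("doc-3", "still failing after %i attempts", failures);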
8703+
8704+#define KERNEL_DEBUGGER (1)
8705+
8706+#if KERNEL_DEBUGGER
8707+
8708+extern void reiser4_debugtrap(void);
8709+
8710+/*
8711+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8712+ * kgdb is not compiled in, do nothing.
8713+ */
8714+#define DEBUGON(cond) \
8715+({ \
8716+ if (unlikely(cond)) \
8717+ reiser4_debugtrap(); \
8718+})
8719+#else
8720+#define DEBUGON(cond) noop
8721+#endif
8722+
8723+/*
8724+ * Error code tracing facility. (Idea is borrowed from XFS code.)
8725+ *
8726+ * Suppose some strange and/or unexpected code is returned from some function
8727+ * (for example, write(2) returns -EEXIST). It is possible to place a
8728+ * breakpoint in the reiser4_write(), but it is too late here. How to find out
8729+ * in what particular place -EEXIST was generated first?
8730+ *
8731+ * In reiser4 all places where actual error codes are produced (that is,
8732+ * statements of the form
8733+ *
8734+ * return -EFOO; // (1), or
8735+ *
8736+ * result = -EFOO; // (2)
8737+ *
8738+ * are replaced with
8739+ *
8740+ * return RETERR(-EFOO); // (1a), and
8741+ *
8742+ * result = RETERR(-EFOO); // (2a) respectively
8743+ *
8744+ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
8745+ * printed in error and warning messages. Moreover, it's possible to put a
8746+ * conditional breakpoint in reiser4_return_err (low-level function called
8747+ * by RETERR() to do the actual work) to break into debugger immediately
8748+ * when particular error happens.
8749+ *
8750+ */
8751+
8752+#if REISER4_DEBUG
8753+
8754+/*
8755+ * data-type to store information about where error happened ("error site").
8756+ */
8757+typedef struct err_site {
8758+ int code; /* error code */
8759+ const char *file; /* source file, filled by __FILE__ */
8760+ int line; /* source file line, filled by __LINE__ */
8761+} err_site;
8762+
8763+extern void reiser4_return_err(int code, const char *file, int line);
8764+
8765+/*
8766+ * fill &get_current_context()->err_site with error information.
8767+ */
8768+#define RETERR(code) \
8769+({ \
8770+ typeof(code) __code; \
8771+ \
8772+ __code = (code); \
8773+ reiser4_return_err(__code, __FILE__, __LINE__); \
8774+ __code; \
8775+})
8776+
8777+#else
8778+
8779+/*
8780+ * no-op versions of the above
8781+ */
8782+
8783+typedef struct err_site {
8784+} err_site;
8785+#define RETERR(code) code
8786+#endif
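A short sketch of the convention in action (the function and the error condition are hypothetical; reiser4_master_sb is declared in dformat.h below):

	/* illustrative only: record the error site where -EINVAL originates */
	static int check_magic_sketch(const reiser4_master_sb *msb)
	{
		if (memcmp(msb->magic, "ReIsEr4", 7) != 0)
			return RETERR(-EINVAL);
		return 0;
	}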
8787+
8788+#if REISER4_LARGE_KEY
8789+/*
8790+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8791+ */
8792+#define ON_LARGE_KEY(...) __VA_ARGS__
8793+#else
8794+#define ON_LARGE_KEY(...)
8795+#endif
8796+
8797+/* __FS_REISER4_DEBUG_H__ */
8798+#endif
8799+
8800+/* Make Linus happy.
8801+ Local variables:
8802+ c-indentation-style: "K&R"
8803+ mode-name: "LC"
8804+ c-basic-offset: 8
8805+ tab-width: 8
8806+ fill-column: 120
8807+ End:
8808+*/
8809diff -urN linux-2.6.22.orig/fs/reiser4/dformat.h linux-2.6.22/fs/reiser4/dformat.h
8810--- linux-2.6.22.orig/fs/reiser4/dformat.h 1970-01-01 03:00:00.000000000 +0300
8811+++ linux-2.6.22/fs/reiser4/dformat.h 2007-07-29 00:25:34.836686123 +0400
8812@@ -0,0 +1,70 @@
8813+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8814+
8815+/* Formats of on-disk data and conversion functions. */
8816+
8817+/* Put all item formats in the files describing the particular items.
8818+ Our model is: everything you need to do to add an item to reiser4
8819+ (except the changes to the plugin that uses the item, which go
8820+ into the file defining that plugin) goes into one file. */
8821+/* Data on disk are stored in little-endian format.
8822+ To declare fields of on-disk structures, use d8, d16, d32 and d64.
8823+ d??tocpu() and cputod??() to convert. */
8824+
8825+#if !defined( __FS_REISER4_DFORMAT_H__ )
8826+#define __FS_REISER4_DFORMAT_H__
8827+
8828+#include <asm/byteorder.h>
8829+#include <asm/unaligned.h>
8830+#include <linux/types.h>
8831+
8832+typedef __u8 d8;
8833+typedef __le16 d16;
8834+typedef __le32 d32;
8835+typedef __le64 d64;
8836+
8837+#define PACKED __attribute__((packed))
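A sketch of the declaration style the comment above prescribes (the record itself is hypothetical):

	/* illustrative only: a little-endian on-disk record built from the
	 * d?? types, packed so the layout matches the disk byte-for-byte */
	typedef struct demo_disk_record {
		d16 flags;		/* 2 bytes on disk */
		d32 block_count;	/* 4 bytes */
		d64 size;		/* 8 bytes */
	} PACKED demo_disk_record;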
8838+
8839+/* data-type for block number */
8840+typedef __u64 reiser4_block_nr;
8841+
8842+/* data-type for block number on disk, disk format */
8843+typedef __le64 reiser4_dblock_nr;
8844+
8845+/**
8846+ * disk_addr_eq - compare disk addresses
8847+ * @b1: pointer to block number to compare
8848+ * @b2: pointer to block number to compare
8849+ *
8850+ * Returns true if disk addresses are the same
8851+ */
8852+static inline int disk_addr_eq(const reiser4_block_nr *b1,
8853+ const reiser4_block_nr * b2)
8854+{
8855+ assert("nikita-1033", b1 != NULL);
8856+ assert("nikita-1266", b2 != NULL);
8857+
8858+ return !memcmp(b1, b2, sizeof *b1);
8859+}
8860+
8861+/* structure of master reiser4 super block */
8862+typedef struct reiser4_master_sb {
8863+ char magic[16]; /* "ReIsEr4" */
8864+ __le16 disk_plugin_id; /* id of disk layout plugin */
8865+ __le16 blocksize;
8866+ char uuid[16]; /* unique id */
8867+ char label[16]; /* filesystem label */
8868+ __le64 diskmap; /* location of the diskmap. 0 if not present */
8869+} reiser4_master_sb;
8870+
8871+/* __FS_REISER4_DFORMAT_H__ */
8872+#endif
8873+
8874+/*
8875+ * Local variables:
8876+ * c-indentation-style: "K&R"
8877+ * mode-name: "LC"
8878+ * c-basic-offset: 8
8879+ * tab-width: 8
8880+ * fill-column: 79
8881+ * End:
8882+ */
8883diff -urN linux-2.6.22.orig/fs/reiser4/dscale.c linux-2.6.22/fs/reiser4/dscale.c
8884--- linux-2.6.22.orig/fs/reiser4/dscale.c 1970-01-01 03:00:00.000000000 +0300
8885+++ linux-2.6.22/fs/reiser4/dscale.c 2007-07-29 00:25:34.836686123 +0400
8886@@ -0,0 +1,174 @@
8887+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8888+ * reiser4/README */
8889+
8890+/* Scalable on-disk integers */
8891+
8892+/*
8893+ * Various on-disk structures contain integer-like structures. Stat-data
8894+ * contain [yes, "data" is plural, check the dictionary] file size, link
8895+ * count; an extent unit contains the extent width, etc. To accommodate the
8896+ * general case, enough space is reserved to keep the largest possible value:
8897+ * 64 bits in all the cases above. But in the overwhelming majority of cases
8898+ * the numbers actually stored in these fields are comparatively small, and reserving 8 bytes is
8899+ * a waste of precious disk bandwidth.
8900+ *
8901+ * Scalable integers are one way to solve this problem. dscale_write()
8902+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8903+ * depending on the magnitude of the value supplied. dscale_read() reads value
8904+ * previously stored by dscale_write().
8905+ *
8906+ * dscale_write() produces a format not completely unlike UTF: the two highest
8907+ * bits of the first byte are used to store a "tag". One of 4 possible tag
8908+ * values is chosen depending on the number being encoded:
8909+ *
8910+ * 0 ... 0x3f => 0 [table 1]
8911+ * 0x40 ... 0x3fff => 1
8912+ * 0x4000 ... 0x3fffffff => 2
8913+ * 0x40000000 ... 0xffffffffffffffff => 3
8914+ *
8915+ * (see dscale_range() function)
8916+ *
8917+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8918+ * to be stored, so in this case there is no place in the first byte to store
8919+ * tag. For such values tag is stored in an extra 9th byte.
8920+ *
8921+ * As _highest_ bits are used for the test (which is natural) scaled integers
8922+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8923+ * uses LITTLE-ENDIAN.
8924+ *
8925+ */
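To make the encoding concrete, here is a round-trip for the value 0x1234, which falls in the tag-1 range of table 1 (the byte values follow from dscale_write() below):

	unsigned char buf[9];
	__u64 v;

	dscale_write(buf, 0x1234);	/* buf[0] == 0x52 (tag 1 in the top
					 * bits, then 0x12), buf[1] == 0x34;
					 * returns 2 */
	dscale_read(buf, &v);		/* v == 0x1234 again; returns 2 */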
8926+
8927+#include "debug.h"
8928+#include "dscale.h"
8929+
8930+/* return tag of scaled integer stored at @address */
8931+static int gettag(const unsigned char *address)
8932+{
8933+ /* tag is stored in two highest bits */
8934+ return (*address) >> 6;
8935+}
8936+
8937+/* clear tag from value. Clear tag embedded into @value. */
8938+static void cleartag(__u64 * value, int tag)
8939+{
8940+ /*
8941+ * W-w-what ?!
8942+ *
8943+ * Actually, this is rather simple: @value passed here was read by
8944+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8945+ * zeroes. Tag is still stored in the highest (arithmetically)
8946+ * non-zero bits of @value, but relative position of tag within __u64
8947+ * depends on @tag.
8948+ *
8949+ * For example, if @tag is 0, it's stored in the 2 highest bits of the lowest
8950+ * byte, and its offset (counting from the lowest bit) is 8 - 2 == 6 bits.
8951+ *
8952+ * If tag is 1, it's stored in the two highest bits of the 2nd lowest byte,
8953+ * and its offset is (2 * 8) - 2 == 14 bits.
8954+ *
8955+ * See table 1 above for details.
8956+ *
8957+ * All these cases are captured by the formula:
8958+ */
8959+ *value &= ~(3 << (((1 << tag) << 3) - 2));
8960+ /*
8961+ * That is, clear two (3 == 0t11) bits at the offset
8962+ *
8963+ * 8 * (2 ^ tag) - 2,
8964+ *
8965+ * that is, two highest bits of (2 ^ tag)-th byte of @value.
8966+ */
8967+}
8968+
8969+/* return tag for @value. See table 1 above for details. */
8970+static int dscale_range(__u64 value)
8971+{
8972+ if (value > 0x3fffffff)
8973+ return 3;
8974+ if (value > 0x3fff)
8975+ return 2;
8976+ if (value > 0x3f)
8977+ return 1;
8978+ return 0;
8979+}
8980+
8981+/* restore value stored at @address by dscale_write() and return number of
8982+ * bytes consumed */
8983+int dscale_read(unsigned char *address, __u64 * value)
8984+{
8985+ int tag;
8986+
8987+ /* read tag */
8988+ tag = gettag(address);
8989+ switch (tag) {
8990+ case 3:
8991+ /* In this case tag is stored in an extra byte, skip this byte
8992+ * and decode value stored in the next 8 bytes.*/
8993+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
8994+ /* worst case: 8 bytes for value itself plus one byte for
8995+ * tag. */
8996+ return 9;
8997+ case 0:
8998+ *value = get_unaligned(address);
8999+ break;
9000+ case 1:
9001+ *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9002+ break;
9003+ case 2:
9004+ *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9005+ break;
9006+ default:
9007+ return RETERR(-EIO);
9008+ }
9009+ /* clear tag embedded into @value */
9010+ cleartag(value, tag);
9011+ /* number of bytes consumed is (2 ^ tag)---see table 1. */
9012+ return 1 << tag;
9013+}
9014+
9015+/* store @value at @address and return number of bytes consumed */
9016+int dscale_write(unsigned char *address, __u64 value)
9017+{
9018+ int tag;
9019+ int shift;
9020+ __be64 v;
9021+ unsigned char *valarr;
9022+
9023+ tag = dscale_range(value);
9024+ v = __cpu_to_be64(value);
9025+ valarr = (unsigned char *)&v;
9026+ shift = (tag == 3) ? 1 : 0;
9027+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9028+ *address |= (tag << 6);
9029+ return shift + (1 << tag);
9030+}
9031+
9032+/* number of bytes required to store @value */
9033+int dscale_bytes(__u64 value)
9034+{
9035+ int bytes;
9036+
9037+ bytes = 1 << dscale_range(value);
9038+ if (bytes == 8)
9039+ ++bytes;
9040+ return bytes;
9041+}
9042+
9043+/* returns true if @value and @other require the same number of bytes to be
9044+ * stored. Used to detect when a data structure (like stat-data) has to be
9045+ * expanded or contracted. */
9046+int dscale_fit(__u64 value, __u64 other)
9047+{
9048+ return dscale_range(value) == dscale_range(other);
9049+}
9050+
9051+/* Make Linus happy.
9052+ Local variables:
9053+ c-indentation-style: "K&R"
9054+ mode-name: "LC"
9055+ c-basic-offset: 8
9056+ tab-width: 8
9057+ fill-column: 120
9058+ scroll-step: 1
9059+ End:
9060+*/
9061diff -urN linux-2.6.22.orig/fs/reiser4/dscale.h linux-2.6.22/fs/reiser4/dscale.h
9062--- linux-2.6.22.orig/fs/reiser4/dscale.h 1970-01-01 03:00:00.000000000 +0300
9063+++ linux-2.6.22/fs/reiser4/dscale.h 2007-07-29 00:25:34.836686123 +0400
9064@@ -0,0 +1,27 @@
9065+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9066+ * reiser4/README */
9067+
9068+/* Scalable on-disk integers. See dscale.c for details. */
9069+
9070+#if !defined( __FS_REISER4_DSCALE_H__ )
9071+#define __FS_REISER4_DSCALE_H__
9072+
9073+#include "dformat.h"
9074+
9075+extern int dscale_read(unsigned char *address, __u64 * value);
9076+extern int dscale_write(unsigned char *address, __u64 value);
9077+extern int dscale_bytes(__u64 value);
9078+extern int dscale_fit(__u64 value, __u64 other);
9079+
9080+/* __FS_REISER4_DSCALE_H__ */
9081+#endif
9082+
9083+/* Make Linus happy.
9084+ Local variables:
9085+ c-indentation-style: "K&R"
9086+ mode-name: "LC"
9087+ c-basic-offset: 8
9088+ tab-width: 8
9089+ fill-column: 120
9090+ End:
9091+*/
9092diff -urN linux-2.6.22.orig/fs/reiser4/entd.c linux-2.6.22/fs/reiser4/entd.c
9093--- linux-2.6.22.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9094+++ linux-2.6.22/fs/reiser4/entd.c 2007-07-29 00:25:34.840687159 +0400
9095@@ -0,0 +1,335 @@
9096+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9097+ * reiser4/README */
9098+
9099+/* Ent daemon. */
9100+
9101+#include "debug.h"
9102+#include "txnmgr.h"
9103+#include "tree.h"
9104+#include "entd.h"
9105+#include "super.h"
9106+#include "context.h"
9107+#include "reiser4.h"
9108+#include "vfs_ops.h"
9109+#include "page_cache.h"
9110+#include "inode.h"
9111+
9112+#include <linux/sched.h> /* struct task_struct */
9113+#include <linux/suspend.h>
9114+#include <linux/kernel.h>
9115+#include <linux/writeback.h>
9116+#include <linux/time.h> /* INITIAL_JIFFIES */
9117+#include <linux/backing-dev.h> /* bdi_write_congested */
9118+#include <linux/wait.h>
9119+#include <linux/kthread.h>
9120+#include <linux/freezer.h>
9121+
9122+#define DEF_PRIORITY 12
9123+#define MAX_ENTD_ITERS 10
9124+
9125+static void entd_flush(struct super_block *, struct wbq *);
9126+static int entd(void *arg);
9127+
9128+/*
9129+ * set ->comm field of ent thread to make its state visible to the user level
9130+ */
9131+#define entd_set_comm(state) \
9132+ snprintf(current->comm, sizeof(current->comm), \
9133+ "ent:%s%s", super->s_id, (state))
9134+
9135+/**
9136+ * reiser4_init_entd - initialize entd context and start kernel daemon
9137+ * @super: super block to start ent thread for
9138+ *
9139+ * Creates the entd context, starts the kernel thread and waits until it
9140+ * initializes.
9141+ */
9142+int reiser4_init_entd(struct super_block *super)
9143+{
9144+ entd_context *ctx;
9145+
9146+ assert("nikita-3104", super != NULL);
9147+
9148+ ctx = get_entd_context(super);
9149+
9150+ memset(ctx, 0, sizeof *ctx);
9151+ spin_lock_init(&ctx->guard);
9152+ init_waitqueue_head(&ctx->wait);
9153+#if REISER4_DEBUG
9154+ INIT_LIST_HEAD(&ctx->flushers_list);
9155+#endif
9156+ /* lists of writepage requests */
9157+ INIT_LIST_HEAD(&ctx->todo_list);
9158+ INIT_LIST_HEAD(&ctx->done_list);
9159+ /* start entd */
9160+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9161+ if (IS_ERR(ctx->tsk))
9162+ return PTR_ERR(ctx->tsk);
9163+ return 0;
9164+}
9165+
9166+static void put_wbq(struct wbq *rq)
9167+{
9168+ iput(rq->mapping->host);
9169+ complete(&rq->completion);
9170+}
9171+
9172+/* ent should be locked */
9173+static struct wbq *__get_wbq(entd_context * ent)
9174+{
9175+ struct wbq *wbq;
9176+
9177+ if (list_empty(&ent->todo_list))
9178+ return NULL;
9179+
9180+ ent->nr_todo_reqs --;
9181+ wbq = list_entry(ent->todo_list.next, struct wbq, link);
9182+ list_del_init(&wbq->link);
9183+ return wbq;
9184+}
9185+
9186+/* ent thread function */
9187+static int entd(void *arg)
9188+{
9189+ struct super_block *super;
9190+ entd_context *ent;
9191+ int done = 0;
9192+
9193+ super = arg;
9194+ /* do_fork() just copies task_struct into the new
9195+ thread. ->fs_context shouldn't be copied of course. This shouldn't
9196+ be a problem for the rest of the code though.
9197+ */
9198+ current->journal_info = NULL;
9199+
9200+ ent = get_entd_context(super);
9201+
9202+ while (!done) {
9203+ try_to_freeze();
9204+
9205+ spin_lock(&ent->guard);
9206+ while (ent->nr_todo_reqs != 0) {
9207+ struct wbq *rq;
9208+
9209+ assert("", list_empty(&ent->done_list));
9210+
9211+ /* take request from the queue head */
9212+ rq = __get_wbq(ent);
9213+ assert("", rq != NULL);
9214+ ent->cur_request = rq;
9215+ spin_unlock(&ent->guard);
9216+
9217+ entd_set_comm("!");
9218+ entd_flush(super, rq);
9219+
9220+ put_wbq(rq);
9221+
9222+ /*
9223+ * wakeup all requestors and iput their inodes
9224+ */
9225+ spin_lock(&ent->guard);
9226+ while (!list_empty(&ent->done_list)) {
9227+ rq = list_entry(ent->done_list.next, struct wbq, link);
9228+ list_del_init(&rq->link);
9229+ ent->nr_done_reqs --;
9230+ spin_unlock(&ent->guard);
9231+ assert("", rq->written == 1);
9232+ put_wbq(rq);
9233+ spin_lock(&ent->guard);
9234+ }
9235+ }
9236+ spin_unlock(&ent->guard);
9237+
9238+ entd_set_comm(".");
9239+
9240+ {
9241+ DEFINE_WAIT(__wait);
9242+
9243+ do {
9244+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9245+ if (kthread_should_stop()) {
9246+ done = 1;
9247+ break;
9248+ }
9249+ if (ent->nr_todo_reqs != 0)
9250+ break;
9251+ schedule();
9252+ } while (0);
9253+ finish_wait(&ent->wait, &__wait);
9254+ }
9255+ }
9256+ BUG_ON(ent->nr_todo_reqs != 0);
9257+ return 0;
9258+}
9259+
9260+/**
9261+ * reiser4_done_entd - stop entd kernel thread
9262+ * @super: super block to stop ent thread for
9263+ *
9264+ * It is called on umount. Sends a stop signal to entd and waits until it handles
9265+ * it.
9266+ */
9267+void reiser4_done_entd(struct super_block *super)
9268+{
9269+ entd_context *ent;
9270+
9271+ assert("nikita-3103", super != NULL);
9272+
9273+ ent = get_entd_context(super);
9274+ assert("zam-1055", ent->tsk != NULL);
9275+ kthread_stop(ent->tsk);
9276+}
9277+
9278+/* called at the beginning of jnode_flush to register flusher thread with ent
9279+ * daemon */
9280+void reiser4_enter_flush(struct super_block *super)
9281+{
9282+ entd_context *ent;
9283+
9284+ assert("zam-1029", super != NULL);
9285+ ent = get_entd_context(super);
9286+
9287+ assert("zam-1030", ent != NULL);
9288+
9289+ spin_lock(&ent->guard);
9290+ ent->flushers++;
9291+#if REISER4_DEBUG
9292+ list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9293+#endif
9294+ spin_unlock(&ent->guard);
9295+}
9296+
9297+/* called at the end of jnode_flush */
9298+void reiser4_leave_flush(struct super_block *super)
9299+{
9300+ entd_context *ent;
9301+ int wake_up_ent;
9302+
9303+ assert("zam-1027", super != NULL);
9304+ ent = get_entd_context(super);
9305+
9306+ assert("zam-1028", ent != NULL);
9307+
9308+ spin_lock(&ent->guard);
9309+ ent->flushers--;
9310+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9311+#if REISER4_DEBUG
9312+ list_del_init(&get_current_context()->flushers_link);
9313+#endif
9314+ spin_unlock(&ent->guard);
9315+ if (wake_up_ent)
9316+ wake_up(&ent->wait);
9317+}
9318+
9319+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9320+
9321+static void entd_flush(struct super_block *super, struct wbq *rq)
9322+{
9323+ reiser4_context ctx;
9324+ int tmp;
9325+
9326+ init_stack_context(&ctx, super);
9327+ ctx.entd = 1;
9328+ ctx.gfp_mask = GFP_NOFS;
9329+
9330+ rq->wbc->range_start = page_offset(rq->page);
9331+ rq->wbc->range_end = rq->wbc->range_start +
9332+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9333+ tmp = rq->wbc->nr_to_write;
9334+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9335+
9336+ if (rq->wbc->nr_to_write > 0) {
9337+ rq->wbc->range_start = 0;
9338+ rq->wbc->range_end = LLONG_MAX;
9339+ generic_sync_sb_inodes(super, rq->wbc);
9340+ }
9341+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9342+ reiser4_writeout(super, rq->wbc);
9343+
9344+ context_set_commit_async(&ctx);
9345+ reiser4_exit_context(&ctx);
9346+}
9347+
9348+/**
9349+ * write_page_by_ent - ask entd thread to flush this page as part of slum
9350+ * @page: page to be written
9351+ * @wbc: writeback control passed to reiser4_writepage
9352+ *
9353+ * Creates a request, puts it on entd's list of requests, wakes up entd if
9354+ * necessary, waits until entd completes with the request.
9355+ */
9356+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9357+{
9358+ struct super_block *sb;
9359+ struct inode *inode;
9360+ entd_context *ent;
9361+ struct wbq rq;
9362+
9363+ assert("", PageLocked(page));
9364+ assert("", page->mapping != NULL);
9365+
9366+ sb = page->mapping->host->i_sb;
9367+ ent = get_entd_context(sb);
9368+ assert("", ent && ent->done == 0);
9369+
9370+ /*
9371+ * we are going to unlock page and ask ent thread to write the
9372+ * page. Re-dirty page before unlocking so that if ent thread fails to
9373+ * write it - it will remain dirty
9374+ */
9375+ reiser4_set_page_dirty_internal(page);
9376+
9377+ /*
9378+ * pin inode in memory, unlock page, entd_flush will iput. We cannot
9379+ * iput here because we cannot allow delete_inode to be called here
9380+ */
9381+ inode = igrab(page->mapping->host);
9382+ unlock_page(page);
9383+ if (inode == NULL)
9384+ /* inode is getting freed */
9385+ return 0;
9386+
9387+ /* init wbq */
9388+ INIT_LIST_HEAD(&rq.link);
9389+ rq.magic = WBQ_MAGIC;
9390+ rq.wbc = wbc;
9391+ rq.page = page;
9392+ rq.mapping = inode->i_mapping;
9393+ rq.node = NULL;
9394+ rq.written = 0;
9395+ init_completion(&rq.completion);
9396+
9397+ /* add request to entd's list of writepage requests */
9398+ spin_lock(&ent->guard);
9399+ ent->nr_todo_reqs++;
9400+ list_add_tail(&rq.link, &ent->todo_list);
9401+ if (ent->nr_todo_reqs == 1)
9402+ wake_up(&ent->wait);
9403+
9404+ spin_unlock(&ent->guard);
9405+
9406+ /* wait until entd finishes */
9407+ wait_for_completion(&rq.completion);
9408+
9409+ if (rq.written)
9410+ /* Eventually ENTD has written the page to disk. */
9411+ return 0;
9412+ return 0;
9413+}
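A hedged sketch of the call-site shape this function is designed for (reiser4_writepage's real logic lives elsewhere in this patch):

	/* illustrative only: hand a locked dirty page over to the ent
	 * daemon and block until the request has been handled */
	static int writepage_sketch(struct page *page,
				    struct writeback_control *wbc)
	{
		return write_page_by_ent(page, wbc);
	}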
9414+
9415+int wbq_available(void)
9416+{
9417+ struct super_block *sb = reiser4_get_current_sb();
9418+ entd_context *ent = get_entd_context(sb);
9419+ return ent->nr_todo_reqs;
9420+}
9421+
9422+/*
9423+ * Local variables:
9424+ * c-indentation-style: "K&R"
9425+ * mode-name: "LC"
9426+ * c-basic-offset: 8
9427+ * tab-width: 8
9428+ * fill-column: 79
9429+ * End:
9430+ */
9431diff -urN linux-2.6.22.orig/fs/reiser4/entd.h linux-2.6.22/fs/reiser4/entd.h
9432--- linux-2.6.22.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9433+++ linux-2.6.22/fs/reiser4/entd.h 2007-07-29 00:25:34.840687159 +0400
9434@@ -0,0 +1,90 @@
9435+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9436+
9437+/* Ent daemon. */
9438+
9439+#ifndef __ENTD_H__
9440+#define __ENTD_H__
9441+
9442+#include "context.h"
9443+
9444+#include <linux/fs.h>
9445+#include <linux/completion.h>
9446+#include <linux/wait.h>
9447+#include <linux/spinlock.h>
9448+#include <linux/sched.h> /* for struct task_struct */
9449+
9450+#define WBQ_MAGIC 0x7876dc76
9451+
9452+/* write-back request. */
9453+struct wbq {
9454+ int magic;
9455+ struct list_head link; /* list head of this list is in entd context */
9456+ struct writeback_control *wbc;
9457+ struct page *page;
9458+ struct address_space *mapping;
9459+ struct completion completion;
9460+ jnode *node; /* set if ent thread captured requested page */
9461+ int written; /* set if ent thread wrote requested page */
9462+};
9463+
9464+/* ent-thread context. This is used to synchronize starting/stopping ent
9465+ * threads. */
9466+typedef struct entd_context {
9467+ /* wait queue that ent thread waits on for more work. It's
9468+ * signaled by write_page_by_ent(). */
9469+ wait_queue_head_t wait;
9470+ /* spinlock protecting other fields */
9471+ spinlock_t guard;
9472+ /* ent thread */
9473+ struct task_struct *tsk;
9474+ /* set to indicate that ent thread should leave. */
9475+ int done;
9476+ /* counter of active flushers */
9477+ int flushers;
9478+ /*
9479+ * when reiser4_writepage asks entd to write a page - it adds struct
9480+ * wbq to this list
9481+ */
9482+ struct list_head todo_list;
9483+ /* number of elements on the above list */
9484+ int nr_todo_reqs;
9485+
9486+ struct wbq *cur_request;
9487+ /*
9488+ * when entd writes a page it moves write-back request from todo_list
9489+ * to done_list. This list is used at the end of entd iteration to
9490+ * wakeup requestors and iput inodes.
9491+ */
9492+ struct list_head done_list;
9493+ /* number of elements on the above list */
9494+ int nr_done_reqs;
9495+
9496+#if REISER4_DEBUG
9497+ /* list of all active flushers */
9498+ struct list_head flushers_list;
9499+#endif
9500+} entd_context;
9501+
9502+extern int reiser4_init_entd(struct super_block *);
9503+extern void reiser4_done_entd(struct super_block *);
9504+
9505+extern void reiser4_enter_flush(struct super_block *);
9506+extern void reiser4_leave_flush(struct super_block *);
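Per the comments in entd.c, these two calls bracket a flush; a minimal sketch of the intended pairing (the flush body is elided):

	/* illustrative only: register this thread as a flusher for the
	 * duration of the flush */
	reiser4_enter_flush(super);
	/* ... squeeze/allocate/write the slum here ... */
	reiser4_leave_flush(super);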
9507+
9508+extern int write_page_by_ent(struct page *, struct writeback_control *);
9509+extern int wbq_available(void);
9510+extern void ent_writes_page(struct super_block *, struct page *);
9511+
9512+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9513+/* __ENTD_H__ */
9514+#endif
9515+
9516+/* Make Linus happy.
9517+ Local variables:
9518+ c-indentation-style: "K&R"
9519+ mode-name: "LC"
9520+ c-basic-offset: 8
9521+ tab-width: 8
9522+ fill-column: 120
9523+ End:
9524+*/
9525diff -urN linux-2.6.22.orig/fs/reiser4/eottl.c linux-2.6.22/fs/reiser4/eottl.c
9526--- linux-2.6.22.orig/fs/reiser4/eottl.c 1970-01-01 03:00:00.000000000 +0300
9527+++ linux-2.6.22/fs/reiser4/eottl.c 2007-07-29 00:25:34.840687159 +0400
9528@@ -0,0 +1,509 @@
9529+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9530+
9531+#include "forward.h"
9532+#include "debug.h"
9533+#include "key.h"
9534+#include "coord.h"
9535+#include "plugin/item/item.h"
9536+#include "plugin/node/node.h"
9537+#include "znode.h"
9538+#include "block_alloc.h"
9539+#include "tree_walk.h"
9540+#include "tree_mod.h"
9541+#include "carry.h"
9542+#include "tree.h"
9543+#include "super.h"
9544+
9545+#include <linux/types.h> /* for __u?? */
9546+
9547+/*
9548+ * Extents on the twig level (EOTTL) handling.
9549+ *
9550+ * EOTTL poses some problems to the tree traversal, that are better explained
9551+ * by example.
9552+ *
9553+ * Suppose we have block B1 on the twig level with the following items:
9554+ *
9555+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9556+ * offset)
9557+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9558+ * 2. internal item I2 with key (10:0:0:0)
9559+ *
9560+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9561+ * then intra-node lookup is done. This lookup finished on the E1, because the
9562+ * key we are looking for is larger than the key of E1 and is smaller than key
9563+ * the of I2.
9564+ *
9565+ * Here search is stuck.
9566+ *
9567+ * After some thought it is clear what is wrong here: extents on the twig level
9568+ * break some basic property of the *search* tree (on the pretext, that they
9569+ * restore property of balanced tree).
9570+ *
9571+ * Said property is the following: if in the internal node of the search tree
9572+ * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9573+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9574+ * through the Pointer.
9575+ *
9576+ * This is not true, when Pointer is Extent-Pointer, simply because extent
9577+ * cannot expand indefinitely to the right to include any item with
9578+ *
9579+ * Key1 <= Key <= Key2.
9580+ *
9581+ * For example, our E1 extent is only responsible for the data with keys
9582+ *
9583+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9584+ *
9585+ * so, key range
9586+ *
9587+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9588+ *
9589+ * is orphaned: there is no way to get there from the tree root.
9590+ *
9591+ * In other words, extent pointers are different than normal child pointers as
9592+ * far as search tree is concerned, and this creates such problems.
9593+ *
9594+ * Possible solution for this problem is to insert our item into node pointed
9595+ * to by I2. There are some problems, though:
9596+ *
9597+ * (1) I2 can be in a different node.
9598+ * (2) E1 can be immediately followed by another extent E2.
9599+ *
9600+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9601+ * for locks/coords as necessary.
9602+ *
9603+ * (2) is more complex. Solution here is to insert new empty leaf node and
9604+ * insert internal item between E1 and E2 pointing to said leaf node. This is
9605+ * further complicated by possibility that E2 is in a different node, etc.
9606+ *
9607+ * Problems:
9608+ *
9609+ * (1) if there was an internal item I2 immediately on the right of an extent
9610+ * E1 and we decided to insert a new item S1 into the node N2 pointed to by I2,
9611+ * then the key of S1 will be less than the smallest key in N2. Normally,
9612+ * search checks that the key we are looking for is in the range of keys
9613+ * covered by the node it is being looked up in. To work around this situation,
9614+ * while preserving the useful consistency check, a new flag CBK_TRUST_DK was
9615+ * added to the cbk flags bitmask. This flag is automatically set on entrance to the
9616+ * coord_by_key() and is only cleared when we are about to enter situation
9617+ * described above.
9618+ *
9619+ * (2) If extent E1 is immediately followed by another extent E2 and we are
9620+ * searching for a key that is between E1 and E2, we only have to insert a new
9621+ * empty leaf node when coord_by_key was called for insertion, rather than just
9622+ * for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was added to
9623+ * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9624+ * performed by insert_by_key() and friends.
9625+ *
9626+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9627+ * case it requires modification of node content which is only possible under
9628+ * write lock. It may well happen that we only have read lock on the node where
9629+ * new internal pointer is to be inserted (common case: lookup of non-existent
9630+ * stat-data that fells between two extents). If only read lock is held, tree
9631+ * traversal is restarted with lock_level modified so that next time we hit
9632+ * this problem, write lock will be held. Once we have write lock, balancing
9633+ * will be performed.
9634+ */
9635+
9636+/**
9637+ * is_next_item_internal - check whether next item is internal
9638+ * @coord: coordinate of extent item in twig node
9639+ * @key: search key
9640+ * @lh: twig node lock handle
9641+ *
9642+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9643+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9644+ * to that node, @coord is set to its first unit. If next item is not internal
9645+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9646+ * is returned if search restart has to be done.
9647+ */
9648+static int
9649+is_next_item_internal(coord_t *coord, const reiser4_key *key,
9650+ lock_handle *lh)
9651+{
9652+ coord_t next;
9653+ lock_handle rn;
9654+ int result;
9655+
9656+ coord_dup(&next, coord);
9657+ if (coord_next_unit(&next) == 0) {
9658+ /* next unit is in this node */
9659+ if (item_is_internal(&next)) {
9660+ coord_dup(coord, &next);
9661+ return 1;
9662+ }
9663+ assert("vs-3", item_is_extent(&next));
9664+ return 0;
9665+ }
9666+
9667+ /*
9668+ * next unit either does not exist or is in right neighbor. If it is in
9669+ * right neighbor we have to check right delimiting key because
9670+ * concurrent thread could get there first and insert an item with a key
9671+ * smaller than @key
9672+ */
9673+ read_lock_dk(current_tree);
9674+ result = keycmp(key, znode_get_rd_key(coord->node));
9675+ read_unlock_dk(current_tree);
9676+ assert("vs-6", result != EQUAL_TO);
9677+ if (result == GREATER_THAN)
9678+ return 2;
9679+
9680+ /* lock right neighbor */
9681+ init_lh(&rn);
9682+ result = reiser4_get_right_neighbor(&rn, coord->node,
9683+ znode_is_wlocked(coord->node) ?
9684+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9685+ GN_CAN_USE_UPPER_LEVELS);
9686+ if (result == -E_NO_NEIGHBOR) {
9687+ /* we are on the rightmost edge of the tree */
9688+ done_lh(&rn);
9689+ return 0;
9690+ }
9691+
9692+ if (result) {
9693+ assert("vs-4", result < 0);
9694+ done_lh(&rn);
9695+ return result;
9696+ }
9697+
9698+ /*
9699+ * check whether concurrent thread managed to insert item with a key
9700+ * smaller than @key
9701+ */
9702+ read_lock_dk(current_tree);
9703+ result = keycmp(key, znode_get_ld_key(rn.node));
9704+ read_unlock_dk(current_tree);
9705+ assert("vs-6", result != EQUAL_TO);
9706+ if (result == GREATER_THAN) {
9707+ done_lh(&rn);
9708+ return 2;
9709+ }
9710+
9711+ result = zload(rn.node);
9712+ if (result) {
9713+ assert("vs-5", result < 0);
9714+ done_lh(&rn);
9715+ return result;
9716+ }
9717+
9718+ coord_init_first_unit(&next, rn.node);
9719+ if (item_is_internal(&next)) {
9720+ /*
9721+ * next unit is in right neighbor and it is an unit of internal
9722+ * item. Unlock coord->node. Move @lh to right neighbor. @coord
9723+ * is set to the first unit of right neighbor.
9724+ */
9725+ coord_dup(coord, &next);
9726+ zrelse(rn.node);
9727+ done_lh(lh);
9728+ move_lh(lh, &rn);
9729+ return 1;
9730+ }
9731+
9732+ /*
9733+ * next unit is a unit of an extent item. Return without changing @lh and
9734+ * @coord.
9735+ */
9736+ assert("vs-6", item_is_extent(&next));
9737+ zrelse(rn.node);
9738+ done_lh(&rn);
9739+ return 0;
9740+}
9741+
9742+/**
9743+ * rd_key - calculate key of an item next to the given one
9744+ * @coord: position in a node
9745+ * @key: storage for result key
9746+ *
9747+ * @coord is set between items or after the last item in a node. Calculate key
9748+ * of item to the right of @coord.
9749+ */
9750+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9751+{
9752+ coord_t dup;
9753+
9754+ assert("nikita-2281", coord_is_between_items(coord));
9755+ coord_dup(&dup, coord);
9756+
9757+ if (coord_set_to_right(&dup) == 0)
9758+ /* next item is in this node. Return its key. */
9759+ unit_key_by_coord(&dup, key);
9760+ else {
9761+ /*
9762+ * next item either does not exist or is in right
9763+ * neighbor. Return znode's right delimiting key.
9764+ */
9765+ read_lock_dk(current_tree);
9766+ *key = *znode_get_rd_key(coord->node);
9767+ read_unlock_dk(current_tree);
9768+ }
9769+ return key;
9770+}
9771+
9772+/**
9773+ * add_empty_leaf - insert empty leaf between two extents
9774+ * @insert_coord: position in twig node between two extents
9775+ * @lh: twig node lock handle
9776+ * @key: left delimiting key of new node
9777+ * @rdkey: right delimiting key of new node
9778+ *
9779+ * Inserts empty leaf node between two extent items. It is necessary when we
9780+ * have to insert an item on leaf level between two extents (items on the twig
9781+ * level).
9782+ */
9783+static int
9784+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9785+ const reiser4_key *key, const reiser4_key *rdkey)
9786+{
9787+ int result;
9788+ carry_pool *pool;
9789+ carry_level *todo;
9790+ reiser4_item_data *item;
9791+ carry_insert_data *cdata;
9792+ carry_op *op;
9793+ znode *node;
9794+ reiser4_tree *tree;
9795+
9796+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9797+ tree = znode_get_tree(insert_coord->node);
9798+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9799+ if (IS_ERR(node))
9800+ return PTR_ERR(node);
9801+
9802+ /* setup delimiting keys for node being inserted */
9803+ write_lock_dk(tree);
9804+ znode_set_ld_key(node, key);
9805+ znode_set_rd_key(node, rdkey);
9806+ ON_DEBUG(node->creator = current);
9807+ ON_DEBUG(node->first_key = *key);
9808+ write_unlock_dk(tree);
9809+
9810+ ZF_SET(node, JNODE_ORPHAN);
9811+
9812+ /*
9813+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9814+ * carry_insert_data
9815+ */
9816+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9817+ sizeof(*item) + sizeof(*cdata));
9818+ if (IS_ERR(pool))
9819+ return PTR_ERR(pool);
9820+ todo = (carry_level *) (pool + 1);
9821+ init_carry_level(todo, pool);
9822+
9823+ item = (reiser4_item_data *) (todo + 3);
9824+ cdata = (carry_insert_data *) (item + 1);
9825+
9826+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9827+ if (!IS_ERR(op)) {
9828+ cdata->coord = insert_coord;
9829+ cdata->key = key;
9830+ cdata->data = item;
9831+ op->u.insert.d = cdata;
9832+ op->u.insert.type = COPT_ITEM_DATA;
9833+ build_child_ptr_data(node, item);
9834+ item->arg = NULL;
9835+ /* have @insert_coord to be set at inserted item after
9836+ insertion is done */
9837+ todo->track_type = CARRY_TRACK_CHANGE;
9838+ todo->tracked = lh;
9839+
9840+ result = reiser4_carry(todo, NULL);
9841+ if (result == 0) {
9842+ /*
9843+ * pin node in memory. This is necessary for
9844+ * znode_make_dirty() below.
9845+ */
9846+ result = zload(node);
9847+ if (result == 0) {
9848+ lock_handle local_lh;
9849+
9850+ /*
9851+ * if we inserted new child into tree we have
9852+ * to mark it dirty so that flush will be able
9853+ * to process it.
9854+ */
9855+ init_lh(&local_lh);
9856+ result = longterm_lock_znode(&local_lh, node,
9857+ ZNODE_WRITE_LOCK,
9858+ ZNODE_LOCK_LOPRI);
9859+ if (result == 0) {
9860+ znode_make_dirty(node);
9861+
9862+ /*
9863+ * when internal item pointing to @node
9864+ * was inserted into twig node
9865+ * create_hook_internal did not connect
9866+ * it properly because its right
9867+ * neighbor was not known. Do it
9868+ * here
9869+ */
9870+ write_lock_tree(tree);
9871+ assert("nikita-3312",
9872+ znode_is_right_connected(node));
9873+ assert("nikita-2984",
9874+ node->right == NULL);
9875+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9876+ write_unlock_tree(tree);
9877+ result =
9878+ connect_znode(insert_coord, node);
9879+ ON_DEBUG(if (result == 0) check_dkeys(node););
9880+
9881+ done_lh(lh);
9882+ move_lh(lh, &local_lh);
9883+ assert("vs-1676", node_is_empty(node));
9884+ coord_init_first_unit(insert_coord,
9885+ node);
9886+ } else {
9887+ warning("nikita-3136",
9888+ "Cannot lock child");
9889+ }
9890+ done_lh(&local_lh);
9891+ zrelse(node);
9892+ }
9893+ }
9894+ } else
9895+ result = PTR_ERR(op);
9896+ zput(node);
9897+ done_carry_pool(pool);
9898+ return result;
9899+}
9900+
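The single allocation in add_empty_leaf() above packs the carry pool, three carry levels, the item data and the insert data into one region, then carves the pieces out with pointer arithmetic. A minimal generic-C sketch of the same idiom, with hypothetical struct names (not reiser4 API):

#include <linux/slab.h>

struct pool_hdr { long reserved; };
struct level { long state; };
struct payload { long data; };

/* one allocation carries the header, three levels and the payload;
 * freeing the returned pointer releases everything at once */
static struct pool_hdr *alloc_packed(struct level **todo,
				     struct payload **item)
{
	struct pool_hdr *p;

	p = kmalloc(sizeof(*p) + 3 * sizeof(**todo) + sizeof(**item),
		    GFP_KERNEL);
	if (p == NULL)
		return NULL;
	*todo = (struct level *)(p + 1);       /* first of the three levels */
	*item = (struct payload *)(*todo + 3); /* right past all three */
	return p;
}

This mirrors how todo, item and cdata are derived from pool in the function above.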
9901+/**
9902+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9903+ * @h: search handle
9904+ * @outcome: flag saying whether search has to restart or is done
9905+ *
9906+ * Handles search on the twig level. If this function completes the search
9907+ * itself, it returns 1. If the search has to go one level down, 0 is
9908+ * returned. If an error happens, LOOKUP_DONE is returned via @outcome and the
9909+ * error code is saved in @h->result.
9910+ */
9911+int handle_eottl(cbk_handle *h, int *outcome)
9912+{
9913+ int result;
9914+ reiser4_key key;
9915+ coord_t *coord;
9916+
9917+ coord = h->coord;
9918+
9919+ if (h->level != TWIG_LEVEL ||
9920+ (coord_is_existing_item(coord) && item_is_internal(coord))) {
9921+ /* Continue to traverse tree downward. */
9922+ return 0;
9923+ }
9924+
9925+ /*
9926+ * make sure that @h->coord is set to twig node and that it is either
9927+ * set to extent item or after extent item
9928+ */
9929+ assert("vs-356", h->level == TWIG_LEVEL);
9930+ assert("vs-357", ( {
9931+ coord_t lcoord;
9932+ coord_dup(&lcoord, coord);
9933+ check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9934+ item_is_extent(&lcoord);
9935+ }
9936+ ));
9937+
9938+ if (*outcome == NS_FOUND) {
9939+ /* we have found desired key on twig level in extent item */
9940+ h->result = CBK_COORD_FOUND;
9941+ *outcome = LOOKUP_DONE;
9942+ return 1;
9943+ }
9944+
9945+ if (!(h->flags & CBK_FOR_INSERT)) {
9946+ /* tree traversal is not for insertion. Just return
9947+ CBK_COORD_NOTFOUND. */
9948+ h->result = CBK_COORD_NOTFOUND;
9949+ *outcome = LOOKUP_DONE;
9950+ return 1;
9951+ }
9952+
9953+ /* take a look at the item to the right of h->coord */
9954+ result = is_next_item_internal(coord, h->key, h->active_lh);
9955+ if (unlikely(result < 0)) {
9956+ h->error = "get_right_neighbor failed";
9957+ h->result = result;
9958+ *outcome = LOOKUP_DONE;
9959+ return 1;
9960+ }
9961+ if (result == 0) {
9962+ /*
9963+ * item to the right is also an extent one. Allocate a new node
9964+ * and insert a pointer to it after item h->coord.
9965+ *
9966+ * This is a result of extents being located at the twig
9967+ * level. For explanation, see comment just above
9968+ * is_next_item_internal().
9969+ */
9970+ znode *loaded;
9971+
9972+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
9973+ /*
9974+ * we got the node read locked; restart coord_by_key to
9975+ * get a write lock on the twig level
9976+ */
9977+ h->lock_level = TWIG_LEVEL;
9978+ h->lock_mode = ZNODE_WRITE_LOCK;
9979+ *outcome = LOOKUP_REST;
9980+ return 1;
9981+ }
9982+
9983+ loaded = coord->node;
9984+ result =
9985+ add_empty_leaf(coord, h->active_lh, h->key,
9986+ rd_key(coord, &key));
9987+ if (result) {
9988+ h->error = "could not add empty leaf";
9989+ h->result = result;
9990+ *outcome = LOOKUP_DONE;
9991+ return 1;
9992+ }
9993+ /* added empty leaf is locked (h->active_lh), its parent node
9994+ is unlocked, h->coord is set as EMPTY */
9995+ assert("vs-13", coord->between == EMPTY_NODE);
9996+ assert("vs-14", znode_is_write_locked(coord->node));
9997+ assert("vs-15",
9998+ WITH_DATA(coord->node, node_is_empty(coord->node)));
9999+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10000+ assert("vs-17", coord->node == h->active_lh->node);
10001+ *outcome = LOOKUP_DONE;
10002+ h->result = CBK_COORD_NOTFOUND;
10003+ return 1;
10004+ } else if (result == 1) {
10005+ /*
10006+ * this is the special case mentioned in the comment on
10007+ * tree.h:cbk_flags. We have found an internal item immediately to
10008+ * the right of an extent, and we are going to insert a new item
10009+ * there. The key of the item we are going to insert is smaller than
10010+ * the leftmost key in the node pointed to by said internal item
10011+ * (otherwise the search wouldn't have come to the extent in the
10012+ * first place).
10013+ *
10014+ * This is a result of extents being located at the twig
10015+ * level. For explanation, see comment just above
10016+ * is_next_item_internal().
10017+ */
10018+ h->flags &= ~CBK_TRUST_DK;
10019+ } else {
10020+ assert("vs-8", result == 2);
10021+ *outcome = LOOKUP_REST;
10022+ return 1;
10023+ }
10024+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10025+ return 0;
10026+}
10027+
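For orientation, the return protocol of handle_eottl() can be read as the following hypothetical caller fragment (the real search loop lives elsewhere in the tree-traversal code and may differ in detail):

	/* sketch only: outcome initially holds the node-plugin lookup
	 * result (NS_FOUND or NS_NOT_FOUND), as assumed from the
	 * *outcome == NS_FOUND test above */
	if (handle_eottl(h, &outcome)) {
		/* 1: handler finished the lookup (LOOKUP_DONE) or
		 * requested a restart with a write lock on the twig
		 * level (LOOKUP_REST); both are reported via outcome */
		return outcome;
	}
	/* 0: an internal item is to the right; descend one level */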
10028+/*
10029+ * Local variables:
10030+ * c-indentation-style: "K&R"
10031+ * mode-name: "LC"
10032+ * c-basic-offset: 8
10033+ * tab-width: 8
10034+ * fill-column: 120
10035+ * scroll-step: 1
10036+ * End:
10037+ */
10038diff -urN linux-2.6.22.orig/fs/reiser4/estimate.c linux-2.6.22/fs/reiser4/estimate.c
10039--- linux-2.6.22.orig/fs/reiser4/estimate.c 1970-01-01 03:00:00.000000000 +0300
10040+++ linux-2.6.22/fs/reiser4/estimate.c 2007-07-29 00:25:34.840687159 +0400
10041@@ -0,0 +1,120 @@
10042+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10043+
10044+#include "debug.h"
10045+#include "dformat.h"
10046+#include "tree.h"
10047+#include "carry.h"
10048+#include "inode.h"
10049+#include "plugin/cluster.h"
10050+#include "plugin/item/ctail.h"
10051+
10052+/* this returns how many nodes might get dirty and how many nodes might get added if @children nodes are dirtied
10053+
10054+ The number of internal nodes which will get dirty or get allocated is estimated as roughly 10% of the children
10055+ (103/1024 in the code below) + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on
10056+ the leaf level, 2 neighbour nodes + the current (or 1 neighbour and 1 new and the current) on the twig level, 2
10057+ neighbour nodes on upper levels and 1 for a new root. So 5 for the leaf level, 3 for the twig level, 2 on upper levels + 1 for the root.
10058+
10059+ Do not calculate the current node of the lowest level here - this is overhead only.
10060+
10061+ children is almost always 1 here. The exception is flow insertion.
10062+*/
10063+static reiser4_block_nr
10064+max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10065+{
10066+ reiser4_block_nr ten_percent;
10067+
10068+ ten_percent = ((103 * children) >> 10);
10069+
10070+ /* If we have too many balancings at a time, the tree height can grow by more
10071+ than 1. Assume that if tree_height is at least 5, it can grow by 1 only. */
10072+ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10073+}
10074+
10075+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10076+ perform insertion of one item into the tree */
10077+/* it is only called when tree height changes, or gets initialized */
10078+reiser4_block_nr calc_estimate_one_insert(tree_level height)
10079+{
10080+ return 1 + max_balance_overhead(1, height);
10081+}
10082+
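As a worked instance of the two functions above (assuming a tree of height 4, which the clamp treats as 5): for a single child, ten_percent = (103 * 1) >> 10 = 0, so max_balance_overhead(1, 4) = 5 * 2 + (4 + 0) = 14, and calc_estimate_one_insert() reserves 1 + 14 = 15 blocks.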
10083+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10084+{
10085+ return tree->estimate_one_insert;
10086+}
10087+
10088+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10089+ perform insertion of one unit into an item in the tree */
10090+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10091+{
10092+ /* estimate insert into item just like item insertion */
10093+ return tree->estimate_one_insert;
10094+}
10095+
10096+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10097+{
10098+ /* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the leaf
10099+ level */
10100+ return tree->estimate_one_insert;
10101+}
10102+
10103+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10104+ both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10105+ levels */
10106+reiser4_block_nr estimate_insert_flow(tree_level height)
10107+{
10108+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10109+ CARRY_FLOW_NEW_NODES_LIMIT,
10110+ height);
10111+}
10112+
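For a sense of scale, if CARRY_FLOW_NEW_NODES_LIMIT were 20 (an assumed value for illustration; the real constant is defined in carry.h) and the tree height were 5, estimate_insert_flow() would yield 3 + 20 + max_balance_overhead(23, 5) = 23 + (5 * 2 + 4 + ((103 * 23) >> 10)) = 23 + 16 = 39 blocks.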
10113+/* returns the max number of nodes that can be occupied by a disk cluster */
10114+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10115+{
10116+ int per_cluster;
10117+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10118+ return 3 + per_cluster +
10119+ max_balance_overhead(3 + per_cluster,
10120+ REISER4_MAX_ZTREE_HEIGHT);
10121+}
10122+
10123+/* how many nodes might get dirty and added
10124+ during insertion of a disk cluster */
10125+reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10126+{
10127+ return estimate_cluster(inode, 1); /* 24 */
10128+}
10129+
10130+/* how many nodes might get dirty and added
10131+ during update of a (prepped or unprepped) disk cluster */
10132+reiser4_block_nr estimate_update_cluster(struct inode * inode)
10133+{
10134+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10135+}
10136+
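The /* 44, for 64K-cluster */ figure above can be reproduced by hand, assuming 4K pages (so cluster_nrpages() == 16 for a 64K logical cluster) and REISER4_MAX_ZTREE_HEIGHT == 10: per_cluster = 16, ten_percent = (103 * 19) >> 10 = 1, max_balance_overhead(19, 10) = 10 * 2 + (4 + 1) = 25, and 3 + 16 + 25 = 44.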
10137+/* How many nodes occupied by a disk cluster might get dirty.
10138+ Note that this estimation is not precise (i.e. disk cluster
10139+ can occupy more nodes).
10140+ Q: Why don't we use a precise estimation?
10141+ A: 1.Because a precise estimation is fairly bad: 65536 nodes
10142+ for a 64K logical cluster would mean 256M of dead space on
10143+ a partition
10144+ 2.It is a very rare case when a disk cluster occupies more
10145+ nodes than this estimation returns.
10146+*/
10147+reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10148+{
10149+ return cluster_nrpages(inode) + 4;
10150+}
10151+
10152+/* Make Linus happy.
10153+ Local variables:
10154+ c-indentation-style: "K&R"
10155+ mode-name: "LC"
10156+ c-basic-offset: 8
10157+ tab-width: 8
10158+ fill-column: 120
10159+ scroll-step: 1
10160+ End:
10161+*/
10162diff -urN linux-2.6.22.orig/fs/reiser4/export_ops.c linux-2.6.22/fs/reiser4/export_ops.c
10163--- linux-2.6.22.orig/fs/reiser4/export_ops.c 1970-01-01 03:00:00.000000000 +0300
10164+++ linux-2.6.22/fs/reiser4/export_ops.c 2007-07-29 00:25:34.840687159 +0400
10165@@ -0,0 +1,295 @@
10166+/* Copyright 2005 by Hans Reiser, licensing governed by
10167+ * reiser4/README */
10168+
10169+#include "inode.h"
10170+#include "plugin/plugin.h"
10171+
10172+/*
10173+ * Supported file-handle types
10174+ */
10175+typedef enum {
10176+ FH_WITH_PARENT = 0x10, /* file handle with parent */
10177+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10178+} reiser4_fhtype;
10179+
10180+#define NFSERROR (255)
10181+
10182+/* initialize place-holder for object */
10183+static void object_on_wire_init(reiser4_object_on_wire *o)
10184+{
10185+ o->plugin = NULL;
10186+}
10187+
10188+/* finish with @o */
10189+static void object_on_wire_done(reiser4_object_on_wire *o)
10190+{
10191+ if (o->plugin != NULL)
10192+ o->plugin->wire.done(o);
10193+}
10194+
10195+/*
10196+ * read serialized object identity from @addr and store information about
10197+ * object in @obj. This is dual to encode_inode().
10198+ */
10199+static char *decode_inode(struct super_block *s, char *addr,
10200+ reiser4_object_on_wire * obj)
10201+{
10202+ file_plugin *fplug;
10203+
10204+ /* identifier of object plugin is stored in the first two bytes,
10205+ * followed by... */
10206+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10207+ if (fplug != NULL) {
10208+ addr += sizeof(d16);
10209+ obj->plugin = fplug;
10210+ assert("nikita-3520", fplug->wire.read != NULL);
10211+ /* plugin specific encoding of object identity. */
10212+ addr = fplug->wire.read(addr, obj);
10213+ } else
10214+ addr = ERR_PTR(RETERR(-EINVAL));
10215+ return addr;
10216+}
10217+
10218+/**
10219+ * reiser4_decode_fh - decode_fh of export operations
10220+ * @super: super block
10221+ * @fh: nfsd file handle
10222+ * @len: length of file handle
10223+ * @fhtype: type of file handle
10224+ * @acceptable: acceptability testing function
10225+ * @context: argument for @acceptable
10226+ *
10227+ * Returns dentry referring to the same file as @fh.
10228+ */
10229+static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10230+ int len, int fhtype,
10231+ int (*acceptable) (void *context,
10232+ struct dentry *de),
10233+ void *context)
10234+{
10235+ reiser4_context *ctx;
10236+ reiser4_object_on_wire object;
10237+ reiser4_object_on_wire parent;
10238+ char *addr;
10239+ int with_parent;
10240+
10241+ ctx = reiser4_init_context(super);
10242+ if (IS_ERR(ctx))
10243+ return (struct dentry *)ctx;
10244+
10245+ assert("vs-1482",
10246+ fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10247+
10248+ with_parent = (fhtype == FH_WITH_PARENT);
10249+
10250+ addr = (char *)fh;
10251+
10252+ object_on_wire_init(&object);
10253+ object_on_wire_init(&parent);
10254+
10255+ addr = decode_inode(super, addr, &object);
10256+ if (!IS_ERR(addr)) {
10257+ if (with_parent)
10258+ addr = decode_inode(super, addr, &parent);
10259+ if (!IS_ERR(addr)) {
10260+ struct dentry *d;
10261+ typeof(super->s_export_op->find_exported_dentry) fn;
10262+
10263+ fn = super->s_export_op->find_exported_dentry;
10264+ assert("nikita-3521", fn != NULL);
10265+ d = fn(super, &object, with_parent ? &parent : NULL,
10266+ acceptable, context);
10267+ if (d != NULL && !IS_ERR(d))
10268+ /* FIXME check for -ENOMEM */
10269+ reiser4_get_dentry_fsdata(d)->stateless = 1;
10270+ addr = (char *)d;
10271+ }
10272+ }
10273+
10274+ object_on_wire_done(&object);
10275+ object_on_wire_done(&parent);
10276+
10277+ reiser4_exit_context(ctx);
10278+ return (void *)addr;
10279+}
10280+
10281+/*
10282+ * Object serialization support.
10283+ *
10284+ * To support knfsd file system provides export_operations that are used to
10285+ * construct and interpret NFS file handles. As a generalization of this,
10286+ * reiser4 object plugins have serialization support: it provides methods to
10287+ * create on-wire representation of identity of reiser4 object, and
10288+ * re-create/locate object given its on-wire identity.
10289+ *
10290+ */
10291+
10292+/*
10293+ * return number of bytes that on-wire representation of @inode's identity
10294+ * consumes.
10295+ */
10296+static int encode_inode_size(struct inode *inode)
10297+{
10298+ assert("nikita-3514", inode != NULL);
10299+ assert("nikita-3515", inode_file_plugin(inode) != NULL);
10300+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10301+
10302+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10303+}
10304+
10305+/*
10306+ * store on-wire representation of @inode's identity at the area beginning at
10307+ * @start.
10308+ */
10309+static char *encode_inode(struct inode *inode, char *start)
10310+{
10311+ assert("nikita-3517", inode != NULL);
10312+ assert("nikita-3518", inode_file_plugin(inode) != NULL);
10313+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10314+
10315+ /*
10316+ * first, store two-byte identifier of object plugin, then
10317+ */
10318+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10319+ (d16 *) start);
10320+ start += sizeof(d16);
10321+ /*
10322+ * call plugin to serialize object's identity
10323+ */
10324+ return inode_file_plugin(inode)->wire.write(inode, start);
10325+}
10326+
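Taken together, encode_inode() and decode_inode() fix a simple on-wire layout; a sketch of it (field widths per the code above):

	/* on-wire identity of a reiser4 object (layout sketch):
	 *
	 *  +----------------------+-----------------------------------+
	 *  | d16 plugin id        | plugin-specific payload, written  |
	 *  | (2 bytes)            | by fplug->wire.write(), length    |
	 *  |                      | fplug->wire.size(inode) bytes     |
	 *  +----------------------+-----------------------------------+
	 */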
10327+/* this stores in @lenp the number of 32-bit words the file handle occupies.
10328+ * 255 is returned if the file handle can not be stored */
10329+/**
10330+ * reiser4_encode_fh - encode_fh of export operations
10331+ * @dentry:
10332+ * @fh:
10333+ * @lenp:
10334+ * @need_parent:
10335+ *
10336+ */
10337+static int
10338+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10339+ int need_parent)
10340+{
10341+ struct inode *inode;
10342+ struct inode *parent;
10343+ char *addr;
10344+ int need;
10345+ int delta;
10346+ int result;
10347+ reiser4_context *ctx;
10348+
10349+ /*
10350+ * knfsd asks as to serialize object in @dentry, and, optionally its
10351+ * parent (if need_parent != 0).
10352+ *
10353+ * encode_inode() and encode_inode_size() are used to build
10354+ * representation of object and its parent. All hard work is done by
10355+ * object plugins.
10356+ */
10357+ inode = dentry->d_inode;
10358+ parent = dentry->d_parent->d_inode;
10359+
10360+ addr = (char *)fh;
10361+
10362+ need = encode_inode_size(inode);
10363+ if (need < 0)
10364+ return NFSERROR;
10365+ if (need_parent) {
10366+ delta = encode_inode_size(parent);
10367+ if (delta < 0)
10368+ return NFSERROR;
10369+ need += delta;
10370+ }
10371+
10372+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
10373+ if (IS_ERR(ctx))
10374+ return PTR_ERR(ctx);
10375+
10376+ if (need <= sizeof(__u32) * (*lenp)) {
10377+ addr = encode_inode(inode, addr);
10378+ if (need_parent)
10379+ addr = encode_inode(parent, addr);
10380+
10381+ /* store in lenp number of 32bit words required for file
10382+ * handle. */
10383+ *lenp = (need + sizeof(__u32) - 1) >> 2;
10384+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10385+ } else
10386+ /* not enough space in the file handle */
10387+ result = NFSERROR;
10388+ reiser4_exit_context(ctx);
10389+ return result;
10390+}
10391+
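As a worked example of the length handling above: if the serialized identity needs 14 bytes and the caller's buffer offers *lenp = 6 words (24 bytes), the handle fits, and *lenp is rewritten to (14 + 4 - 1) >> 2 = 4 words.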
10392+/**
10393+ * reiser4_get_dentry_parent - get_parent of export operations
10394+ * @child:
10395+ *
10396+ */
10397+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10398+{
10399+ struct inode *dir;
10400+ dir_plugin *dplug;
10401+
10402+ assert("nikita-3527", child != NULL);
10403+ /* see comment in reiser4_get_dentry() about following assertion */
10404+ assert("nikita-3528", is_in_reiser4_context());
10405+
10406+ dir = child->d_inode;
10407+ assert("nikita-3529", dir != NULL);
10408+ dplug = inode_dir_plugin(dir);
10409+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10410+ if (dplug != NULL)
10411+ return dplug->get_parent(dir);
10412+ else
10413+ return ERR_PTR(RETERR(-ENOTDIR));
10414+}
10415+
10416+/**
10417+ * reiser4_get_dentry - get_dentry of export operations
10418+ * @super:
10419+ * @data:
10420+ *
10421+ *
10422+ */
10423+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10424+{
10425+ reiser4_object_on_wire *o;
10426+
10427+ assert("nikita-3522", super != NULL);
10428+ assert("nikita-3523", data != NULL);
10429+ /*
10430+ * this is only supposed to be called by
10431+ *
10432+ * reiser4_decode_fh->find_exported_dentry
10433+ *
10434+ * so, reiser4_context should be here already.
10435+ */
10436+ assert("nikita-3526", is_in_reiser4_context());
10437+
10438+ o = (reiser4_object_on_wire *)data;
10439+ assert("nikita-3524", o->plugin != NULL);
10440+ assert("nikita-3525", o->plugin->wire.get != NULL);
10441+
10442+ return o->plugin->wire.get(super, o);
10443+}
10444+
10445+struct export_operations reiser4_export_operations = {
10446+ .encode_fh = reiser4_encode_fh,
10447+ .decode_fh = reiser4_decode_fh,
10448+ .get_parent = reiser4_get_dentry_parent,
10449+ .get_dentry = reiser4_get_dentry
10450+};
10451+
10452+/*
10453+ * Local variables:
10454+ * c-indentation-style: "K&R"
10455+ * mode-name: "LC"
10456+ * c-basic-offset: 8
10457+ * tab-width: 8
10458+ * fill-column: 79
10459+ * End:
10460+ */
10461diff -urN linux-2.6.22.orig/fs/reiser4/flush.c linux-2.6.22/fs/reiser4/flush.c
10462--- linux-2.6.22.orig/fs/reiser4/flush.c 1970-01-01 03:00:00.000000000 +0300
10463+++ linux-2.6.22/fs/reiser4/flush.c 2007-07-29 00:25:34.000000000 +0400
10464@@ -0,0 +1,3625 @@
10465+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10466+
10467+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10468+
10469+#include "forward.h"
10470+#include "debug.h"
10471+#include "dformat.h"
10472+#include "key.h"
10473+#include "coord.h"
10474+#include "plugin/item/item.h"
10475+#include "plugin/plugin.h"
10476+#include "plugin/object.h"
10477+#include "txnmgr.h"
10478+#include "jnode.h"
10479+#include "znode.h"
10480+#include "block_alloc.h"
10481+#include "tree_walk.h"
10482+#include "carry.h"
10483+#include "tree.h"
10484+#include "vfs_ops.h"
10485+#include "inode.h"
10486+#include "page_cache.h"
10487+#include "wander.h"
10488+#include "super.h"
10489+#include "entd.h"
10490+#include "reiser4.h"
10491+#include "flush.h"
10492+#include "writeout.h"
10493+
10494+#include <asm/atomic.h>
10495+#include <linux/fs.h> /* for struct super_block */
10496+#include <linux/mm.h> /* for struct page */
10497+#include <linux/bio.h> /* for struct bio */
10498+#include <linux/pagemap.h>
10499+#include <linux/blkdev.h>
10500+
10501+/* IMPLEMENTATION NOTES */
10502+
10503+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10504+ order to the nodes of the tree in which the parent is placed before its children, which
10505+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
10506+ describes the node that "came before in forward parent-first order". When we speak of a
10507+ "parent-first follower", it describes the node that "comes next in parent-first
10508+ order" (alternatively the node that "came before in reverse parent-first order").
10509+
10510+ The following pseudo-code prints the nodes of a tree in forward parent-first order:
10511+
10512+ void parent_first (node)
10513+ {
10514+ print_node (node);
10515+ if (node->level > leaf) {
10516+ for (i = 0; i < num_children; i += 1) {
10517+ parent_first (node->child[i]);
10518+ }
10519+ }
10520+ }
10521+*/
10522+
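For example, on a tree with root R, internal nodes A and B, leaves a1 and a2 under A and leaf b1 under B, this pseudo-code prints R, A, a1, a2, B, b1: each parent comes before all of its descendants, and siblings appear left to right.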
10523+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
10524+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10525+ can be accomplished with sequential reads, which results in reading nodes in their
10526+ parent-first order. This is a read-optimization aspect of the flush algorithm, and
10527+ there is also a write-optimization aspect, which is that we wish to make large
10528+ sequential writes to the disk by allocating or reallocating blocks so that they can be
10529+ written in sequence. Sometimes the read-optimization and write-optimization goals
10530+ conflict with each other, as we discuss in more detail below.
10531+*/
10532+
10533+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
10534+ the relevant jnode->state bits and their relevance to flush:
10535+
10536+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
10537+ must be allocated first. In order to be considered allocated, the jnode must have
10538+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
10539+ all dirtied jnodes eventually have one of these bits set during each transaction.
10540+
10541+ JNODE_CREATED: The node was freshly created in its transaction and has no previous
10542+ block address, so it is unconditionally assigned to be relocated, although this is
10543+ mainly for code-convenience. It is not being 'relocated' from anything, but in
10544+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
10545+ remains set even after JNODE_RELOC is set, so the actual relocate can be
10546+ distinguished from the created-and-allocated set easily: relocate-set members
10547+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10548+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10549+
10550+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10551+ decision to maintain the pre-existing location for this node and it will be written
10552+ to the wandered-log.
10553+
10554+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10555+ not created, see note above). A block with JNODE_RELOC set is eligible for
10556+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
10557+ bit is set on a znode, the parent node's internal item is modified and the znode is
10558+ rehashed.
10559+
10560+ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10561+ and calls the plugin->f.squeeze() method for its items. This is the mechanism by which disk
10562+ clusters of cryptcompress objects are updated. Also, if the leftmost point found by the flush scan
10563+ has this flag set (it races with write(); a rare case) the flush algorithm makes the decision
10564+ to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
10565+ repeated allocation.
10566+
10567+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10568+ flush queue. This means the jnode is not on any clean or dirty list, instead it is
10569+ moved to one of the flush queue (see flush_queue.h) object private list. This
10570+ prevents multiple concurrent flushes from attempting to start flushing from the
10571+ same node.
10572+
10573+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10574+ squeeze-and-allocate on a node while its children are actively being squeezed and
10575+ allocated. This flag was created to avoid submitting a write request for a node
10576+ while its children are still being allocated and squeezed. Then flush queue was
10577+ re-implemented to allow unlimited number of nodes be queued. This flag support was
10578+ commented out in source code because we decided that there was no reason to submit
10579+ queued nodes before jnode_flush() finishes. However, current code calls fq_write()
10580+ during a slum traversal and may submit "busy nodes" to disk. Probably we can
10581+ re-enable the JNODE_FLUSH_BUSY bit support in future.
10582+
10583+ With these state bits, we describe a test used frequently in the code below,
10584+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
10585+ test for "flushprepped" returns true if any of the following are true:
10586+
10587+ - The node is not dirty
10588+ - The node has JNODE_RELOC set
10589+ - The node has JNODE_OVRWR set
10590+
10591+ If either the node is not dirty or it has already been processed by flush (and assigned
10592+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
10593+ false then flush has work to do on that node.
10594+*/
10595+
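The "flushprepped" test described above amounts to the following predicate; a sketch only, assuming the JF_ISSET bit-test helper from the jnode header (the real jnode_is_flushprepped() also documents its locking requirements):

	/* sketch of the flushprepped predicate described above */
	static inline int flushprepped_sketch(jnode * node)
	{
		return !JF_ISSET(node, JNODE_DIRTY) ||
		       JF_ISSET(node, JNODE_RELOC) ||
		       JF_ISSET(node, JNODE_OVRWR);
	}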
10596+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10597+ flushprepped twice (unless an explicit call to flush_unprep is made as described in
10598+ detail below). For example a node is dirtied, allocated, and then early-flushed to
10599+ disk and set clean. Before the transaction commits, the page is dirtied again and, due
10600+ to memory pressure, the node is flushed again. The flush algorithm will not relocate
10601+ the node to a new disk location, it will simply write it to the same, previously
10602+ relocated position again.
10603+*/
10604+
10605+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10606+ start at a leaf node and allocate in parent-first order by iterating to the right. At
10607+ each step of the iteration, we check for the right neighbor. Before advancing to the
10608+ right neighbor, we check if the current position and the right neighbor share the same
10609+ parent. If they do not share the same parent, the parent is allocated before the right
10610+ neighbor.
10611+
10612+ This process goes recursively up the tree and squeezes nodes level by level as long as
10613+ the right neighbor and the current position have different parents, then it allocates
10614+ the right-neighbors-with-different-parents on the way back down. This process is
10615+ described in more detail in flush_squalloc_changed_ancestor and the recursive function
10616+ squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
10617+ specifics of the bottom-up approach as to contrast the bottom-up and top-down
10618+ approaches.
10619+
10620+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down
10621+ approach, we find a starting point by scanning left along each level past dirty nodes,
10622+ then going up and repeating the process until the left node and the parent node are
10623+ clean. We then perform a parent-first traversal from the starting point, which makes
10624+ allocating in parent-first order trivial. After one subtree has been allocated in this
10625+ manner, we move to the right, try moving upward, then repeat the parent-first
10626+ traversal.
10627+
10628+ Both approaches have problems that need to be addressed. Both are approximately the
10629+ same amount of code, but the bottom-up approach has advantages in the order it acquires
10630+ locks which, at the very least, make it the better approach. At first glance each one
10631+ makes the other one look simpler, so it is important to remember a few of the problems
10632+ with each one.
10633+
10634+ Main problem with the top-down approach: When you encounter a clean child during the
10635+ parent-first traversal, what do you do? You would like to avoid searching through a
10636+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10637+ obvious solution. One of the advantages of the top-down approach is that during the
10638+ parent-first traversal you check every child of a parent to see if it is dirty. In
10639+ this way, the top-down approach easily handles the main problem of the bottom-up
10640+ approach: unallocated children.
10641+
10642+ The unallocated children problem is that before writing a node to disk we must make
10643+ sure that all of its children are allocated. Otherwise, writing the node means
10644+ extra I/O because the node will have to be written again when the child is finally
10645+ allocated.
10646+
10647+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
10648+ should not cause any file system corruption, it only degrades I/O performance because a
10649+ node may be written when it is sure to be written at least one more time in the same
10650+ transaction when the remaining children are allocated. What follows is a description
10651+ of how we will solve the problem.
10652+*/
10653+
10654+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10655+ proceeding in parent first order, allocate some of its left-children, then encounter a
10656+ clean child in the middle of the parent. We do not allocate the clean child, but there
10657+ may remain unallocated (dirty) children to the right of the clean child. If we were to
10658+ stop flushing at this moment and write everything to disk, the parent might still
10659+ contain unallocated children.
10660+
10661+ We could try to allocate all the descendants of every node that we allocate, but this
10662+ is not necessary. Doing so could result in allocating the entire tree: if the root
10663+ node is allocated then every unallocated node would have to be allocated before
10664+ flushing. Actually, we do not have to write a node just because we allocate it. It is
10665+ possible to allocate but not write a node during flush, when it still has unallocated
10666+ children. However, this approach is probably not optimal for the following reason.
10667+
10668+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10669+ to optimize reads that occur in the same order. Thus we are read-optimizing for a
10670+ left-to-right scan through all the leaves in the system, and we are hoping to
10671+ write-optimize at the same time because those nodes will be written together in batch.
10672+ What happens, however, if we assign a block number to a node in its read-optimized
10673+ order but then avoid writing it because it has unallocated children? In that
10674+ situation, we lose out on the write-optimization aspect because a node will have to be
10675+ written again to its location on the device, later, which likely means seeking back
10676+ to that location.
10677+
10678+ So there are tradeoffs. We can choose either:
10679+
10680+ A. Allocate all unallocated children to preserve both write-optimization and
10681+ read-optimization, but this is not always desirable because it may mean having to
10682+ allocate and flush very many nodes at once.
10683+
10684+ B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10685+ but sacrifice write-optimization because those nodes will be written again.
10686+
10687+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10688+ locations. Instead, choose to write-optimize them later, when they are written. To
10689+ facilitate this, we "undo" the read-optimized allocation that was given to the node so
10690+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
10691+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
10692+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10693+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10694+ location, and set the JNODE_CREATED bit, effectively setting the node back to an
10695+ unallocated state.
10696+
10697+ We will take the following approach in v4.0: for twig nodes we will always finish
10698+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer
10699+ writing and choose write-optimization (C).
10700+
10701+ To summarize, there are several parts to a solution that avoids the problem with
10702+ unallocated children:
10703+
10704+ FIXME-ZAM: Still, no approach has been implemented to eliminate the "UNALLOCATED CHILDREN"
10705+ problem, because an experiment showed that we have 1-2 nodes
10706+ with unallocated children per thousands of written nodes. The experiment was simple,
10707+ like copying / deleting the linux kernel sources. However the problem can arise in more
10708+ complex tests. I think we have jnode_io_hook where a check for unallocated
10709+ children could be inserted, to see what kind of problem we have.
10710+
10711+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10712+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
10713+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10714+ comments in that function.
10715+
10716+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10717+ have unallocated children. If the twig level has unallocated children it is an
10718+ assertion failure. If a higher-level node has unallocated children, then it should be
10719+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
10720+ should be simple.
10721+
10722+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10723+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10724+ this somewhat in the case where large sub-trees are flushed. The following observation
10725+ helps: if both the left- and right-neighbor of a node are processed by the flush
10726+ algorithm then the node itself is guaranteed to have all of its children allocated.
10727+ However, the cost of this check may not be so expensive after all: it is not needed for
10728+ leaves and flush can guarantee this property for twigs. That leaves only (level >
10729+ TWIG) nodes that have to be checked, so this optimization only helps if at least three
10730+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10731+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
10732+ then the number of blocks being written will be very large, so the savings may be
10733+ insignificant. That said, the idea is to maintain both the left and right edges of
10734+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively
10735+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
10736+ edge, the slow check is necessary, but if it is in the interior then it can be assumed
10737+ to have all of its children allocated. FIXME: medium complexity to implement, but
10738+ simple to verify given that we must have a slow check anyway.
10739+
10740+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of
10741+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
10742+ left-scan operation to take unallocated children into account. Normally, the left-scan
10743+ operation goes left as long as adjacent nodes are dirty up until some large maximum
10744+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
10745+ may stop at a position where there are unallocated children to the left with the same
10746+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10747+ FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10748+ with a rapid scan. The rapid scan skips all the interior children of a node--if the
10749+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10750+ twig to the left). If the left neighbor of the leftmost child is also dirty, then
10751+ continue the scan at the left twig and repeat. This option will cause flush to
10752+ allocate more twigs in a single pass, but it also has the potential to write many more
10753+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
10754+ was partially implemented, code removed August 12, 2002 by JMACD.
10755+*/
10756+
10757+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
10758+ starting point for flush is a leaf node, but actually the flush code cares very little
10759+ about whether or not this is true. It is possible that all the leaf nodes are flushed
10760+ and dirty parent nodes still remain, in which case jnode_flush() is called on a
10761+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
10762+ leaf, even when it is not. This is a simple approach, and there may be a more optimal
10763+ policy but until a problem with this approach is discovered, simplest is probably best.
10764+
10765+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10766+ the leaves. This is done as a matter of simplicity and there is only one (shaky)
10767+ justification. When an atom commits, it flushes all leaf level nodes first, followed
10768+ by twigs, and so on. With flushing done in this order, if flush is eventually called
10769+ on a non-leaf node it means that (somehow) we reached a point where all leaves are
10770+ clean and only internal nodes need to be flushed. If that is the case, then it means
10771+ there were no leaves that were the parent-first preceder/follower of the parent. This
10772+ is expected to be a rare case, which is why we do nothing special about it. However,
10773+ memory pressure may pass an internal node to flush when there are still dirty leaf
10774+ nodes that need to be flushed, which could prove our original assumptions
10775+ "inoperative". If this needs to be fixed, then scan_left/right should have
10776+ special checks for the non-leaf levels. For example, instead of passing from a node to
10777+ the left neighbor, it should pass from the node to the left neighbor's rightmost
10778+ descendent (if dirty).
10779+
10780+*/
10781+
10782+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10783+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
10784+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10785+ device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
10786+ device becomes sorted such that tree order and block number order fully correlate.
10787+
10788+ Resizing is done by shifting everything either all the way to the left or all the way
10789+ to the right, and then reporting the last block.
10790+*/
10791+
10792+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
10793+ describes the policy from the highest level:
10794+
10795+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10796+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10797+ leaf nodes.
10798+
10799+ Otherwise, there are two contexts in which we make a decision to relocate:
10800+
10801+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10802+ During the initial stages of flush, after scan-right completes, we want to ask the
10803+ question: should we relocate this leaf node and thus dirty the parent node. Then if
10804+ the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
10805+ the question at the next level up, and so on. In these cases we are moving in the
10806+ reverse-parent first direction.
10807+
10808+ There is another case which is considered the reverse direction, which comes at the end
10809+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
10810+ reach a point where there is a clean twig to the right with a dirty leftmost child. In
10811+ this case, we may wish to relocate the child by testing if it should be relocated
10812+ relative to its parent.
10813+
10814+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10815+ allocate_znode. What distinguishes the forward parent-first case from the
10816+ reverse-parent first case is that the preceder has already been allocated in the
10817+ forward case, whereas in the reverse case we don't know what the preceder is until we
10818+ finish "going in reverse". That simplifies the forward case considerably, and there we
10819+ actually use the block allocator to determine whether, e.g., a block closer to the
10820+ preceder is available.
10821+*/
10822+
10823+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
10824+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10825+ squeeze the parent's left neighbor and the parent. This may change the
10826+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child
10827+ is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10828+ Note that we cannot allocate extents during this or they will be out of parent-first
10829+ order. There are also some difficult coordinate maintenance issues. We can't do a tree
10830+ search to find coordinates again (because we hold locks), we have to determine them
10831+ from the two nodes being squeezed. Looks difficult, but has potential to increase
10832+ space utilization. */
10833+
10834+/* Flush-scan helper functions. */
10835+static void scan_init(flush_scan * scan);
10836+static void scan_done(flush_scan * scan);
10837+
10838+/* Flush-scan algorithm. */
10839+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10840+ unsigned limit);
10841+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10842+static int scan_common(flush_scan * scan, flush_scan * other);
10843+static int scan_formatted(flush_scan * scan);
10844+static int scan_unformatted(flush_scan * scan, flush_scan * other);
10845+static int scan_by_coord(flush_scan * scan);
10846+
10847+/* Initial flush-point ancestor allocation. */
10848+static int alloc_pos_and_ancestors(flush_pos_t * pos);
10849+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10850+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10851+
10852+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
10853+static int squalloc(flush_pos_t * pos);
10854+
10855+/* Flush squeeze implementation. */
10856+static int squeeze_right_non_twig(znode * left, znode * right);
10857+static int shift_one_internal_unit(znode * left, znode * right);
10858+
10859+/* Flush reverse parent-first relocation routines. */
10860+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10861+ const reiser4_block_nr * nblk);
10862+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10863+ flush_pos_t * pos);
10864+static int reverse_relocate_check_dirty_parent(jnode * node,
10865+ const coord_t * parent_coord,
10866+ flush_pos_t * pos);
10867+
10868+/* Flush allocate write-queueing functions: */
10869+static int allocate_znode(znode * node, const coord_t * parent_coord,
10870+ flush_pos_t * pos);
10871+static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10872+ flush_pos_t * pos);
10873+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10874+
10875+/* Flush helper functions: */
10876+static int jnode_lock_parent_coord(jnode * node,
10877+ coord_t * coord,
10878+ lock_handle * parent_lh,
10879+ load_count * parent_zh,
10880+ znode_lock_mode mode, int try);
10881+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10882+ znode_lock_mode mode, int check_dirty, int expected);
10883+static int znode_same_parents(znode * a, znode * b);
10884+
10885+static int znode_check_flushprepped(znode * node)
10886+{
10887+ return jnode_check_flushprepped(ZJNODE(node));
10888+}
10889+
10890+/* Flush position functions */
10891+static void pos_init(flush_pos_t * pos);
10892+static int pos_valid(flush_pos_t * pos);
10893+static void pos_done(flush_pos_t * pos);
10894+static int pos_stop(flush_pos_t * pos);
10895+
10896+/* check that @org is first jnode extent unit, if extent is unallocated,
10897+ * because all jnodes of unallocated extent are dirty and of the same atom. */
10898+#define checkchild(scan) \
10899+assert("nikita-3435", \
10900+ ergo(scan->direction == LEFT_SIDE && \
10901+ (scan->parent_coord.node->level == TWIG_LEVEL) && \
10902+ jnode_is_unformatted(scan->node) && \
10903+ extent_is_unallocated(&scan->parent_coord), \
10904+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10905+
10906+/* This flush_cnt variable is used to track the number of concurrent flush operations,
10907+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
10908+ no static initializer function...) */
10909+ON_DEBUG(atomic_t flush_cnt;
10910+ )
10911+
10912+/* check fs backing device for write congestion */
10913+static int check_write_congestion(void)
10914+{
10915+ struct super_block *sb;
10916+ struct backing_dev_info *bdi;
10917+
10918+ sb = reiser4_get_current_sb();
10919+ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
10920+ return bdi_write_congested(bdi);
10921+}
10922+
10923+/* conditionally write flush queue */
10924+static int write_prepped_nodes(flush_pos_t * pos)
10925+{
10926+ int ret;
10927+
10928+ assert("zam-831", pos);
10929+ assert("zam-832", pos->fq);
10930+
10931+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
10932+ return 0;
10933+
10934+ if (check_write_congestion())
10935+ return 0;
10936+
10937+ ret = reiser4_write_fq(pos->fq, pos->nr_written,
10938+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
10939+ return ret;
10940+}
10941+
10942+/* Properly release all flush pos. resources, then move the flush position to a new
10943+ locked node */
10944+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
10945+ load_count * new_load, const coord_t * new_coord)
10946+{
10947+ assert("zam-857", new_lock->node == new_load->node);
10948+
10949+ if (new_coord) {
10950+ assert("zam-858", new_coord->node == new_lock->node);
10951+ coord_dup(&pos->coord, new_coord);
10952+ } else {
10953+ coord_init_first_unit(&pos->coord, new_lock->node);
10954+ }
10955+
10956+ if (pos->child) {
10957+ jput(pos->child);
10958+ pos->child = NULL;
10959+ }
10960+
10961+ move_load_count(&pos->load, new_load);
10962+ done_lh(&pos->lock);
10963+ move_lh(&pos->lock, new_lock);
10964+}
10965+
10966+/* delete an empty node to which a link from the parent still exists. */
10967+static int delete_empty_node(znode * node)
10968+{
10969+ reiser4_key smallest_removed;
10970+
10971+ assert("zam-1019", node != NULL);
10972+ assert("zam-1020", node_is_empty(node));
10973+ assert("zam-1023", znode_is_wlocked(node));
10974+
10975+ return reiser4_delete_node(node, &smallest_removed, NULL, 1);
10976+}
10977+
10978+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
10979+static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
10980+{
10981+ int ret;
10982+ load_count load;
10983+ lock_handle lock;
10984+
10985+ init_lh(&lock);
10986+ init_load_count(&load);
10987+
10988+ if (jnode_is_znode(org)) {
10989+ ret = longterm_lock_znode(&lock, JZNODE(org),
10990+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
10991+ if (ret)
10992+ return ret;
10993+
10994+ ret = incr_load_count_znode(&load, JZNODE(org));
10995+ if (ret)
10996+ return ret;
10997+
10998+ pos->state =
10999+ (jnode_get_level(org) ==
11000+ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11001+ move_flush_pos(pos, &lock, &load, NULL);
11002+ } else {
11003+ coord_t parent_coord;
11004+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11005+ &load, ZNODE_WRITE_LOCK, 0);
11006+ if (ret)
11007+ goto done;
11008+ if (!item_is_extent(&parent_coord)) {
11009+ /* file was converted to tail, org became HB, we found internal
11010+ item */
11011+ ret = -EAGAIN;
11012+ goto done;
11013+ }
11014+
11015+ pos->state = POS_ON_EPOINT;
11016+ move_flush_pos(pos, &lock, &load, &parent_coord);
11017+ pos->child = jref(org);
11018+ if (extent_is_unallocated(&parent_coord)
11019+ && extent_unit_index(&parent_coord) != index_jnode(org)) {
11020+ /* @org is not first child of its parent unit. This may happen
11021+ because the long term lock of its parent node was released between
11022+ scan_left and scan_right. For now work around this by having flush repeat */
11023+ ret = -EAGAIN;
11024+ }
11025+ }
11026+
11027+ done:
11028+ done_load_count(&load);
11029+ done_lh(&lock);
11030+ return ret;
11031+}
11032+
11033+/* TODO LIST (no particular order): */
11034+/* I have labelled most of the legitimate FIXME comments in this file with letters to
11035+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11036+ specific names mentioned instead that need to be inspected/resolved. */
11037+/* B. There is an issue described in reverse_relocate_test having to do with an
11038+ imprecise is_preceder? check involving partially-dirty extents. The code that
11039+ sets preceder hints and computes the preceder is basically untested. Careful testing
11040+ needs to be done that preceder calculations are done correctly, since if it doesn't
11041+ affect correctness we will not catch this stuff during regular testing. */
11042+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11043+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11044+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11045+ Many of the calls that may produce one of these return values (i.e.,
11046+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11047+ values themselves and, for instance, stop flushing instead of resulting in a restart.
11048+ If any of these results are true error conditions then flush will go into a busy-loop,
11049+ as we noticed during testing when a corrupt tree caused find_child_ptr to return
11050+ ENOENT. It needs careful thought and testing of corner conditions.
11051+*/
11052+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11053+ block is assigned a block number then early-flushed to disk. It is dirtied again and
11054+ flush is called again. Concurrently, that block is deleted, and the de-allocation of
11055+ its block number does not need to be deferred, since it is not part of the preserve set
11056+ (i.e., it didn't exist before the transaction). I think there may be a race condition
11057+ where flush writes the dirty, created block after the non-deferred deallocated block
11058+ number is re-allocated, making it possible to write deleted data on top of non-deleted
11059+ data. It's just a theory, but it needs to be thought out. */
11060+/* F. bio_alloc() failure is not handled gracefully. */
11061+/* G. Unallocated children. */
11062+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11063+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11064+
11065+/* JNODE_FLUSH: MAIN ENTRY POINT */
11066+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11067+ neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty
11068+ blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
11069+ a part of transaction commit.
11070+
11071+ Our objective here is to prep and flush the slum the jnode belongs to. We want to
11072+ squish the slum together, and allocate the nodes in it as we squish because allocation
11073+ of children affects squishing of parents.
11074+
11075+ The "argument" @node tells flush where to start. From there, flush finds the left edge
11076+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11077+ "better place" to start squalloc first we perform a flush_scan.
11078+
11079+ Flush-scanning may be performed in both left and right directions, but for different
11080+ purposes. When scanning to the left, we are searching for a node that precedes a
11081+ sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11082+ During flush-scanning, we also take the opportunity to count the number of consecutive
11083+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11084+ make a decision to reallocate leaf nodes (thus favoring write-optimization).
11085+
11086+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11087+ also be dirty nodes to the right of the argument. If the scan-left operation does not
11088+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11089+ operation to see whether there are, in fact, enough nodes to meet the relocate
11090+ threshold. Each right- and left-scan operation uses a single flush_scan object.
11091+
11092+ After left-scan and possibly right-scan, we prepare a flush_position object with the
11093+ starting flush point or parent coordinate, which was determined using scan-left.
11094+
11095+ Next we call the main flush routine, squalloc, which iterates along the
11096+ leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11097+
11098+ After squalloc returns we take extra steps to ensure that all the children
11099+ of the final twig node are allocated--this involves repeating squalloc
11100+ until we finish at a twig with no unallocated children.
11101+
11102+ Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11103+ any above-twig nodes during flush_empty_queue that still have unallocated children, we
11104+ flush_unprep them.
11105+
11106+ Flush treats several "failure" cases as non-failures, essentially causing them to start
11107+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11108+ probably be handled properly rather than restarting, but there are a bunch of cases to
11109+ audit.
11110+*/
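/* A minimal user-space sketch of the scan-and-count decision described above.
   Everything here (the toy_node list, RELOCATE_THRESHOLD_T) is invented for
   illustration only; the real code walks jnodes with flush_scan objects and
   reads the threshold from sbinfo->flush.relocate_threshold. */
#include <stdio.h>

#define RELOCATE_THRESHOLD_T 4	/* stand-in for FLUSH_RELOCATE_THRESHOLD */

struct toy_node {
	int dirty;
	struct toy_node *left, *right;
};

/* walk left from @start, counting dirty nodes; return the left edge of the
 * "slum" */
static struct toy_node *toy_scan_left(struct toy_node *start, int *count)
{
	struct toy_node *pos = start;

	*count = 1;
	while (pos->left != NULL && pos->left->dirty) {
		pos = pos->left;
		(*count)++;
	}
	return pos;
}

/* walk right from @start, counting at most @todo more dirty nodes */
static int toy_scan_right(struct toy_node *start, int todo)
{
	struct toy_node *pos = start;
	int count = 0;

	while (todo-- > 0 && pos->right != NULL && pos->right->dirty) {
		pos = pos->right;
		count++;
	}
	return count;
}

int main(void)
{
	struct toy_node n[6] = { {0}, {1}, {1}, {1}, {1}, {0} };
	struct toy_node *leftmost;
	int i, left_count, right_count = 0, todo;

	for (i = 0; i < 6; i++) {
		n[i].left = (i > 0) ? &n[i - 1] : NULL;
		n[i].right = (i < 5) ? &n[i + 1] : NULL;
	}

	/* the flush argument lands somewhere inside the dirty run */
	leftmost = toy_scan_left(&n[2], &left_count);
	todo = RELOCATE_THRESHOLD_T - left_count;
	if (todo > 0)
		right_count = toy_scan_right(&n[2], todo);

	printf("slum left edge at node %d, %d nodes counted, relocate=%d\n",
	       (int)(leftmost - n), left_count + right_count,
	       left_count + right_count >= RELOCATE_THRESHOLD_T);
	return 0;
}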
11111+
11112+static int
11113+jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11114+ flush_queue_t * fq, int flags)
11115+{
11116+ long ret = 0;
11117+ flush_scan *right_scan;
11118+ flush_scan *left_scan;
11119+ flush_pos_t *flush_pos;
11120+ int todo;
11121+ struct super_block *sb;
11122+ reiser4_super_info_data *sbinfo;
11123+ jnode *leftmost_in_slum = NULL;
11124+
11125+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11126+ assert("nikita-3022", reiser4_schedulable());
11127+
11128+ assert("nikita-3185",
11129+ get_current_super_private()->delete_mutex_owner != current);
11130+
11131+ /* allocate right_scan, left_scan and flush_pos */
11132+ right_scan =
11133+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11134+ reiser4_ctx_gfp_mask_get());
11135+ if (right_scan == NULL)
11136+ return RETERR(-ENOMEM);
11137+ left_scan = right_scan + 1;
11138+ flush_pos = (flush_pos_t *) (left_scan + 1);
11139+
11140+ sb = reiser4_get_current_sb();
11141+ sbinfo = get_super_private(sb);
11142+
11143+ /* Flush-concurrency debug code */
11144+#if REISER4_DEBUG
11145+ atomic_inc(&flush_cnt);
11146+#endif
11147+
11148+ reiser4_enter_flush(sb);
11149+
11150+ /* Initialize a flush position. */
11151+ pos_init(flush_pos);
11152+
11153+ flush_pos->nr_written = nr_written;
11154+ flush_pos->fq = fq;
11155+ flush_pos->flags = flags;
11156+ flush_pos->nr_to_write = nr_to_write;
11157+
11158+ scan_init(right_scan);
11159+ scan_init(left_scan);
11160+
11161+ /* First scan left and remember the leftmost scan position. If the leftmost
11162+ position is unformatted we remember its parent_coord. We scan until we have
11163+ counted FLUSH_SCAN_MAXNODES nodes.
11164+
11165+ If starting @node is unformatted, at the beginning of left scan its
11166+ parent (twig level node, containing extent item) will be long term
11167+ locked and lock handle will be stored in the
11168+ @right_scan->parent_lock. This lock is used to start the rightward
11169+ scan without redoing the tree traversal (necessary to find parent)
11170+ and, hence, is kept during leftward scan. As a result, we have to
11171+ use try-lock when taking long term locks during the leftward scan.
11172+ */
11173+ ret = scan_left(left_scan, right_scan,
11174+ node, sbinfo->flush.scan_maxnodes);
11175+ if (ret != 0)
11176+ goto failed;
11177+
11178+ leftmost_in_slum = jref(left_scan->node);
11179+ scan_done(left_scan);
11180+
11181+ /* Then possibly go right to decide if we will use a policy of relocating leaves.
11182+ This is only done if we did not scan past (and count) enough nodes during the
11183+ leftward scan. If we do scan right, we only care to go far enough to establish
11184+ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11185+ scan limit is the difference between left_scan.count and the threshold. */
11186+
11187+ todo = sbinfo->flush.relocate_threshold - left_scan->count;
11188+ /* scan right is inherently deadlock prone, because we are
11189+ * (potentially) holding a lock on the twig node at this moment.
11190+ * FIXME: this comment is incorrect: the lock is not held */
11191+ if (todo > 0) {
11192+ ret = scan_right(right_scan, node, (unsigned)todo);
11193+ if (ret != 0)
11194+ goto failed;
11195+ }
11196+
11197+ /* Only the right-scan count is needed; release any rightward locks right away. */
11198+ scan_done(right_scan);
11199+
11200+ /* ... and the answer is: we should relocate leaf nodes if at least
11201+ FLUSH_RELOCATE_THRESHOLD nodes were found. */
11202+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11203+ (left_scan->count + right_scan->count >=
11204+ sbinfo->flush.relocate_threshold);
11205+
11206+ /* Funny business here. We set the 'point' in the flush_position prior to
11207+ starting squalloc regardless of whether the first point is
11208+ formatted or unformatted. Without this there would be an invariant, in the
11209+ rest of the code, that if the flush_position is unformatted then
11210+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11211+ and if the flush_position is formatted then flush_position->point is non-NULL
11212+ and no parent info is set.
11213+
11214+ This seems lazy, but it makes the initial calls to reverse_relocate_test
11215+ (which ask "is pos->point the leftmost child of its parent?") much easier
11216+ because we know the first child already. Nothing is broken by this, but the
11217+ reasoning is subtle. Holding an extra reference on a jnode during flush can
11218+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11219+ removed from sibling lists until they have zero reference count. Flush would
11220+ never observe a HEARD_BANSHEE node on the left-edge of flush; nodes are only
11221+ deleted to the right. So if nothing is broken, why fix it?
11222+
11223+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11224+ point and in any moment, because of the concurrent file system
11225+ activity (for example, truncate). */
11226+
11227+ /* Check jnode state after flush_scan completed. Having a lock on this
11228+ node or its parent (in case of unformatted) helps us in case of
11229+ concurrent flushing. */
11230+ if (jnode_check_flushprepped(leftmost_in_slum)
11231+ && !jnode_convertible(leftmost_in_slum)) {
11232+ ret = 0;
11233+ goto failed;
11234+ }
11235+
11236+ /* Now setup flush_pos using scan_left's endpoint. */
11237+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11238+ if (ret)
11239+ goto failed;
11240+
11241+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11242+ && node_is_empty(flush_pos->coord.node)) {
11243+ znode *empty = flush_pos->coord.node;
11244+
11245+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11246+ ret = delete_empty_node(empty);
11247+ goto failed;
11248+ }
11249+
11250+ if (jnode_check_flushprepped(leftmost_in_slum)
11251+ && !jnode_convertible(leftmost_in_slum)) {
11252+ ret = 0;
11253+ goto failed;
11254+ }
11255+
11256+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11257+ ret = alloc_pos_and_ancestors(flush_pos);
11258+ if (ret)
11259+ goto failed;
11260+
11261+ /* Do the main rightward-bottom-up squeeze and allocate loop. */
11262+ ret = squalloc(flush_pos);
11263+ pos_stop(flush_pos);
11264+ if (ret)
11265+ goto failed;
11266+
11267+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11268+ First, the pos_stop() and pos_valid() routines should be modified
11269+ so that pos_stop() sets a flush_position->stop flag to 1 without
11270+ releasing the current position immediately--instead release it in
11271+ pos_done(). This is a better implementation than the current one anyway.
11272+
11273+ It is not clear whether all fields of the flush_position should be released,
11274+ but at the very least the parent_lock, parent_coord, and parent_load should
11275+ remain held because they hold the last twig when pos_stop() is
11276+ called.
11277+
11278+ When we reach this point in the code, if the parent_coord is set to after the
11279+ last item then we know that flush reached the end of a twig (and according to
11280+ the new flush queueing design, we will return now). If parent_coord is not
11281+ past the last item, we should check if the current twig has any unallocated
11282+ children to the right (we are not concerned with unallocated children to the
11283+ left--in that case the twig itself should not have been allocated). If the
11284+ twig has unallocated children to the right, set the parent_coord to that
11285+ position and then repeat the call to squalloc.
11286+
11287+ Testing for unallocated children may be defined in two ways: if any internal
11288+ item has a fake block number, it is unallocated; if any extent item is
11289+ unallocated then all of its children are unallocated. But there is a more
11290+ aggressive approach: if there are any dirty children of the twig to the right
11291+ of the current position, we may wish to relocate those nodes now. Checking for
11292+ potential relocation is more expensive as it requires knowing whether there are
11293+ any dirty children that are not unallocated. The extent_needs_allocation
11294+ should be used after setting the correct preceder.
11295+
11296+ When we reach the end of a twig at this point in the code, if the flush can
11297+ continue (when the queue is ready) it will need some information on the future
11298+ starting point. That should be stored away in the flush_handle using a seal, I
11299+ believe. Holding a jref() on the future starting point may break other code
11300+ that deletes that node.
11301+ */
11302+
11303+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11304+ above the twig level. If the VM calls flush above the twig level, do nothing
11305+ and return (but figure out why this happens). The txnmgr should be modified to
11306+ only flush its leaf-level dirty list. This will do all the necessary squeeze
11307+ and allocate steps but leave unallocated branches and possibly unallocated
11308+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11309+ level, the remaining unallocated nodes should be given write-optimized
11310+ locations. (Possibly, the remaining unallocated twigs should be allocated just
11311+ before their leftmost child.)
11312+ */
11313+
11314+ /* Any failure reaches this point. */
11315+ failed:
11316+
11317+ switch (ret) {
11318+ case -E_REPEAT:
11319+ case -EINVAL:
11320+ case -E_DEADLOCK:
11321+ case -E_NO_NEIGHBOR:
11322+ case -ENOENT:
11323+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11324+ in each case. They already are handled in many cases. */
11325+ /* Something bad happened, but difficult to avoid... Try again! */
11326+ ret = 0;
11327+ }
11328+
11329+ if (leftmost_in_slum)
11330+ jput(leftmost_in_slum);
11331+
11332+ pos_done(flush_pos);
11333+ scan_done(left_scan);
11334+ scan_done(right_scan);
11335+ kfree(right_scan);
11336+
11337+ ON_DEBUG(atomic_dec(&flush_cnt));
11338+
11339+ reiser4_leave_flush(sb);
11340+
11341+ return ret;
11342+}
11343+
11344+/* The reiser4 flush subsystem can be switched into "rapid flush mode", which
11345+ * means that the flusher should submit all prepped nodes immediately, without
11346+ * keeping them in flush queues for a long time. The reason for rapid flush
11347+ * mode is to free memory as fast as possible. */
11348+
11349+#if REISER4_USE_RAPID_FLUSH
11350+
11351+/**
11352+ * submit all prepped nodes if rapid flush mode is set,
11353+ * turn rapid flush mode off.
11354+ */
11355+
11356+static int rapid_flush(flush_pos_t * pos)
11357+{
11358+ if (!wbq_available())
11359+ return 0;
11360+
11361+ return write_prepped_nodes(pos);
11362+}
11363+
11364+#else
11365+
11366+#define rapid_flush(pos) (0)
11367+
11368+#endif /* REISER4_USE_RAPID_FLUSH */
11369+
11370+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11371+ flush_queue_t *fq, int *nr_queued,
11372+ int flags)
11373+{
11374+ jnode * node;
11375+
11376+ if (start != NULL) {
11377+ spin_lock_jnode(start);
11378+ if (!jnode_is_flushprepped(start)) {
11379+ assert("zam-1056", start->atom == atom);
11380+ node = start;
11381+ goto enter;
11382+ }
11383+ spin_unlock_jnode(start);
11384+ }
11385+ /*
11386+ * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11387+ * and then dirtied again. The atom spin lock is not released until all dirty
11388+ * nodes are processed or a not-yet-prepped node is found in the atom's dirty lists.
11389+ */
11390+ while ((node = find_first_dirty_jnode(atom, flags))) {
11391+ spin_lock_jnode(node);
11392+ enter:
11393+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11394+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11395+
11396+ if (JF_ISSET(node, JNODE_WRITEBACK)) {
11397+ /* move node to the end of atom's writeback list */
11398+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11399+
11400+ /*
11401+ * jnode is not necessarily on the dirty list: if it was dirtied when
11402+ * it was on the flush queue, it does not get moved to the dirty list
11403+ */
11404+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11405+ WB_LIST, 1));
11406+
11407+ } else if (jnode_is_znode(node)
11408+ && znode_above_root(JZNODE(node))) {
11409+ /*
11410+ * A special case for znode-above-root. The above-root (fake)
11411+ * znode is captured and dirtied when the tree height changes or
11412+ * when the root node is relocated. This causes atoms to fuse so
11413+ * that changes at the root are serialized. However, this node is
11414+ * never flushed. This special case used to be in lock.c to
11415+ * prevent the above-root node from ever being captured, but now
11416+ * that it is captured we simply prevent it from flushing. The
11417+ * log-writer code relies on this to properly log superblock
11418+ * modifications of the tree height.
11419+ */
11420+ jnode_make_wander_nolock(node);
11421+ } else if (JF_ISSET(node, JNODE_RELOC)) {
11422+ queue_jnode(fq, node);
11423+ ++(*nr_queued);
11424+ } else
11425+ break;
11426+
11427+ spin_unlock_jnode(node);
11428+ }
11429+ return node;
11430+}
11431+
11432+/* Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if there are
11433+ * more nodes to flush; return 0 if the atom's dirty lists are empty, keeping the current
11434+ * atom locked; return other errors as they are. */
11435+int
11436+flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11437+ txn_atom ** atom, jnode *start)
11438+{
11439+ reiser4_super_info_data *sinfo = get_current_super_private();
11440+ flush_queue_t *fq = NULL;
11441+ jnode *node;
11442+ int nr_queued;
11443+ int ret;
11444+
11445+ assert("zam-889", atom != NULL && *atom != NULL);
11446+ assert_spin_locked(&((*atom)->alock));
11447+ assert("zam-892", get_current_context()->trans->atom == *atom);
11448+
11449+ nr_to_write = LONG_MAX;
11450+ while (1) {
11451+ ret = reiser4_fq_by_atom(*atom, &fq);
11452+ if (ret != -E_REPEAT)
11453+ break;
11454+ *atom = get_current_atom_locked();
11455+ }
11456+ if (ret)
11457+ return ret;
11458+
11459+ assert_spin_locked(&((*atom)->alock));
11460+
11461+ /* parallel flushers limit */
11462+ if (sinfo->tmgr.atom_max_flushers != 0) {
11463+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11464+ /* A reiser4_atom_send_event() call is inside
11465+ reiser4_fq_put_nolock() which is called when flush is
11466+ finished and nr_flushers is decremented. */
11467+ reiser4_atom_wait_event(*atom);
11468+ *atom = get_current_atom_locked();
11469+ }
11470+ }
11471+
11472+ /* count ourself as a flusher */
11473+ (*atom)->nr_flushers++;
11474+
11475+ writeout_mode_enable();
11476+
11477+ nr_queued = 0;
11478+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11479+
11480+ if (node == NULL) {
11481+ if (nr_queued == 0) {
11482+ (*atom)->nr_flushers--;
11483+ reiser4_fq_put_nolock(fq);
11484+ reiser4_atom_send_event(*atom);
11485+ /* current atom remains locked */
11486+ writeout_mode_disable();
11487+ return 0;
11488+ }
11489+ spin_unlock_atom(*atom);
11490+ } else {
11491+ jref(node);
11492+ BUG_ON((*atom)->super != node->tree->super);
11493+ spin_unlock_atom(*atom);
11494+ spin_unlock_jnode(node);
11495+ BUG_ON(nr_to_write == 0);
11496+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11497+ jput(node);
11498+ }
11499+
11500+ ret =
11501+ reiser4_write_fq(fq, nr_submitted,
11502+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11503+
11504+ *atom = get_current_atom_locked();
11505+ (*atom)->nr_flushers--;
11506+ reiser4_fq_put_nolock(fq);
11507+ reiser4_atom_send_event(*atom);
11508+ spin_unlock_atom(*atom);
11509+
11510+ writeout_mode_disable();
11511+
11512+ if (ret == 0)
11513+ ret = -E_REPEAT;
11514+
11515+ return ret;
11516+}
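/* The -E_REPEAT result above means "the atom may still have dirty nodes";
   callers are expected to loop. A schematic of that contract (standalone
   sketch; flush_pass() below is a stub standing in for flush_current_atom(),
   whose real callers live in the transaction manager): */
#include <stdio.h>

#define E_REPEAT_T 1024		/* stand-in for reiser4's -E_REPEAT code */

/* pretend three passes are needed before the dirty lists drain */
static int flush_pass(void)
{
	static int passes = 3;
	return (--passes > 0) ? -E_REPEAT_T : 0;
}

int main(void)
{
	int ret, loops = 0;

	do {
		ret = flush_pass();
		loops++;
	} while (ret == -E_REPEAT_T);

	printf("drained in %d passes, final ret=%d\n", loops, ret);
	return 0;
}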
11517+
11518+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11519+
11520+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11521+ reverse parent-first relocate context. Here all we know is the preceder and the block
11522+ number. Since we are going in reverse, the preceder may still be relocated as well, so
11523+ we can't ask the block allocator "is there a closer block available to relocate?" here.
11524+ In the _forward_ parent-first relocate context (not here) we actually call the block
11525+ allocator to try and find a closer location. */
11526+static int
11527+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11528+ const reiser4_block_nr * nblk)
11529+{
11530+ reiser4_block_nr dist;
11531+
11532+ assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11533+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11534+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11535+
11536+ /* Distance is the absolute value. */
11537+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11538+
11539+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
11540+ block, do not relocate. */
11541+ if (dist <= get_current_super_private()->flush.relocate_distance) {
11542+ return 0;
11543+ }
11544+
11545+ return 1;
11546+}
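/* The closeness test above is a plain absolute-distance window check. A
   standalone illustration (the value 64 is assumed here as a stand-in for
   the configured flush.relocate_distance, which really comes from the
   super block info at mount time): */
#include <stdio.h>

typedef unsigned long long toy_blocknr;

/* returns 1 when the node is far enough from its preceder to be relocated */
static int toy_relocate_test(toy_blocknr pblk, toy_blocknr nblk,
			     toy_blocknr relocate_distance)
{
	toy_blocknr dist = (pblk > nblk) ? (pblk - nblk) : (nblk - pblk);

	return dist > relocate_distance;
}

int main(void)
{
	printf("%d\n", toy_relocate_test(1000, 1040, 64));	/* 0: stay put */
	printf("%d\n", toy_relocate_test(1000, 2000, 64));	/* 1: relocate */
	return 0;
}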
11547+
11548+/* This function is a predicate that tests for relocation. Always called in the
11549+ reverse-parent-first context, when we are asking whether the current node should be
11550+ relocated in order to expand the flush by dirtying the parent level (and thus
11551+ proceeding to flush that level). When traversing in the forward parent-first direction
11552+ (not here), relocation decisions are handled in two places: allocate_znode() and
11553+ extent_needs_allocation(). */
11554+static int
11555+reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11556+ flush_pos_t * pos)
11557+{
11558+ reiser4_block_nr pblk = 0;
11559+ reiser4_block_nr nblk = 0;
11560+
11561+ assert("jmacd-8989", !jnode_is_root(node));
11562+
11563+ /*
11564+ * This function is called only from the
11565+ * reverse_relocate_check_dirty_parent() and only if the parent
11566+ * node is clean. This implies that the parent has the real (i.e., not
11567+ * fake) block number, and so does the child, because otherwise the
11568+ * parent would be dirty.
11569+ */
11570+
11571+ /* New nodes are treated as if they are being relocated. */
11572+ if (JF_ISSET (node, JNODE_CREATED) ||
11573+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11574+ return 1;
11575+ }
11576+
11577+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously
11578+ existing node, the coord may be leftmost even though the child is not the
11579+ parent-first preceder of the parent. If the first dirty node appears somewhere
11580+ in the middle of the first extent unit, this preceder calculation is wrong.
11581+ Needs more logic in here. */
11582+ if (coord_is_leftmost_unit(parent_coord)) {
11583+ pblk = *znode_get_block(parent_coord->node);
11584+ } else {
11585+ pblk = pos->preceder.blk;
11586+ }
11587+ check_preceder(pblk);
11588+
11589+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11590+ if (pblk == 0) {
11591+ return 1;
11592+ }
11593+
11594+ nblk = *jnode_get_block(node);
11595+
11596+ if (reiser4_blocknr_is_fake(&nblk))
11597+ /* child is unallocated, mark parent dirty */
11598+ return 1;
11599+
11600+ return reverse_relocate_if_close_enough(&pblk, &nblk);
11601+}
11602+
11603+/* This function calls reverse_relocate_test to make a reverse-parent-first
11604+ relocation decision and then, if yes, it marks the parent dirty. */
11605+static int
11606+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11607+ flush_pos_t * pos)
11608+{
11609+ int ret;
11610+
11611+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11612+
11613+ ret = reverse_relocate_test(node, parent_coord, pos);
11614+ if (ret < 0) {
11615+ return ret;
11616+ }
11617+
11618+ /* FIXME-ZAM
11619+ if parent is already relocated - we do not want to grab space, right? */
11620+ if (ret == 1) {
11621+ int grabbed;
11622+
11623+ grabbed = get_current_context()->grabbed_blocks;
11624+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11625+ 0)
11626+ reiser4_panic("umka-1250",
11627+ "No space left during flush.");
11628+
11629+ assert("jmacd-18923",
11630+ znode_is_write_locked(parent_coord->node));
11631+ znode_make_dirty(parent_coord->node);
11632+ grabbed2free_mark(grabbed);
11633+ }
11634+ }
11635+
11636+ return 0;
11637+}
11638+
11639+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11640+ PARENT-FIRST LOOP BEGINS) */
11641+
11642+/* Get the leftmost child for given coord. */
11643+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11644+{
11645+ int ret;
11646+
11647+ ret = item_utmost_child(coord, LEFT_SIDE, child);
11648+
11649+ if (ret)
11650+ return ret;
11651+
11652+ if (IS_ERR(*child))
11653+ return PTR_ERR(*child);
11654+
11655+ return 0;
11656+}
11657+
11658+/* This step occurs after the left- and right-scans are completed, before starting the
11659+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting
11660+ flush point, which means continuing in the reverse parent-first direction to the
11661+ parent, grandparent, and so on (as long as the child is a leftmost child). This
11662+ routine calls a recursive process, alloc_one_ancestor, which does the real work,
11663+ except there is special-case handling here for the first ancestor, which may be a twig.
11664+ At each level (here and alloc_one_ancestor), we check for relocation and then, if
11665+ the child is a leftmost child, repeat at the next level. On the way back down (the
11666+ recursion), we allocate the ancestors in parent-first order. */
11667+static int alloc_pos_and_ancestors(flush_pos_t * pos)
11668+{
11669+ int ret = 0;
11670+ lock_handle plock;
11671+ load_count pload;
11672+ coord_t pcoord;
11673+
11674+ if (znode_check_flushprepped(pos->lock.node))
11675+ return 0;
11676+
11677+ coord_init_invalid(&pcoord, NULL);
11678+ init_lh(&plock);
11679+ init_load_count(&pload);
11680+
11681+ if (pos->state == POS_ON_EPOINT) {
11682+ /* a special case for pos on twig level, where we already have
11683+ a lock on parent node. */
11684+ /* The parent may not be dirty, in which case we should decide
11685+ whether to relocate the child now. If decision is made to
11686+ relocate the child, the parent is marked dirty. */
11687+ ret =
11688+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11689+ pos);
11690+ if (ret)
11691+ goto exit;
11692+
11693+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11694+ is leftmost) and the leaf/child, so recursion is not needed.
11695+ Levels above the twig will be allocated for
11696+ write-optimization before the transaction commits. */
11697+
11698+ /* Do the recursive step, allocating zero or more of our
11699+ * ancestors. */
11700+ ret = alloc_one_ancestor(&pos->coord, pos);
11701+
11702+ } else {
11703+ if (!znode_is_root(pos->lock.node)) {
11704+ /* all formatted nodes except tree root */
11705+ ret =
11706+ reiser4_get_parent(&plock, pos->lock.node,
11707+ ZNODE_WRITE_LOCK);
11708+ if (ret)
11709+ goto exit;
11710+
11711+ ret = incr_load_count_znode(&pload, plock.node);
11712+ if (ret)
11713+ goto exit;
11714+
11715+ ret =
11716+ find_child_ptr(plock.node, pos->lock.node, &pcoord);
11717+ if (ret)
11718+ goto exit;
11719+
11720+ ret =
11721+ reverse_relocate_check_dirty_parent(ZJNODE
11722+ (pos->lock.
11723+ node), &pcoord,
11724+ pos);
11725+ if (ret)
11726+ goto exit;
11727+
11728+ ret = alloc_one_ancestor(&pcoord, pos);
11729+ if (ret)
11730+ goto exit;
11731+ }
11732+
11733+ ret = allocate_znode(pos->lock.node, &pcoord, pos);
11734+ }
11735+ exit:
11736+ done_load_count(&pload);
11737+ done_lh(&plock);
11738+ return ret;
11739+}
11740+
11741+/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
11742+ call to set_preceder, which is the next function described, this checks if the
11743+ child is a leftmost child and returns if it is not. If the child is a leftmost child
11744+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive
11745+ step. */
11746+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11747+{
11748+ int ret = 0;
11749+ lock_handle alock;
11750+ load_count aload;
11751+ coord_t acoord;
11752+
11753+ /* As we ascend at the left-edge of the region to flush, take this opportunity at
11754+ the twig level to find our parent-first preceder unless we have already set
11755+ it. */
11756+ if (pos->preceder.blk == 0) {
11757+ ret = set_preceder(coord, pos);
11758+ if (ret != 0)
11759+ return ret;
11760+ }
11761+
11762+ /* If the ancestor is clean or already allocated, or if the child is not a
11763+ leftmost child, stop going up, even leaving coord->node not flushprepped. */
11764+ if (znode_check_flushprepped(coord->node)
11765+ || !coord_is_leftmost_unit(coord))
11766+ return 0;
11767+
11768+ init_lh(&alock);
11769+ init_load_count(&aload);
11770+ coord_init_invalid(&acoord, NULL);
11771+
11772+ /* Only ascend to the next level if it is a leftmost child, but write-lock the
11773+ parent in case we will relocate the child. */
11774+ if (!znode_is_root(coord->node)) {
11775+
11776+ ret =
11777+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11778+ &alock, &aload, ZNODE_WRITE_LOCK,
11779+ 0);
11780+ if (ret != 0) {
11781+ /* FIXME(C): check EINVAL, E_DEADLOCK */
11782+ goto exit;
11783+ }
11784+
11785+ ret =
11786+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11787+ &acoord, pos);
11788+ if (ret != 0) {
11789+ goto exit;
11790+ }
11791+
11792+ /* Recursive call. */
11793+ if (!znode_check_flushprepped(acoord.node)) {
11794+ ret = alloc_one_ancestor(&acoord, pos);
11795+ if (ret)
11796+ goto exit;
11797+ }
11798+ }
11799+
11800+ /* Note: we call allocate with the parent write-locked (except at the root) in
11801+ case we relocate the child, in which case it will modify the parent during this
11802+ call. */
11803+ ret = allocate_znode(coord->node, &acoord, pos);
11804+
11805+ exit:
11806+ done_load_count(&aload);
11807+ done_lh(&alock);
11808+ return ret;
11809+}
11810+
11811+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11812+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
11813+ should this node be relocated (in reverse parent-first context)? We repeat this
11814+ process as long as the child is the leftmost child, eventually reaching an ancestor of
11815+ the flush point that is not a leftmost child. The preceder of that ancestor, which is
11816+ not a leftmost child, is actually on the leaf level. The preceder of that block is the
11817+ left-neighbor of the flush point. The preceder of that block is the rightmost child of
11818+ the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig
11819+ level, it stops momentarily to remember the block of the rightmost child of the twig on
11820+ the left and sets it to the flush_position's preceder_hint.
11821+
11822+ There is one other place where we may set the flush_position's preceder hint, which is
11823+ during scan-left.
11824+*/
11825+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11826+{
11827+ int ret;
11828+ coord_t coord;
11829+ lock_handle left_lock;
11830+ load_count left_load;
11831+
11832+ coord_dup(&coord, coord_in);
11833+
11834+ init_lh(&left_lock);
11835+ init_load_count(&left_load);
11836+
11837+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11838+ coord_is_leftmost_unit is not the right test if the unformatted child is in the
11839+ middle of the first extent unit. */
11840+ if (!coord_is_leftmost_unit(&coord)) {
11841+ coord_prev_unit(&coord);
11842+ } else {
11843+ ret =
11844+ reiser4_get_left_neighbor(&left_lock, coord.node,
11845+ ZNODE_READ_LOCK, GN_SAME_ATOM);
11846+ if (ret) {
11847+ /* If we fail for any reason it doesn't matter because the
11848+ preceder is only a hint. We are low-priority at this point, so
11849+ this must be the case. */
11850+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11851+ ret == -ENOENT || ret == -EINVAL
11852+ || ret == -E_DEADLOCK) {
11853+ ret = 0;
11854+ }
11855+ goto exit;
11856+ }
11857+
11858+ ret = incr_load_count_znode(&left_load, left_lock.node);
11859+ if (ret)
11860+ goto exit;
11861+
11862+ coord_init_last_unit(&coord, left_lock.node);
11863+ }
11864+
11865+ ret =
11866+ item_utmost_child_real_block(&coord, RIGHT_SIDE,
11867+ &pos->preceder.blk);
11868+ exit:
11869+ check_preceder(pos->preceder.blk);
11870+ done_load_count(&left_load);
11871+ done_lh(&left_lock);
11872+ return ret;
11873+}
11874+
11875+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11876+
11877+/* This procedure implements the outer loop of the flush algorithm. To put this in
11878+ context, here is the general list of steps taken by the flush routine as a whole:
11879+
11880+ 1. Scan-left
11881+ 2. Scan-right (maybe)
11882+ 3. Allocate initial flush position and its ancestors
11883+ 4. <handle extents>
11884+ 5. <squeeze and allocate the next position and its ancestors to-the-right,
11885+ then update position to-the-right>
11886+ 6. <repeat from #4 until flush is stopped>
11887+
11888+ This procedure implements the loop in steps 4 through 6 in the above listing.
11889+
11890+ Step 4: if the current flush position is an extent item (position on the twig level),
11891+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11892+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
11893+ If the next coordinate is an internal item, we descend back to the leaf level,
11894+ otherwise we repeat step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
11895+ brings us past the end of the twig level, then we call
11896+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11897+ step #5 which moves to the right.
11898+
11899+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11900+ tree to allocate any ancestors of the next-right flush position that are not also
11901+ ancestors of the current position. Those ancestors (in top-down order) are the next in
11902+ parent-first order. We squeeze adjacent nodes on the way up until the right node and
11903+ current node share the same parent, then allocate on the way back down. Finally, this
11904+ step sets the flush position to the next-right node. Then repeat steps 4 and 5.
11905+*/
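/* "Parent-first order", referred to throughout the steps above, is a preorder
   walk: a node comes before its children, children go left to right. A
   self-contained illustration on a toy tree (all names invented): */
#include <stdio.h>

struct toy {
	const char *name;
	struct toy *child[3];	/* NULL-terminated */
};

static void parent_first(const struct toy *t)
{
	int i;

	printf("%s ", t->name);	/* visit the parent before its children */
	for (i = 0; i < 3 && t->child[i] != NULL; i++)
		parent_first(t->child[i]);
}

int main(void)
{
	struct toy l1 = { "leaf1", { NULL } }, l2 = { "leaf2", { NULL } };
	struct toy l3 = { "leaf3", { NULL } };
	struct toy t1 = { "twig1", { &l1, &l2, NULL } };
	struct toy t2 = { "twig2", { &l3, NULL } };
	struct toy root = { "root", { &t1, &t2, NULL } };

	parent_first(&root);	/* prints: root twig1 leaf1 leaf2 twig2 leaf3 */
	printf("\n");
	return 0;
}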
11906+
11907+/* SQUEEZE CODE */
11908+
11909+/* squalloc_right_twig helper function: cut a range of extent items from
11910+ node @to->node, from the beginning up to coord @to. */
11911+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11912+ znode * left)
11913+{
11914+ coord_t from;
11915+ reiser4_key from_key;
11916+
11917+ coord_init_first_unit(&from, to->node);
11918+ item_key_by_coord(&from, &from_key);
11919+
11920+ return cut_node_content(&from, to, &from_key, to_key, NULL);
11921+}
11922+
11923+/* Copy as many of the leading extents as possible from @right to @left, allocating
11924+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
11925+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
11926+ internal item it calls shift_one_internal_unit and may then return
11927+ SUBTREE_MOVED. */
11928+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
11929+{
11930+ int ret = SUBTREE_MOVED;
11931+ coord_t coord; /* used to iterate over items */
11932+ reiser4_key stop_key;
11933+
11934+ assert("jmacd-2008", !node_is_empty(right));
11935+ coord_init_first_unit(&coord, right);
11936+
11937+ /* FIXME: can be optimized to cut once */
11938+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
11939+ ON_DEBUG(void *vp);
11940+
11941+ assert("vs-1468", coord_is_leftmost_unit(&coord));
11942+ ON_DEBUG(vp = shift_check_prepare(left, coord.node));
11943+
11944+ /* stop_key is used to find what was copied and what to cut */
11945+ stop_key = *reiser4_min_key();
11946+ ret = squalloc_extent(left, &coord, pos, &stop_key);
11947+ if (ret != SQUEEZE_CONTINUE) {
11948+ ON_DEBUG(kfree(vp));
11949+ break;
11950+ }
11951+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
11952+
11953+ /* Helper function to do the cutting. */
11954+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
11955+ check_me("vs-1466",
11956+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
11957+
11958+ ON_DEBUG(shift_check(vp, left, coord.node));
11959+ }
11960+
11961+ if (node_is_empty(coord.node))
11962+ ret = SQUEEZE_SOURCE_EMPTY;
11963+
11964+ if (ret == SQUEEZE_TARGET_FULL) {
11965+ goto out;
11966+ }
11967+
11968+ if (node_is_empty(right)) {
11969+ /* The whole right node was copied into @left. */
11970+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
11971+ goto out;
11972+ }
11973+
11974+ coord_init_first_unit(&coord, right);
11975+
11976+ if (!item_is_internal(&coord)) {
11977+ /* we do not want to squeeze anything else to left neighbor because "slum"
11978+ is over */
11979+ ret = SQUEEZE_TARGET_FULL;
11980+ goto out;
11981+ }
11982+ assert("jmacd-433", item_is_internal(&coord));
11983+
11984+ /* Shift an internal unit. The child must be allocated before shifting any more
11985+ extents, so we stop here. */
11986+ ret = shift_one_internal_unit(left, right);
11987+
11988+ out:
11989+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
11990+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
11991+
11992+ if (ret == SQUEEZE_TARGET_FULL) {
11993+ /* We submit prepped nodes here and expect that this @left twig
11994+ * will not be modified again during this jnode_flush() call. */
11995+ int ret1;
11996+
11997+ /* NOTE: seems like io is done under long term locks. */
11998+ ret1 = write_prepped_nodes(pos);
11999+ if (ret1 < 0)
12000+ return ret1;
12001+ }
12002+
12003+ return ret;
12004+}
12005+
12006+#if REISER4_DEBUG
12007+static void item_convert_invariant(flush_pos_t * pos)
12008+{
12009+ assert("edward-1225", coord_is_existing_item(&pos->coord));
12010+ if (chaining_data_present(pos)) {
12011+ item_plugin *iplug = item_convert_plug(pos);
12012+
12013+ assert("edward-1000",
12014+ iplug == item_plugin_by_coord(&pos->coord));
12015+ assert("edward-1001", iplug->f.convert != NULL);
12016+ } else
12017+ assert("edward-1226", pos->child == NULL);
12018+}
12019+#else
12020+
12021+#define item_convert_invariant(pos) noop
12022+
12023+#endif
12024+
12025+/* Scan node items starting from the first one and apply to each
12026+ item its flush ->convert() method (if any). This method may
12027+ resize/kill the item so the tree will be changed.
12028+*/
12029+static int convert_node(flush_pos_t * pos, znode * node)
12030+{
12031+ int ret = 0;
12032+ item_plugin *iplug;
12033+
12034+ assert("edward-304", pos != NULL);
12035+ assert("edward-305", pos->child == NULL);
12036+ assert("edward-475", znode_convertible(node));
12037+ assert("edward-669", znode_is_wlocked(node));
12038+ assert("edward-1210", !node_is_empty(node));
12039+
12040+ if (znode_get_level(node) != LEAF_LEVEL)
12041+ /* unsupported */
12042+ goto exit;
12043+
12044+ coord_init_first_unit(&pos->coord, node);
12045+
12046+ while (1) {
12047+ ret = 0;
12048+ coord_set_to_left(&pos->coord);
12049+ item_convert_invariant(pos);
12050+
12051+ iplug = item_plugin_by_coord(&pos->coord);
12052+ assert("edward-844", iplug != NULL);
12053+
12054+ if (iplug->f.convert) {
12055+ ret = iplug->f.convert(pos);
12056+ if (ret)
12057+ goto exit;
12058+ }
12059+ assert("edward-307", pos->child == NULL);
12060+
12061+ if (coord_next_item(&pos->coord)) {
12062+ /* node is over */
12063+
12064+ if (!chaining_data_present(pos))
12065+ /* finished this node */
12066+ break;
12067+ if (should_chain_next_node(pos)) {
12068+ /* go to next node */
12069+ move_chaining_data(pos, 0 /* to next node */ );
12070+ break;
12071+ }
12072+ /* repeat this node */
12073+ move_chaining_data(pos, 1 /* this node */ );
12074+ continue;
12075+ }
12076+ /* Node is not over.
12077+ Check if there is attached convert data.
12078+ If so roll one item position back and repeat
12079+ on this node
12080+ */
12081+ if (chaining_data_present(pos)) {
12082+
12083+ if (iplug != item_plugin_by_coord(&pos->coord))
12084+ set_item_convert_count(pos, 0);
12085+
12086+ ret = coord_prev_item(&pos->coord);
12087+ assert("edward-1003", !ret);
12088+
12089+ move_chaining_data(pos, 1 /* this node */ );
12090+ }
12091+ }
12092+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12093+ znode_make_dirty(node);
12094+ exit:
12095+ assert("edward-1004", !ret);
12096+ return ret;
12097+}
12098+
12099+/* Squeeze and allocate the right neighbor. This is called after @left and
12100+ its current children have been squeezed and allocated already. This
12101+ procedure's job is to squeeze items from @right to @left.
12102+
12103+ If at the leaf level, use the shift_everything_left memcpy-optimized
12104+ version of shifting (squeeze_right_leaf).
12105+
12106+ If at the twig level, extents are allocated as they are shifted from @right
12107+ to @left (squalloc_right_twig).
12108+
12109+ At any other level, shift one internal item and return to the caller
12110+ (squalloc_parent_first) so that the shifted-subtree can be processed in
12111+ parent-first order.
12112+
12113+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12114+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12115+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12116+ is returned.
12117+*/
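/* A schematic of how a caller reacts to the three squeeze results named
   above (standalone sketch; the *_T enumerators are stand-ins for the real
   SUBTREE_MOVED / SQUEEZE_SOURCE_EMPTY / SQUEEZE_TARGET_FULL codes, and the
   real dispatch lives in handle_pos_on_formatted() and
   squalloc_upper_levels()): */
#include <stdio.h>

enum toy_squeeze {
	SUBTREE_MOVED_T,	/* one internal unit was shifted */
	SOURCE_EMPTY_T,		/* everything moved, @right became empty */
	TARGET_FULL_T		/* nothing more fits into @left */
};

static void react(int ret)
{
	if (ret < 0) {
		printf("error %d: abort flushing\n", ret);
		return;
	}
	switch (ret) {
	case SUBTREE_MOVED_T:
		printf("allocate the shifted child before squeezing more\n");
		break;
	case SOURCE_EMPTY_T:
		printf("drop @right, fetch the next right neighbor\n");
		break;
	case TARGET_FULL_T:
		printf("advance the flush position rightward\n");
		break;
	}
}

int main(void)
{
	react(SUBTREE_MOVED_T);
	react(SOURCE_EMPTY_T);
	react(TARGET_FULL_T);
	react(-5);
	return 0;
}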
12118+
12119+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12120+ znode * right)
12121+{
12122+ int ret;
12123+
12124+ /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12125+ * tree owing to error (for example, ENOSPC) in write */
12126+ /* assert("jmacd-9321", !node_is_empty(left)); */
12127+ assert("jmacd-9322", !node_is_empty(right));
12128+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12129+
12130+ switch (znode_get_level(left)) {
12131+ case TWIG_LEVEL:
12132+ /* Shift with extent allocating until either an internal item
12133+ is encountered or everything is shifted or no free space
12134+ left in @left */
12135+ ret = squeeze_right_twig(left, right, pos);
12136+ break;
12137+
12138+ default:
12139+ /* All other levels can use shift_everything until we implement per-item
12140+ flush plugins. */
12141+ ret = squeeze_right_non_twig(left, right);
12142+ break;
12143+ }
12144+
12145+ assert("jmacd-2011", (ret < 0 ||
12146+ ret == SQUEEZE_SOURCE_EMPTY
12147+ || ret == SQUEEZE_TARGET_FULL
12148+ || ret == SUBTREE_MOVED));
12149+ return ret;
12150+}
12151+
12152+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12153+ znode * right)
12154+{
12155+ int ret;
12156+
12157+ ret = squeeze_right_twig(pos->lock.node, right, pos);
12158+ if (ret < 0)
12159+ return ret;
12160+ if (ret > 0) {
12161+ coord_init_after_last_item(&pos->coord, pos->lock.node);
12162+ return ret;
12163+ }
12164+
12165+ coord_init_last_unit(&pos->coord, pos->lock.node);
12166+ return 0;
12167+}
12168+
12169+/* forward declaration */
12170+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12171+
12172+/* do a fast check for "same parents" condition before calling
12173+ * squalloc_upper_levels() */
12174+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12175+ znode * left,
12176+ znode * right)
12177+{
12178+ if (znode_same_parents(left, right))
12179+ return 0;
12180+
12181+ return squalloc_upper_levels(pos, left, right);
12182+}
12183+
12184+/* Check whether the parent of the given @right node needs to be processed
12185+ ((re)allocated) prior to processing of the child. If @left and @right do not
12186+ share the same parent, then the parent of @right stands after @left but before
12187+ @right in parent-first order, and we have to (re)allocate it before @right
12188+ gets (re)allocated. */
12189+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12190+{
12191+ int ret;
12192+
12193+ lock_handle left_parent_lock;
12194+ lock_handle right_parent_lock;
12195+
12196+ load_count left_parent_load;
12197+ load_count right_parent_load;
12198+
12199+ init_lh(&left_parent_lock);
12200+ init_lh(&right_parent_lock);
12201+
12202+ init_load_count(&left_parent_load);
12203+ init_load_count(&right_parent_load);
12204+
12205+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12206+ if (ret)
12207+ goto out;
12208+
12209+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12210+ if (ret)
12211+ goto out;
12212+
12213+ /* Check for same parents */
12214+ if (left_parent_lock.node == right_parent_lock.node)
12215+ goto out;
12216+
12217+ if (znode_check_flushprepped(right_parent_lock.node)) {
12218+ /* Keep parent-first order. In the order, the right parent node stands
12219+ before the @right node. If it is already allocated, we set the
12220+ preceder (next block search start point) to its block number, @right
12221+ node should be allocated after it.
12222+
12223+ However, preceder is set only if the right parent is on twig level.
12224+ The explanation is the following: new branch nodes are allocated over
12225+ already allocated children while the tree grows, so it is difficult to
12226+ keep the tree ordered, and we assume that only leaves and twigs are correctly
12227+ allocated. So, only twigs are used as a preceder for allocating the
12228+ rest of the slum. */
12229+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12230+ pos->preceder.blk =
12231+ *znode_get_block(right_parent_lock.node);
12232+ check_preceder(pos->preceder.blk);
12233+ }
12234+ goto out;
12235+ }
12236+
12237+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12238+ if (ret)
12239+ goto out;
12240+
12241+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12242+ if (ret)
12243+ goto out;
12244+
12245+ ret =
12246+ squeeze_right_neighbor(pos, left_parent_lock.node,
12247+ right_parent_lock.node);
12248+ /* We stop on error. We also stop if some items/units were shifted (ret == 0)
12249+ * and thus @right changed its parent: it means we do not have to process the
12250+ * right_parent node prior to processing @right. Positive return
12251+ * values say that no items were shifted, because of the "empty
12252+ * source" or "target full" conditions. */
12253+ if (ret <= 0)
12254+ goto out;
12255+
12256+ /* parent(@left) and parent(@right) may have different parents also. We
12257+ * do a recursive call for checking that. */
12258+ ret =
12259+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12260+ right_parent_lock.node);
12261+ if (ret)
12262+ goto out;
12263+
12264+ /* allocate znode when going down */
12265+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12266+
12267+ out:
12268+ done_load_count(&left_parent_load);
12269+ done_load_count(&right_parent_load);
12270+
12271+ done_lh(&left_parent_lock);
12272+ done_lh(&right_parent_lock);
12273+
12274+ return ret;
12275+}
12276+
12277+/* Check the leftmost child's "flushprepped" status; also returns true if the child
12278+ * node was not found in cache. */
12279+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12280+{
12281+ int ret;
12282+ int prepped;
12283+
12284+ jnode *child;
12285+
12286+ ret = get_leftmost_child_of_unit(coord, &child);
12287+
12288+ if (ret)
12289+ return ret;
12290+
12291+ if (child) {
12292+ prepped = jnode_check_flushprepped(child);
12293+ jput(child);
12294+ } else {
12295+ /* We treat a nonexistent child as a node to which slum
12296+ processing should not continue. A node that is not cached is clean,
12297+ so it is flushprepped. */
12298+ prepped = 1;
12299+ }
12300+
12301+ return prepped;
12302+}
12303+
12304+/* (re)allocate znode with automated getting parent node */
12305+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12306+{
12307+ int ret;
12308+ lock_handle parent_lock;
12309+ load_count parent_load;
12310+ coord_t pcoord;
12311+
12312+ assert("zam-851", znode_is_write_locked(node));
12313+
12314+ init_lh(&parent_lock);
12315+ init_load_count(&parent_load);
12316+
12317+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12318+ if (ret)
12319+ goto out;
12320+
12321+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12322+ if (ret)
12323+ goto out;
12324+
12325+ ret = find_child_ptr(parent_lock.node, node, &pcoord);
12326+ if (ret)
12327+ goto out;
12328+
12329+ ret = allocate_znode(node, &pcoord, pos);
12330+
12331+ out:
12332+ done_load_count(&parent_load);
12333+ done_lh(&parent_lock);
12334+ return ret;
12335+}
12336+
12337+/* Process formatted nodes on the current level until an unformatted node or
12338+ * the rightmost node in the slum is reached. */
12339+static int handle_pos_on_formatted(flush_pos_t * pos)
12340+{
12341+ int ret;
12342+ lock_handle right_lock;
12343+ load_count right_load;
12344+
12345+ init_lh(&right_lock);
12346+ init_load_count(&right_load);
12347+
12348+ if (should_convert_node(pos, pos->lock.node)) {
12349+ ret = convert_node(pos, pos->lock.node);
12350+ if (ret)
12351+ return ret;
12352+ }
12353+
12354+ while (1) {
12355+ int expected;
12356+ expected = should_convert_next_node(pos);
12357+ ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12358+ ZNODE_WRITE_LOCK, !expected, expected);
12359+ if (ret) {
12360+ if (expected)
12361+ warning("edward-1495",
12362+ "Expected neighbor not found (ret = %d). Fsck?",
12363+ ret);
12364+ break;
12365+ }
12366+
12367+ /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it
12368+ * can be optimal. For now we choose to live with the risk that it will
12369+ * be suboptimal because it would be quite complex to code it to be
12370+ * smarter. */
12371+ if (znode_check_flushprepped(right_lock.node)
12372+ && !znode_convertible(right_lock.node)) {
12373+ assert("edward-1005", !should_convert_next_node(pos));
12374+ pos_stop(pos);
12375+ break;
12376+ }
12377+
12378+ ret = incr_load_count_znode(&right_load, right_lock.node);
12379+ if (ret)
12380+ break;
12381+ if (should_convert_node(pos, right_lock.node)) {
12382+ ret = convert_node(pos, right_lock.node);
12383+ if (ret)
12384+ break;
12385+ if (node_is_empty(right_lock.node)) {
12386+ /* node became empty after converting, repeat */
12387+ done_load_count(&right_load);
12388+ done_lh(&right_lock);
12389+ continue;
12390+ }
12391+ }
12392+
12393+ /* squeeze _before_ going upward. */
12394+ ret =
12395+ squeeze_right_neighbor(pos, pos->lock.node,
12396+ right_lock.node);
12397+ if (ret < 0)
12398+ break;
12399+
12400+ if (znode_check_flushprepped(right_lock.node)) {
12401+ if (should_convert_next_node(pos)) {
12402+ /* in spite of flushprepped status of the node,
12403+ its right slum neighbor should be converted */
12404+ assert("edward-953", convert_data(pos));
12405+ assert("edward-954", item_convert_data(pos));
12406+
12407+ if (node_is_empty(right_lock.node)) {
12408+ done_load_count(&right_load);
12409+ done_lh(&right_lock);
12410+ } else
12411+ move_flush_pos(pos, &right_lock,
12412+ &right_load, NULL);
12413+ continue;
12414+ }
12415+ pos_stop(pos);
12416+ break;
12417+ }
12418+
12419+ if (node_is_empty(right_lock.node)) {
12420+ /* repeat if right node was squeezed completely */
12421+ done_load_count(&right_load);
12422+ done_lh(&right_lock);
12423+ continue;
12424+ }
12425+
12426+ /* parent(right_lock.node) has to be processed before
12427+ * (right_lock.node) due to "parent-first" allocation order. */
12428+ ret =
12429+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12430+ right_lock.node);
12431+ if (ret)
12432+ break;
12433+ /* (re)allocate _after_ going upward */
12434+ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12435+ if (ret)
12436+ break;
12437+ if (should_terminate_squalloc(pos)) {
12438+ set_item_convert_count(pos, 0);
12439+ break;
12440+ }
12441+
12442+ /* advance the flush position to the right neighbor */
12443+ move_flush_pos(pos, &right_lock, &right_load, NULL);
12444+
12445+ ret = rapid_flush(pos);
12446+ if (ret)
12447+ break;
12448+ }
12449+ check_convert_info(pos);
12450+ done_load_count(&right_load);
12451+ done_lh(&right_lock);
12452+
12453+ /* This function indicates via pos whether to stop or go to twig or continue on current
12454+ * level. */
12455+ return ret;
12456+
12457+}
12458+
12459+/* Process nodes on the leaf level until an unformatted node or the rightmost
12460+ * node in the slum is reached. */
12461+static int handle_pos_on_leaf(flush_pos_t * pos)
12462+{
12463+ int ret;
12464+
12465+ assert("zam-845", pos->state == POS_ON_LEAF);
12466+
12467+ ret = handle_pos_on_formatted(pos);
12468+
12469+ if (ret == -E_NO_NEIGHBOR) {
12470+ /* cannot get right neighbor, go process extents. */
12471+ pos->state = POS_TO_TWIG;
12472+ return 0;
12473+ }
12474+
12475+ return ret;
12476+}
12477+
12478+/* Process slum on level > 1 */
12479+static int handle_pos_on_internal(flush_pos_t * pos)
12480+{
12481+ assert("zam-850", pos->state == POS_ON_INTERNAL);
12482+ return handle_pos_on_formatted(pos);
12483+}
12484+
12485+/* check whether squalloc should stop before processing given extent */
12486+static int squalloc_extent_should_stop(flush_pos_t * pos)
12487+{
12488+ assert("zam-869", item_is_extent(&pos->coord));
12489+
12490+ /* pos->child is the jnode that handle_pos_on_extent() should start with,
12491+ * instead of the first child of the first extent unit. */
12492+ if (pos->child) {
12493+ int prepped;
12494+
12495+ assert("vs-1383", jnode_is_unformatted(pos->child));
12496+ prepped = jnode_check_flushprepped(pos->child);
12497+ pos->pos_in_unit =
12498+ jnode_get_index(pos->child) -
12499+ extent_unit_index(&pos->coord);
12500+ assert("vs-1470",
12501+ pos->pos_in_unit < extent_unit_width(&pos->coord));
12502+ assert("nikita-3434",
12503+ ergo(extent_is_unallocated(&pos->coord),
12504+ pos->pos_in_unit == 0));
12505+ jput(pos->child);
12506+ pos->child = NULL;
12507+
12508+ return prepped;
12509+ }
12510+
12511+ pos->pos_in_unit = 0;
12512+ if (extent_is_unallocated(&pos->coord))
12513+ return 0;
12514+
12515+ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12516+}
12517+
12518+/* Handle the case when the regular reiser4 tree (znodes connected to their
12519+ * neighbors by sibling pointers) is interrupted on the leaf level by one or more
12520+ * unformatted nodes. By holding a lock on the twig level and using extent code
12521+ * routines to process the unformatted nodes, we swim around the irregular part of
12522+ * the reiser4 tree. */
12523+static int handle_pos_on_twig(flush_pos_t * pos)
12524+{
12525+ int ret;
12526+
12527+ assert("zam-844", pos->state == POS_ON_EPOINT);
12528+ assert("zam-843", item_is_extent(&pos->coord));
12529+
12530+ /* We decide whether to continue slum processing with the current extent
12531+ unit: if the leftmost child of the current extent unit is flushprepped
12532+ (i.e. clean or already processed by flush) we stop squalloc(). There
12533+ is a fast check for unallocated extents, which we assume contain only
12534+ not-flushprepped nodes. */
12535+ /* FIXME: Here we implement a simple check; we only look at the
12536+ leftmost child. */
12537+ ret = squalloc_extent_should_stop(pos);
12538+ if (ret != 0) {
12539+ pos_stop(pos);
12540+ return ret;
12541+ }
12542+
12543+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12544+ && item_is_extent(&pos->coord)) {
12545+ ret = reiser4_alloc_extent(pos);
12546+ if (ret) {
12547+ break;
12548+ }
12549+ coord_next_unit(&pos->coord);
12550+ }
12551+
12552+ if (coord_is_after_rightmost(&pos->coord)) {
12553+ pos->state = POS_END_OF_TWIG;
12554+ return 0;
12555+ }
12556+ if (item_is_internal(&pos->coord)) {
12557+ pos->state = POS_TO_LEAF;
12558+ return 0;
12559+ }
12560+
12561+ assert("zam-860", item_is_extent(&pos->coord));
12562+
12563+ /* "slum" is over */
12564+ pos->state = POS_INVALID;
12565+ return 0;
12566+}
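/* The POS_* values above drive a small state machine; squalloc() dispatches
   through a handler table like flush_pos_handlers[] further below. A
   standalone model of that pattern (the states mirror the POS_* names, but
   the transitions below trace just one plausible path, not the full
   machine): */
#include <stdio.h>

enum toy_state { T_ON_LEAF, T_ON_EPOINT, T_END_OF_TWIG, T_TO_LEAF,
		 T_TO_TWIG, T_INVALID, T_NR_STATES };

struct toy_pos { enum toy_state state; };

static int h_on_leaf(struct toy_pos *p)     { p->state = T_TO_TWIG;     return 0; }
static int h_to_twig(struct toy_pos *p)     { p->state = T_ON_EPOINT;   return 0; }
static int h_on_epoint(struct toy_pos *p)   { p->state = T_END_OF_TWIG; return 0; }
static int h_end_of_twig(struct toy_pos *p) { p->state = T_TO_LEAF;     return 0; }
static int h_to_leaf(struct toy_pos *p)     { p->state = T_INVALID;     return 0; }

static int (*toy_handlers[T_NR_STATES])(struct toy_pos *) = {
	[T_ON_LEAF] = h_on_leaf, [T_ON_EPOINT] = h_on_epoint,
	[T_END_OF_TWIG] = h_end_of_twig, [T_TO_LEAF] = h_to_leaf,
	[T_TO_TWIG] = h_to_twig,
};

int main(void)
{
	struct toy_pos pos = { T_ON_LEAF };
	int steps = 0;

	while (pos.state != T_INVALID && toy_handlers[pos.state](&pos) == 0)
		steps++;
	printf("stopped after %d transitions\n", steps);
	return 0;
}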
12567+
12568+/* When we are about to return the flush position from the twig to the leaf level,
12569+ * we can either process the right twig node or move the position to the leaf.
12570+ * This processes the right twig if possible and jumps to the leaf level if not. */
12571+static int handle_pos_end_of_twig(flush_pos_t * pos)
12572+{
12573+ int ret;
12574+ lock_handle right_lock;
12575+ load_count right_load;
12576+ coord_t at_right;
12577+ jnode *child = NULL;
12578+
12579+ assert("zam-848", pos->state == POS_END_OF_TWIG);
12580+ assert("zam-849", coord_is_after_rightmost(&pos->coord));
12581+
12582+ init_lh(&right_lock);
12583+ init_load_count(&right_load);
12584+
12585+ /* We take a lock on the right twig node even if it is not dirty, because
12586+ * the slum continues or discontinues on the leaf level, not on the next twig.
12587+ * This lock on the right twig is needed for getting its leftmost child. */
12588+ ret =
12589+ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12590+ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12591+ if (ret)
12592+ goto out;
12593+
12594+ ret = incr_load_count_znode(&right_load, right_lock.node);
12595+ if (ret)
12596+ goto out;
12597+
12598+ /* right twig could be not dirty */
12599+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12600+ /* If right twig node is dirty we always attempt to squeeze it
12601+ * content to the left... */
12602+ became_dirty:
12603+ ret =
12604+ squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12605+ if (ret <= 0) {
12606+ /* pos->coord is on internal item, go to leaf level, or
12607+ * we have an error which will be caught in squalloc() */
12608+ pos->state = POS_TO_LEAF;
12609+ goto out;
12610+ }
12611+
12612+ /* If the right twig was squeezed completely we have to re-lock
12613+ * it; now this is done through the top-level squalloc
12614+ * routine. */
12615+ if (node_is_empty(right_lock.node))
12616+ goto out;
12617+
12618+ /* ... and prep it if it is not yet prepped */
12619+ if (!znode_check_flushprepped(right_lock.node)) {
12620+ /* As usual, process parent before ... */
12621+ ret =
12622+ check_parents_and_squalloc_upper_levels(pos,
12623+ pos->lock.
12624+ node,
12625+ right_lock.
12626+ node);
12627+ if (ret)
12628+ goto out;
12629+
12630+ /* ... processing the child */
12631+ ret =
12632+ lock_parent_and_allocate_znode(right_lock.node,
12633+ pos);
12634+ if (ret)
12635+ goto out;
12636+ }
12637+ } else {
12638+ coord_init_first_unit(&at_right, right_lock.node);
12639+
12640+ /* check first child of next twig, should we continue there ? */
12641+ ret = get_leftmost_child_of_unit(&at_right, &child);
12642+ if (ret || child == NULL || jnode_check_flushprepped(child)) {
12643+ pos_stop(pos);
12644+ goto out;
12645+ }
12646+
12647+ /* check clean twig for possible relocation */
12648+ if (!znode_check_flushprepped(right_lock.node)) {
12649+ ret =
12650+ reverse_relocate_check_dirty_parent(child,
12651+ &at_right, pos);
12652+ if (ret)
12653+ goto out;
12654+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12655+ goto became_dirty;
12656+ }
12657+ }
12658+
12659+ assert("zam-875", znode_check_flushprepped(right_lock.node));
12660+
12661+ /* Update the preceder with the block number of the just-processed right
12662+ * twig node. The code above could miss the preceder update because
12663+ * allocate_znode() might not have been called for this node. */
12664+ pos->preceder.blk = *znode_get_block(right_lock.node);
12665+ check_preceder(pos->preceder.blk);
12666+
12667+ coord_init_first_unit(&at_right, right_lock.node);
12668+ assert("zam-868", coord_is_existing_unit(&at_right));
12669+
12670+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12671+ move_flush_pos(pos, &right_lock, &right_load, &at_right);
12672+
12673+ out:
12674+ done_load_count(&right_load);
12675+ done_lh(&right_lock);
12676+
12677+ if (child)
12678+ jput(child);
12679+
12680+ return ret;
12681+}
12682+
12683+/* Move pos->lock to the leaf node pointed to by pos->coord and check whether
12684+ * we should continue there. */
12685+static int handle_pos_to_leaf(flush_pos_t * pos)
12686+{
12687+ int ret;
12688+ lock_handle child_lock;
12689+ load_count child_load;
12690+ jnode *child;
12691+
12692+ assert("zam-846", pos->state == POS_TO_LEAF);
12693+ assert("zam-847", item_is_internal(&pos->coord));
12694+
12695+ init_lh(&child_lock);
12696+ init_load_count(&child_load);
12697+
12698+ ret = get_leftmost_child_of_unit(&pos->coord, &child);
12699+ if (ret)
12700+ return ret;
12701+ if (child == NULL) {
12702+ pos_stop(pos);
12703+ return 0;
12704+ }
12705+
12706+ if (jnode_check_flushprepped(child)) {
12707+ pos->state = POS_INVALID;
12708+ goto out;
12709+ }
12710+
12711+ ret =
12712+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12713+ ZNODE_LOCK_LOPRI);
12714+ if (ret)
12715+ goto out;
12716+
12717+ ret = incr_load_count_znode(&child_load, JZNODE(child));
12718+ if (ret)
12719+ goto out;
12720+
12721+ ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12722+ if (ret)
12723+ goto out;
12724+
12725+ /* move flush position to leaf level */
12726+ pos->state = POS_ON_LEAF;
12727+ move_flush_pos(pos, &child_lock, &child_load, NULL);
12728+
12729+ if (node_is_empty(JZNODE(child))) {
12730+ ret = delete_empty_node(JZNODE(child));
12731+ pos->state = POS_INVALID;
12732+ }
12733+ out:
12734+ done_load_count(&child_load);
12735+ done_lh(&child_lock);
12736+ jput(child);
12737+
12738+ return ret;
12739+}
12740+
12741+/* Move pos (and pos->lock) from the leaf level to the parent (twig) level. */
12743+static int handle_pos_to_twig(flush_pos_t * pos)
12744+{
12745+ int ret;
12746+
12747+ lock_handle parent_lock;
12748+ load_count parent_load;
12749+ coord_t pcoord;
12750+
12751+ assert("zam-852", pos->state == POS_TO_TWIG);
12752+
12753+ init_lh(&parent_lock);
12754+ init_load_count(&parent_load);
12755+
12756+ ret =
12757+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12758+ if (ret)
12759+ goto out;
12760+
12761+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12762+ if (ret)
12763+ goto out;
12764+
12765+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12766+ if (ret)
12767+ goto out;
12768+
12769+ assert("zam-870", item_is_internal(&pcoord));
12770+ coord_next_item(&pcoord);
12771+
12772+ if (coord_is_after_rightmost(&pcoord))
12773+ pos->state = POS_END_OF_TWIG;
12774+ else if (item_is_extent(&pcoord))
12775+ pos->state = POS_ON_EPOINT;
12776+ else {
12777+		/* Here we understand that getting -E_NO_NEIGHBOR in
12778+		 * handle_pos_on_leaf() was simply because we reached the edge
12779+		 * of the slum */
12780+ pos_stop(pos);
12781+ goto out;
12782+ }
12783+
12784+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12785+
12786+ out:
12787+ done_load_count(&parent_load);
12788+ done_lh(&parent_lock);
12789+
12790+ return ret;
12791+}
12792+
12793+typedef int (*pos_state_handle_t) (flush_pos_t *);
12794+static pos_state_handle_t flush_pos_handlers[] = {
12795+ /* process formatted nodes on leaf level, keep lock on a leaf node */
12796+ [POS_ON_LEAF] = handle_pos_on_leaf,
12797+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
12798+ * being processed */
12799+ [POS_ON_EPOINT] = handle_pos_on_twig,
12800+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */
12801+ [POS_TO_TWIG] = handle_pos_to_twig,
12802+ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
12803+ * pos->coord points to the leaf node we jump to */
12804+ [POS_TO_LEAF] = handle_pos_to_leaf,
12805+	/* after processing the last extent in the twig node, attempt to shift items from the twig's
12806+	 * right neighbor and process them while shifting */
12807+ [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12808+ /* process formatted nodes on internal level, keep lock on an internal node */
12809+ [POS_ON_INTERNAL] = handle_pos_on_internal
12810+};
12811+
12812+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
12813+ * encrypt) nodes and their ancestors in "parent-first" order */
12814+static int squalloc(flush_pos_t * pos)
12815+{
12816+ int ret = 0;
12817+
12818+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12819+ * greater CPU efficiency? Measure and see.... -Hans */
12820+ while (pos_valid(pos)) {
12821+ ret = flush_pos_handlers[pos->state] (pos);
12822+ if (ret < 0)
12823+ break;
12824+
12825+ ret = rapid_flush(pos);
12826+ if (ret)
12827+ break;
12828+ }
12829+
12830+	/* Any positive value or -E_NO_NEIGHBOR is a legal return code for the
12831+	   handle_pos* routines; -E_NO_NEIGHBOR means that the slum edge was reached */
12832+ if (ret > 0 || ret == -E_NO_NEIGHBOR)
12833+ ret = 0;
12834+
12835+ return ret;
12836+}
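
Editor's note: flush_pos_handlers[] and the squalloc() loop above form a
table-driven state machine: the handler indexed by the current state runs,
mutates pos->state, and the loop re-dispatches until the position becomes
invalid. A minimal self-contained C sketch of the same pattern, with
hypothetical states and handlers (illustrative only, not part of the patch):

#include <stdio.h>

enum state { ST_A, ST_B, ST_DONE };
struct machine { enum state state; };

static int handle_a(struct machine *m) { m->state = ST_B;    return 0; }
static int handle_b(struct machine *m) { m->state = ST_DONE; return 0; }

/* Designated initializers give O(1) dispatch keyed by the current state,
 * just as flush_pos_handlers[] is indexed by pos->state. */
static int (*const handlers[])(struct machine *) = {
	[ST_A] = handle_a,
	[ST_B] = handle_b,
};

int main(void)
{
	struct machine m = { .state = ST_A };

	while (m.state != ST_DONE) {
		int ret = handlers[m.state](&m);
		if (ret < 0)
			return 1;	/* negative means error, as in squalloc() */
	}
	printf("reached final state\n");
	return 0;
}
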
12837+
12838+static void update_ldkey(znode * node)
12839+{
12840+ reiser4_key ldkey;
12841+
12842+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
12843+ if (node_is_empty(node))
12844+ return;
12845+
12846+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12847+}
12848+
12849+/* This is to be called after the node's shift method has moved data from @right to
12850+   @left. It sets the left delimiting keys of @left and @right to the first keys of
12851+   @left and @right respectively, and the right delimiting key of @left to the first key of @right */
12852+static void update_znode_dkeys(znode * left, znode * right)
12853+{
12854+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12855+ assert("vs-1629", (znode_is_write_locked(left) &&
12856+ znode_is_write_locked(right)));
12857+
12858+	/* we need to update the left delimiting key of @left if it was empty before the shift */
12859+ update_ldkey(left);
12860+ update_ldkey(right);
12861+ if (node_is_empty(right))
12862+ znode_set_rd_key(left, znode_get_rd_key(right));
12863+ else
12864+ znode_set_rd_key(left, znode_get_ld_key(right));
12865+}
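
Editor's note: the delimiting-key invariant that update_znode_dkeys() restores
can be stated compactly: ld(@right) equals @right's new first key, and
rd(@left) equals ld(@right), or rd(@right) if @right was emptied. A
hypothetical debug-only checker, assuming reiser4's keyeq() comparator (a
sketch, not part of the patch):

#if REISER4_DEBUG
static void check_dkey_invariant(znode * left, znode * right)
{
	reiser4_key k;

	/* rd(@left) must meet ld(@right), or rd(@right) if @right emptied */
	assert("editor-1", keyeq(znode_get_rd_key(left),
				 node_is_empty(right) ?
				 znode_get_rd_key(right) :
				 znode_get_ld_key(right)));
	/* ld(@right) must be the first key actually stored in @right */
	assert("editor-2", node_is_empty(right) ||
	       keyeq(znode_get_ld_key(right), leftmost_key_in_node(right, &k)));
}
#endif
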
12866+
12867+/* try to shift everything from @right to @left. If everything was shifted -
12868+ @right is removed from the tree. Result is the number of bytes shifted. */
12869+static int
12870+shift_everything_left(znode * right, znode * left, carry_level * todo)
12871+{
12872+ coord_t from;
12873+ node_plugin *nplug;
12874+ carry_plugin_info info;
12875+
12876+ coord_init_after_last_item(&from, right);
12877+
12878+ nplug = node_plugin_by_node(right);
12879+ info.doing = NULL;
12880+ info.todo = todo;
12881+	return nplug->shift(&from, left, SHIFT_LEFT,
12882+			    1 /* delete @right if it becomes empty */,
12883+			    1 /* move coord @from to node @left if everything is shifted */,
12884+			    &info);
12887+}
12888+
12889+/* Shift as much as possible from @right to @left using the memcpy-optimized
12890+ shift_everything_left. @left and @right are formatted neighboring nodes on
12891+ leaf level. */
12892+static int squeeze_right_non_twig(znode * left, znode * right)
12893+{
12894+ int ret;
12895+ carry_pool *pool;
12896+ carry_level *todo;
12897+
12898+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12899+
12900+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12901+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12902+ return SQUEEZE_TARGET_FULL;
12903+
12904+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12905+ if (IS_ERR(pool))
12906+ return PTR_ERR(pool);
12907+ todo = (carry_level *) (pool + 1);
12908+ init_carry_level(todo, pool);
12909+
12910+ ret = shift_everything_left(right, left, todo);
12911+ if (ret > 0) {
12912+ /* something was shifted */
12913+ reiser4_tree *tree;
12914+ __u64 grabbed;
12915+
12916+ znode_make_dirty(left);
12917+ znode_make_dirty(right);
12918+
12919+ /* update delimiting keys of nodes which participated in
12920+ shift. FIXME: it would be better to have this in shift
12921+ node's operation. But it can not be done there. Nobody
12922+ remembers why, though */
12923+ tree = znode_get_tree(left);
12924+ write_lock_dk(tree);
12925+ update_znode_dkeys(left, right);
12926+ write_unlock_dk(tree);
12927+
12928+ /* Carry is called to update delimiting key and, maybe, to remove empty
12929+ node. */
12930+ grabbed = get_current_context()->grabbed_blocks;
12931+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
12932+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
12933+		ret = reiser4_carry(todo, NULL /* previous level */ );
12934+ grabbed2free_mark(grabbed);
12935+ } else {
12936+		/* Shifting was impossible; return the appropriate result code */
12937+ ret =
12938+ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
12939+ SQUEEZE_TARGET_FULL;
12940+ }
12941+
12942+ done_carry_pool(pool);
12943+
12944+ return ret;
12945+}
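
Editor's note: squeeze_right_non_twig() above and shift_one_internal_unit()
below bracket the carry call with the same idiom: save the context's
grabbed-blocks counter as a watermark, force-grab reserved space, run carry,
then release everything grabbed past the watermark. A self-contained sketch of
the idiom with stand-in counters (assumed to mirror what grabbed2free_mark()
does; this is not the patch's code):

static unsigned long grabbed_blocks;	/* stand-in for ctx->grabbed_blocks */

static void grab(unsigned long n)
{
	grabbed_blocks += n;
}

/* release everything grabbed past @mark, as grabbed2free_mark() does */
static void free_to_mark(unsigned long mark)
{
	grabbed_blocks = mark;
}

static void carry_with_reserved_space(unsigned long tree_height)
{
	unsigned long mark = grabbed_blocks;	/* save the watermark */

	grab(tree_height);	/* reserve room for delimiting-key updates */
	/* ... run the carry operation here ... */
	free_to_mark(mark);	/* roll the counter back to the watermark */
}
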
12946+
12947+#if REISER4_DEBUG
12948+static int sibling_link_is_ok(const znode *left, const znode *right)
12949+{
12950+ int result;
12951+
12952+ read_lock_tree(znode_get_tree(left));
12953+ result = (left->right == right && left == right->left);
12954+ read_unlock_tree(znode_get_tree(left));
12955+ return result;
12956+}
12957+#endif
12958+
12959+/* Shift first unit of first item if it is an internal one. Return
12960+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
12961+ SUBTREE_MOVED. */
12962+static int shift_one_internal_unit(znode * left, znode * right)
12963+{
12964+ int ret;
12965+ carry_pool *pool;
12966+ carry_level *todo;
12967+ coord_t *coord;
12968+ carry_plugin_info *info;
12969+ int size, moved;
12970+
12971+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
12972+ assert("nikita-2435", znode_is_write_locked(left));
12973+ assert("nikita-2436", znode_is_write_locked(right));
12974+ assert("nikita-2434", sibling_link_is_ok(left, right));
12975+
12976+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
12977+ sizeof(*coord) + sizeof(*info)
12978+#if REISER4_DEBUG
12979+ + sizeof(*coord) + 2 * sizeof(reiser4_key)
12980+#endif
12981+ );
12982+ if (IS_ERR(pool))
12983+ return PTR_ERR(pool);
12984+ todo = (carry_level *) (pool + 1);
12985+ init_carry_level(todo, pool);
12986+
12987+ coord = (coord_t *) (todo + 3);
12988+ coord_init_first_unit(coord, right);
12989+ info = (carry_plugin_info *) (coord + 1);
12990+
12991+#if REISER4_DEBUG
12992+ if (!node_is_empty(left)) {
12993+ coord_t *last;
12994+ reiser4_key *right_key;
12995+ reiser4_key *left_key;
12996+
12997+ last = (coord_t *) (info + 1);
12998+ right_key = (reiser4_key *) (last + 1);
12999+ left_key = right_key + 1;
13000+ coord_init_last_unit(last, left);
13001+
13002+ assert("nikita-2463",
13003+ keyle(item_key_by_coord(last, left_key),
13004+ item_key_by_coord(coord, right_key)));
13005+ }
13006+#endif
13007+
13008+ assert("jmacd-2007", item_is_internal(coord));
13009+
13010+ size = item_length_by_coord(coord);
13011+ info->todo = todo;
13012+ info->doing = NULL;
13013+
13014+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13015+					       1 /* delete @right if it becomes empty */,
13016+					       0 /* do not move coord @coord to node @left */,
13017+					       info);
13022+
13023+ /* If shift returns positive, then we shifted the item. */
13024+ assert("vs-423", ret <= 0 || size == ret);
13025+ moved = (ret > 0);
13026+
13027+ if (moved) {
13028+ /* something was moved */
13029+ reiser4_tree *tree;
13030+ int grabbed;
13031+
13032+ znode_make_dirty(left);
13033+ znode_make_dirty(right);
13034+ tree = znode_get_tree(left);
13035+ write_lock_dk(tree);
13036+ update_znode_dkeys(left, right);
13037+ write_unlock_dk(tree);
13038+
13039+ /* reserve space for delimiting keys after shifting */
13040+ grabbed = get_current_context()->grabbed_blocks;
13041+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13042+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13043+
13044+		ret = reiser4_carry(todo, NULL /* previous level */ );
13045+ grabbed2free_mark(grabbed);
13046+ }
13047+
13048+ done_carry_pool(pool);
13049+
13050+ if (ret != 0) {
13051+ /* Shift or carry operation failed. */
13052+ assert("jmacd-7325", ret < 0);
13053+ return ret;
13054+ }
13055+
13056+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13057+}
13058+
13059+/* Make the final relocate/wander decision during forward parent-first squalloc for a
13060+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13061+static int
13062+allocate_znode_loaded(znode * node,
13063+ const coord_t * parent_coord, flush_pos_t * pos)
13064+{
13065+ int ret;
13066+ reiser4_super_info_data *sbinfo = get_current_super_private();
13067+	/* FIXME(D): We have the node write-locked and should have checked for
13068+	   !allocated() somewhere before reaching this point, but there can be a race,
13069+	   so this assertion is bogus. */
13070+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13071+ assert("jmacd-7988", znode_is_write_locked(node));
13072+ assert("jmacd-7989", coord_is_invalid(parent_coord)
13073+ || znode_is_write_locked(parent_coord->node));
13074+
13075+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13076+ znode_is_root(node) ||
13077+ /* We have enough nodes to relocate no matter what. */
13078+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13079+ /* No need to decide with new nodes, they are treated the same as
13080+ relocate. If the root node is dirty, relocate. */
13081+ if (pos->preceder.blk == 0) {
13082+			/* preceder is unknown and we have decided to relocate the node --
13083+			   using the default value for the search start is better than
13084+			   searching from block #0. */
13085+ get_blocknr_hint_default(&pos->preceder.blk);
13086+ check_preceder(pos->preceder.blk);
13087+ }
13088+
13089+ goto best_reloc;
13090+
13091+ } else if (pos->preceder.blk == 0) {
13092+ /* If we don't know the preceder, leave it where it is. */
13093+ jnode_make_wander(ZJNODE(node));
13094+ } else {
13095+ /* Make a decision based on block distance. */
13096+ reiser4_block_nr dist;
13097+ reiser4_block_nr nblk = *znode_get_block(node);
13098+
13099+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13100+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13101+ assert("jmacd-6174", pos->preceder.blk != 0);
13102+
13103+ if (pos->preceder.blk == nblk - 1) {
13104+ /* Ideal. */
13105+ jnode_make_wander(ZJNODE(node));
13106+ } else {
13107+
13108+			dist = (nblk < pos->preceder.blk) ?
13109+			    (pos->preceder.blk - nblk) :
13110+			    (nblk - pos->preceder.blk);
13113+
13114+ /* See if we can find a closer block (forward direction only). */
13115+			pos->preceder.max_dist =
13116+			    min((reiser4_block_nr) sbinfo->flush.relocate_distance,
13117+				dist);
13118+ pos->preceder.level = znode_get_level(node);
13119+
13120+ ret = allocate_znode_update(node, parent_coord, pos);
13121+
13122+ pos->preceder.max_dist = 0;
13123+
13124+ if (ret && (ret != -ENOSPC))
13125+ return ret;
13126+
13127+ if (ret == 0) {
13128+ /* Got a better allocation. */
13129+ znode_make_reloc(node, pos->fq);
13130+ } else if (dist < sbinfo->flush.relocate_distance) {
13131+ /* The present allocation is good enough. */
13132+ jnode_make_wander(ZJNODE(node));
13133+ } else {
13134+ /* Otherwise, try to relocate to the best position. */
13135+ best_reloc:
13136+ ret =
13137+ allocate_znode_update(node, parent_coord,
13138+ pos);
13139+ if (ret != 0)
13140+ return ret;
13141+
13142+ /* set JNODE_RELOC bit _after_ node gets allocated */
13143+ znode_make_reloc(node, pos->fq);
13144+ }
13145+ }
13146+ }
13147+
13148+ /* This is the new preceder. */
13149+ pos->preceder.blk = *znode_get_block(node);
13150+ check_preceder(pos->preceder.blk);
13151+ pos->alloc_cnt += 1;
13152+
13153+	assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13154+
13155+ return 0;
13156+}
13157+
13158+static int
13159+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13160+{
13161+ /*
13162+ * perform znode allocation with znode pinned in memory to avoid races
13163+ * with asynchronous emergency flush (which plays with
13164+ * JNODE_FLUSH_RESERVED bit).
13165+ */
13166+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13167+}
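
Editor's note: stripped of locking and allocation details, the relocate/wander
policy in allocate_znode_loaded() reduces to a distance test against the
preceder block. A simplified standalone sketch (the decide() helper is
hypothetical; the real code additionally caps the allocator search via
pos->preceder.max_dist before falling back):

#include <stdint.h>

enum policy { WANDER, TRY_CLOSER, RELOCATE };

static enum policy decide(uint64_t preceder, uint64_t current,
			  uint64_t relocate_distance)
{
	uint64_t dist;

	if (preceder == 0)
		return WANDER;		/* preceder unknown: leave the block */
	if (preceder + 1 == current)
		return WANDER;		/* already ideally placed */
	dist = current < preceder ? preceder - current : current - preceder;
	if (dist < relocate_distance)
		return TRY_CLOSER;	/* acceptable, but probe for closer */
	return RELOCATE;		/* too far from the preceder */
}

WANDER leaves the block where it is (overwrite set); RELOCATE writes it near
the preceder (relocate set).
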
13168+
13169+/* A subroutine of allocate_znode, this is called first to see if there is a close
13170+   position to relocate to. It may return -ENOSPC if there is no close position, in
13171+   which case the node is not relocated. This takes care of updating the parent node
13172+   with the relocated block address. */
13173+static int
13174+allocate_znode_update(znode * node, const coord_t * parent_coord,
13175+ flush_pos_t * pos)
13176+{
13177+ int ret;
13178+ reiser4_block_nr blk;
13179+ lock_handle uber_lock;
13180+ int flush_reserved_used = 0;
13181+ int grabbed;
13182+ reiser4_context *ctx;
13183+ reiser4_super_info_data *sbinfo;
13184+
13185+ init_lh(&uber_lock);
13186+
13187+ ctx = get_current_context();
13188+ sbinfo = get_super_private(ctx->super);
13189+
13190+ grabbed = ctx->grabbed_blocks;
13191+
13192+ /* discard e-flush allocation */
13193+ ret = zload(node);
13194+ if (ret)
13195+ return ret;
13196+
13197+ if (ZF_ISSET(node, JNODE_CREATED)) {
13198+		assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13199+ pos->preceder.block_stage = BLOCK_UNALLOCATED;
13200+ } else {
13201+ pos->preceder.block_stage = BLOCK_GRABBED;
13202+
13203+		/* The disk space for relocating @node is already reserved in the "flush
13204+		 * reserved" counter if @node is a leaf; otherwise we grab space using
13205+		 * BA_RESERVED (which means grabbing from the whole disk, not only 95% of it). */
13206+ if (znode_get_level(node) == LEAF_LEVEL) {
13207+ /*
13208+ * earlier (during do_jnode_make_dirty()) we decided
13209+ * that @node can possibly go into overwrite set and
13210+ * reserved block for its wandering location.
13211+ */
13212+ txn_atom *atom = get_current_atom_locked();
13213+ assert("nikita-3449",
13214+ ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13215+ flush_reserved2grabbed(atom, (__u64) 1);
13216+ spin_unlock_atom(atom);
13217+ /*
13218+ * we are trying to move node into relocate
13219+ * set. Allocation of relocated position "uses"
13220+ * reserved block.
13221+ */
13222+ ZF_CLR(node, JNODE_FLUSH_RESERVED);
13223+ flush_reserved_used = 1;
13224+ } else {
13225+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13226+ if (ret != 0)
13227+ goto exit;
13228+ }
13229+ }
13230+
13231+	/* We may not use the reserved 5% of disk space here, in which case flush will not pack tightly. */
13232+ ret = reiser4_alloc_block(&pos->preceder, &blk,
13233+ BA_FORMATTED | BA_PERMANENT);
13234+ if (ret)
13235+ goto exit;
13236+
13237+ if (!ZF_ISSET(node, JNODE_CREATED) &&
13238+ (ret =
13239+ reiser4_dealloc_block(znode_get_block(node), 0,
13240+ BA_DEFER | BA_FORMATTED)))
13241+ goto exit;
13242+
13243+ if (likely(!znode_is_root(node))) {
13244+ item_plugin *iplug;
13245+
13246+ iplug = item_plugin_by_coord(parent_coord);
13247+ assert("nikita-2954", iplug->f.update != NULL);
13248+ iplug->f.update(parent_coord, &blk);
13249+
13250+ znode_make_dirty(parent_coord->node);
13251+
13252+ } else {
13253+ reiser4_tree *tree = znode_get_tree(node);
13254+ znode *uber;
13255+
13256+ /* We take a longterm lock on the fake node in order to change
13257+ the root block number. This may cause atom fusion. */
13258+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13259+ &uber_lock);
13260+ /* The fake node cannot be deleted, and we must have priority
13261+ here, and may not be confused with ENOSPC. */
13262+ assert("jmacd-74412",
13263+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13264+
13265+ if (ret)
13266+ goto exit;
13267+
13268+ uber = uber_lock.node;
13269+
13270+ write_lock_tree(tree);
13271+ tree->root_block = blk;
13272+ write_unlock_tree(tree);
13273+
13274+ znode_make_dirty(uber);
13275+ }
13276+
13277+ ret = znode_rehash(node, &blk);
13278+ exit:
13279+ if (ret) {
13280+		/* Get the flush reserved block back if something fails, because
13281+		 * callers assume that on error the block wasn't relocated and its
13282+		 * flush reserved block wasn't used. */
13283+ if (flush_reserved_used) {
13284+ /*
13285+ * ok, we failed to move node into relocate
13286+ * set. Restore status quo.
13287+ */
13288+ grabbed2flush_reserved((__u64) 1);
13289+ ZF_SET(node, JNODE_FLUSH_RESERVED);
13290+ }
13291+ }
13292+ zrelse(node);
13293+ done_lh(&uber_lock);
13294+ grabbed2free_mark(grabbed);
13295+ return ret;
13296+}
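
Editor's note: the error path of allocate_znode_update() has the standard
reserve-then-rollback shape: a flush-reserved block is converted into a
grabbed block up front and converted back if any later step fails, so callers
observe an unchanged state. A self-contained sketch with stand-in counters
(hypothetical names, not the patch's API):

#include <errno.h>

static int flush_reserved = 1, grabbed;	/* stand-in counters */

static void reserved2grabbed(int n) { flush_reserved -= n; grabbed += n; }
static void grabbed2reserved(int n) { grabbed -= n; flush_reserved += n; }

static int allocate_or_rehash(void)
{
	return -ENOSPC;		/* simulate any of the failing steps */
}

static int relocate_with_rollback(void)
{
	int ret;

	reserved2grabbed(1);	/* consume the flush-reserved block */
	ret = allocate_or_rehash();
	if (ret)
		grabbed2reserved(1);	/* on error, restore status quo */
	return ret;
}
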
13297+
13298+/* JNODE INTERFACE */
13299+
13300+/* Lock a node (if formatted) and then get its parent locked, set the child's
13301+ coordinate in the parent. If the child is the root node, the above_root
13302+ znode is returned but the coord is not set. This function may cause atom
13303+ fusion, but it is only used for read locks (at this point) and therefore
13304+ fusion only occurs when the parent is already dirty. */
13305+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13306+ pointer in jnodes. */
13307+static int
13308+jnode_lock_parent_coord(jnode * node,
13309+ coord_t * coord,
13310+ lock_handle * parent_lh,
13311+ load_count * parent_zh,
13312+ znode_lock_mode parent_mode, int try)
13313+{
13314+ int ret;
13315+
13316+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13317+ assert("edward-54", jnode_is_unformatted(node)
13318+ || znode_is_any_locked(JZNODE(node)));
13319+
13320+ if (!jnode_is_znode(node)) {
13321+ reiser4_key key;
13322+ tree_level stop_level = TWIG_LEVEL;
13323+ lookup_bias bias = FIND_EXACT;
13324+
13325+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13326+
13327+ /* The case when node is not znode, but can have parent coord
13328+ (unformatted node, node which represents cluster page,
13329+ etc..). Generate a key for the appropriate entry, search
13330+ in the tree using coord_by_key, which handles locking for
13331+ us. */
13332+
13333+ /*
13334+ * nothing is locked at this moment, so, nothing prevents
13335+ * concurrent truncate from removing jnode from inode. To
13336+ * prevent this spin-lock jnode. jnode can be truncated just
13337+ * after call to the jnode_build_key(), but this is ok,
13338+ * because coord_by_key() will just fail to find appropriate
13339+ * extent.
13340+ */
13341+ spin_lock_jnode(node);
13342+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13343+ jnode_build_key(node, &key);
13344+ ret = 0;
13345+ } else
13346+ ret = RETERR(-ENOENT);
13347+ spin_unlock_jnode(node);
13348+
13349+ if (ret != 0)
13350+ return ret;
13351+
13352+ if (jnode_is_cluster_page(node))
13353+ stop_level = LEAF_LEVEL;
13354+
13355+ assert("jmacd-1812", coord != NULL);
13356+
13357+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13358+ parent_mode, bias, stop_level, stop_level,
13359+ CBK_UNIQUE, NULL /*ra_info */ );
13360+ switch (ret) {
13361+ case CBK_COORD_NOTFOUND:
13362+ assert("edward-1038",
13363+ ergo(jnode_is_cluster_page(node),
13364+ JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13365+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13366+ warning("nikita-3177", "Parent not found");
13367+ return ret;
13368+ case CBK_COORD_FOUND:
13369+ if (coord->between != AT_UNIT) {
13370+ /* FIXME: comment needed */
13371+ done_lh(parent_lh);
13372+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13373+ warning("nikita-3178",
13374+ "Found but not happy: %i",
13375+ coord->between);
13376+ }
13377+ return RETERR(-ENOENT);
13378+ }
13379+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13380+ if (ret != 0)
13381+ return ret;
13382+ /* if (jnode_is_cluster_page(node)) {
13383+ races with write() are possible
13384+ check_child_cluster (parent_lh->node);
13385+ }
13386+ */
13387+ break;
13388+ default:
13389+ return ret;
13390+ }
13391+
13392+ } else {
13393+ int flags;
13394+ znode *z;
13395+
13396+ z = JZNODE(node);
13397+ /* Formatted node case: */
13398+ assert("jmacd-2061", !znode_is_root(z));
13399+
13400+ flags = GN_ALLOW_NOT_CONNECTED;
13401+ if (try)
13402+ flags |= GN_TRY_LOCK;
13403+
13404+ ret =
13405+ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13406+ if (ret != 0)
13407+ /* -E_REPEAT is ok here, it is handled by the caller. */
13408+ return ret;
13409+
13410+ /* Make the child's position "hint" up-to-date. (Unless above
13411+ root, which caller must check.) */
13412+ if (coord != NULL) {
13413+
13414+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13415+ if (ret != 0) {
13416+ warning("jmacd-976812386",
13417+ "incr_load_count_znode failed: %d",
13418+ ret);
13419+ return ret;
13420+ }
13421+
13422+ ret = find_child_ptr(parent_lh->node, z, coord);
13423+ if (ret != 0) {
13424+ warning("jmacd-976812",
13425+ "find_child_ptr failed: %d", ret);
13426+ return ret;
13427+ }
13428+ }
13429+ }
13430+
13431+ return 0;
13432+}
13433+
13434+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13435+ If there is no next neighbor or the neighbor is not in memory or if there is a
13436+ neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13437+ In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */
13438+static int neighbor_in_slum(znode * node, /* starting point */
13439+ lock_handle * lock, /* lock on starting point */
13440+ sideof side, /* left or right direction we seek the next node in */
13441+ znode_lock_mode mode, /* kind of lock we want */
13442+ int check_dirty, /* true if the neighbor should be dirty */
13443+			    int use_upper_levels /* get neighbor by going through
13444+ upper levels */)
13445+{
13446+	int ret;
13447+	int flags;
13448+
13449+ assert("jmacd-6334", znode_is_connected(node));
13450+
13451+ flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13452+ if (use_upper_levels)
13453+ flags |= GN_CAN_USE_UPPER_LEVELS;
13454+
13455+	ret = reiser4_get_neighbor(lock, node, mode, flags);
13456+ if (ret) {
13457+ /* May return -ENOENT or -E_NO_NEIGHBOR. */
13458+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13459+ if (ret == -ENOENT) {
13460+ ret = RETERR(-E_NO_NEIGHBOR);
13461+ }
13462+ return ret;
13463+ }
13464+ if (!check_dirty)
13465+ return 0;
13466+ /* Check dirty bit of locked znode, no races here */
13467+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13468+ return 0;
13469+
13470+ done_lh(lock);
13471+ return RETERR(-E_NO_NEIGHBOR);
13472+}
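
Editor's note: -E_NO_NEIGHBOR from neighbor_in_slum() is not a failure but the
normal "slum edge reached" signal, as the scan_by_coord() and squalloc() code
in this file shows. A hedged usage fragment in the same style (illustrative
only, not a complete function):

	lock_handle right;
	int ret;

	init_lh(&right);
	ret = neighbor_in_slum(pos->lock.node, &right, RIGHT_SIDE,
			       ZNODE_WRITE_LOCK, 1 /* check dirty */,
			       0 /* stay on this level */);
	if (ret == -E_NO_NEIGHBOR) {
		pos_stop(pos);	/* end of slum: finish cleanly */
		ret = 0;
	} else if (ret == 0) {
		/* ... continue flushing into right.node ... */
		done_lh(&right);
	}
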
13473+
13474+/* Return true if two znodes have the same parent. This is called with both nodes
13475+ write-locked (for squeezing) so no tree lock is needed. */
13476+static int znode_same_parents(znode * a, znode * b)
13477+{
13478+ int result;
13479+
13480+ assert("jmacd-7011", znode_is_write_locked(a));
13481+ assert("jmacd-7012", znode_is_write_locked(b));
13482+
13483+ /* We lock the whole tree for this check.... I really don't like whole tree
13484+ * locks... -Hans */
13485+ read_lock_tree(znode_get_tree(a));
13486+ result = (znode_parent(a) == znode_parent(b));
13487+ read_unlock_tree(znode_get_tree(a));
13488+ return result;
13489+}
13490+
13491+/* FLUSH SCAN */
13492+
13493+/* Initialize the flush_scan data structure. */
13494+static void scan_init(flush_scan * scan)
13495+{
13496+ memset(scan, 0, sizeof(*scan));
13497+ init_lh(&scan->node_lock);
13498+ init_lh(&scan->parent_lock);
13499+ init_load_count(&scan->parent_load);
13500+ init_load_count(&scan->node_load);
13501+ coord_init_invalid(&scan->parent_coord, NULL);
13502+}
13503+
13504+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
13505+static void scan_done(flush_scan * scan)
13506+{
13507+ done_load_count(&scan->node_load);
13508+ if (scan->node != NULL) {
13509+ jput(scan->node);
13510+ scan->node = NULL;
13511+ }
13512+ done_load_count(&scan->parent_load);
13513+ done_lh(&scan->parent_lock);
13514+ done_lh(&scan->node_lock);
13515+}
13516+
13517+/* Returns true if flush scanning is finished. */
13518+int reiser4_scan_finished(flush_scan * scan)
13519+{
13520+ return scan->stop || (scan->direction == RIGHT_SIDE &&
13521+ scan->count >= scan->max_count);
13522+}
13523+
13524+/* Return true if the scan should continue to @tonode. True if the node meets the
13525+   same_slum_check condition. If not, deref the "left" node and stop the scan. */
13526+int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13527+{
13528+ int go = same_slum_check(scan->node, tonode, 1, 0);
13529+
13530+ if (!go) {
13531+ scan->stop = 1;
13532+ jput(tonode);
13533+ }
13534+
13535+ return go;
13536+}
13537+
13538+/* Set the current scan->node, refcount it, increment count by the @add_count (number to
13539+ count, e.g., skipped unallocated nodes), deref previous current, and copy the current
13540+ parent coordinate. */
13541+int
13542+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13543+ const coord_t * parent)
13544+{
13545+ /* Release the old references, take the new reference. */
13546+ done_load_count(&scan->node_load);
13547+
13548+ if (scan->node != NULL) {
13549+ jput(scan->node);
13550+ }
13551+ scan->node = node;
13552+ scan->count += add_count;
13553+
13554+	/* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could
13555+ delay this update step until it finishes and update the parent_coord only once.
13556+ It did that before, but there was a bug and this was the easiest way to make it
13557+ correct. */
13558+ if (parent != NULL) {
13559+ coord_dup(&scan->parent_coord, parent);
13560+ }
13561+
13562+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13563+ is safely taken. */
13564+ return incr_load_count_jnode(&scan->node_load, node);
13565+}
13566+
13567+/* Return true if scanning in the leftward direction. */
13568+int reiser4_scanning_left(flush_scan * scan)
13569+{
13570+ return scan->direction == LEFT_SIDE;
13571+}
13572+
13573+/* Performs leftward scanning starting from either kind of node. Counts the starting
13574+ node. The right-scan object is passed in for the left-scan in order to copy the parent
13575+   node's parent when scanning in each direction. If we do search for the parent, it
13576+   is then set in both scan objects. The limit parameter tells flush-scan when to stop.
13577+ set in both scan objects. The limit parameter tells flush-scan when to stop.
13578+
13579+ Rapid scanning is used only during scan_left, where we are interested in finding the
13580+ 'leftpoint' where we begin flushing. We are interested in stopping at the left child
13581+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
13582+ problem is finding a way to flush only those nodes without unallocated children, and it
13583+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The
13584+ problem can be solved by scanning left at every level as we go upward, but this would
13585+ basically bring us back to using a top-down allocation strategy, which we already tried
13586+ (see BK history from May 2002), and has a different set of problems. The top-down
13587+ strategy makes avoiding unallocated children easier, but makes it difficult to
13588+ propertly flush dirty children with clean parents that would otherwise stop the
13589+ top-down flush, only later to dirty the parent once the children are flushed. So we
13590+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13591+ only.
13592+
13593+ The first step in solving the problem is this rapid leftward scan. After we determine
13594+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
13595+   are no longer interested in the exact count, we are only interested in finding the
13596+ best place to start the flush. We could choose one of two possibilities:
13597+
13598+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13599+ This requires checking one leaf per rapid-scan twig
13600+
13601+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13602+ to the left. This requires checking possibly all of the in-memory children of each
13603+ twig during the rapid scan.
13604+
13605+ For now we implement the first policy.
13606+*/
13607+static int
13608+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13609+{
13610+ int ret = 0;
13611+
13612+ scan->max_count = limit;
13613+ scan->direction = LEFT_SIDE;
13614+
13615+ ret = scan_set_current(scan, jref(node), 1, NULL);
13616+ if (ret != 0) {
13617+ return ret;
13618+ }
13619+
13620+ ret = scan_common(scan, right);
13621+ if (ret != 0) {
13622+ return ret;
13623+ }
13624+
13625+	/* Before rapid scanning, we need a lock on scan->node so that we can get
13626+	   its parent, but only if the node is formatted. */
13627+ if (jnode_is_znode(scan->node)) {
13628+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13629+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13630+ }
13631+
13632+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13633+ return ret;
13634+}
13635+
13636+/* Performs rightward scanning... Does not count the starting node. The limit parameter
13637+ is described in scan_left. If the starting node is unformatted then the
13638+ parent_coord was already set during scan_left. The rapid_after parameter is not used
13639+ during right-scanning.
13640+
13641+ scan_right is only called if the scan_left operation does not count at least
13642+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
13643+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13644+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13645+static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13646+{
13647+ int ret;
13648+
13649+ scan->max_count = limit;
13650+ scan->direction = RIGHT_SIDE;
13651+
13652+ ret = scan_set_current(scan, jref(node), 0, NULL);
13653+ if (ret != 0) {
13654+ return ret;
13655+ }
13656+
13657+ return scan_common(scan, NULL);
13658+}
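
Editor's note: taken together, the comments on scan_left() and scan_right()
describe a shared node budget: scan right only for whatever part of
FLUSH_RELOCATE_THRESHOLD the left scan did not already count. A hedged sketch
of a driver (modeled on the description above; the actual caller, presumably
jnode_flush(), is not shown in this hunk):

	ret = scan_left(&left_scan, &right_scan, start, FLUSH_SCAN_MAXNODES);
	if (ret)
		goto failed;

	if (left_scan.count < FLUSH_RELOCATE_THRESHOLD) {
		/* not enough dirty nodes to the left: top up on the right */
		ret = scan_right(&right_scan, start,
				 FLUSH_RELOCATE_THRESHOLD - left_scan.count);
		if (ret)
			goto failed;
	}
	/* enough adjacent dirty nodes in total suggest a relocate policy */
	leaf_relocate = left_scan.count + right_scan.count >=
	    FLUSH_RELOCATE_THRESHOLD;
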
13659+
13660+/* Common code to perform left or right scanning. */
13661+static int scan_common(flush_scan * scan, flush_scan * other)
13662+{
13663+ int ret;
13664+
13665+ assert("nikita-2376", scan->node != NULL);
13666+ assert("edward-54", jnode_is_unformatted(scan->node)
13667+ || jnode_is_znode(scan->node));
13668+
13669+ /* Special case for starting at an unformatted node. Optimization: we only want
13670+ to search for the parent (which requires a tree traversal) once. Obviously, we
13671+ shouldn't have to call it once for the left scan and once for the right scan.
13672+ For this reason, if we search for the parent during scan-left we then duplicate
13673+ the coord/lock/load into the scan-right object. */
13674+ if (jnode_is_unformatted(scan->node)) {
13675+ ret = scan_unformatted(scan, other);
13676+ if (ret != 0)
13677+ return ret;
13678+ }
13679+ /* This loop expects to start at a formatted position and performs chaining of
13680+ formatted regions */
13681+	while (!reiser4_scan_finished(scan)) {
13682+
13683+ ret = scan_formatted(scan);
13684+ if (ret != 0) {
13685+ return ret;
13686+ }
13687+ }
13688+
13689+ return 0;
13690+}
13691+
13692+static int scan_unformatted(flush_scan * scan, flush_scan * other)
13693+{
13694+ int ret = 0;
13695+ int try = 0;
13696+
13697+ if (!coord_is_invalid(&scan->parent_coord))
13698+ goto scan;
13699+
13700+	/* set the parent coord from either a formatted or an unformatted position */
13701+ if (!jnode_is_unformatted(scan->node)) {
13702+ /* formatted position */
13703+
13704+ lock_handle lock;
13705+ assert("edward-301", jnode_is_znode(scan->node));
13706+ init_lh(&lock);
13707+
13708+ /*
13709+ * when flush starts from unformatted node, first thing it
13710+ * does is tree traversal to find formatted parent of starting
13711+		 * node. This parent is then kept locked across scans to the
13712+ * left and to the right. This means that during scan to the
13713+ * left we cannot take left-ward lock, because this is
13714+ * dead-lock prone. So, if we are scanning to the left and
13715+ * there is already lock held by this thread,
13716+ * jnode_lock_parent_coord() should use try-lock.
13717+ */
13718+		try = reiser4_scanning_left(scan)
13719+ && !lock_stack_isclean(get_current_lock_stack());
13720+ /* Need the node locked to get the parent lock, We have to
13721+ take write lock since there is at least one call path
13722+ where this znode is already write-locked by us. */
13723+ ret =
13724+ longterm_lock_znode(&lock, JZNODE(scan->node),
13725+ ZNODE_WRITE_LOCK,
13726+ reiser4_scanning_left(scan) ?
13727+ ZNODE_LOCK_LOPRI :
13728+ ZNODE_LOCK_HIPRI);
13729+ if (ret != 0)
13730+ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
13731+ scanned too far and can't back out, just start over. */
13732+ return ret;
13733+
13734+ ret = jnode_lock_parent_coord(scan->node,
13735+ &scan->parent_coord,
13736+ &scan->parent_lock,
13737+ &scan->parent_load,
13738+ ZNODE_WRITE_LOCK, try);
13739+
13740+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13741+ done_lh(&lock);
13742+ if (ret == -E_REPEAT) {
13743+ scan->stop = 1;
13744+ return 0;
13745+ }
13746+ if (ret)
13747+ return ret;
13748+
13749+ } else {
13750+ /* unformatted position */
13751+
13752+ ret =
13753+ jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13754+ &scan->parent_lock,
13755+ &scan->parent_load,
13756+ ZNODE_WRITE_LOCK, try);
13757+
13758+ if (IS_CBKERR(ret))
13759+ return ret;
13760+
13761+ if (ret == CBK_COORD_NOTFOUND)
13762+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13763+ return ret;
13764+
13765+ /* parent was found */
13766+ assert("jmacd-8661", other != NULL);
13767+ /* Duplicate the reference into the other flush_scan. */
13768+ coord_dup(&other->parent_coord, &scan->parent_coord);
13769+ copy_lh(&other->parent_lock, &scan->parent_lock);
13770+ copy_load_count(&other->parent_load, &scan->parent_load);
13771+ }
13772+ scan:
13773+ return scan_by_coord(scan);
13774+}
13775+
13776+/* Performs left- or rightward scanning starting from a formatted node. Follow left
13777+ pointers under tree lock as long as:
13778+
13779+ - node->left/right is non-NULL
13780+ - node->left/right is connected, dirty
13781+ - node->left/right belongs to the same atom
13782+ - scan has not reached maximum count
13783+*/
13784+static int scan_formatted(flush_scan * scan)
13785+{
13786+ int ret;
13787+ znode *neighbor = NULL;
13788+
13789+	assert("jmacd-1401", !reiser4_scan_finished(scan));
13790+
13791+ do {
13792+ znode *node = JZNODE(scan->node);
13793+
13794+ /* Node should be connected, but if not stop the scan. */
13795+ if (!znode_is_connected(node)) {
13796+ scan->stop = 1;
13797+ break;
13798+ }
13799+
13800+ /* Lock the tree, check-for and reference the next sibling. */
13801+ read_lock_tree(znode_get_tree(node));
13802+
13803+ /* It may be that a node is inserted or removed between a node and its
13804+ left sibling while the tree lock is released, but the flush-scan count
13805+ does not need to be precise. Thus, we release the tree lock as soon as
13806+ we get the neighboring node. */
13807+		neighbor =
13808+		    reiser4_scanning_left(scan) ? node->left : node->right;
13809+ if (neighbor != NULL) {
13810+ zref(neighbor);
13811+ }
13812+
13813+ read_unlock_tree(znode_get_tree(node));
13814+
13815+ /* If neighbor is NULL at the leaf level, need to check for an unformatted
13816+ sibling using the parent--break in any case. */
13817+ if (neighbor == NULL) {
13818+ break;
13819+ }
13820+
13821+ /* Check the condition for going left, break if it is not met. This also
13822+ releases (jputs) the neighbor if false. */
13823+		if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13824+ break;
13825+ }
13826+
13827+ /* Advance the flush_scan state to the left, repeat. */
13828+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13829+ if (ret != 0) {
13830+ return ret;
13831+ }
13832+
13833+	} while (!reiser4_scan_finished(scan));
13834+
13835+	/* If neighbor is NULL then we reached the end of a formatted region, or else
13836+	   the sibling is out of memory; now check for an extent to the left (only at
13837+	   LEAF_LEVEL). */
13838+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13839+	    || reiser4_scan_finished(scan)) {
13840+ scan->stop = 1;
13841+ return 0;
13842+ }
13843+	/* Otherwise, call scan_by_coord for the right(left)most item of the
13844+	   left(right) neighbor on the parent level, then possibly continue. */
13845+
13846+ coord_init_invalid(&scan->parent_coord, NULL);
13847+ return scan_unformatted(scan, NULL);
13848+}
13849+
13850+/* NOTE-EDWARD:
13851+   This scans adjacent items of the same type and calls the scan flush plugin for each one.
13852+   Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start
13853+   from an unformatted node, then we continue only if the next neighbor is also unformatted.
13854+   When called from scan_formatted, we skip the first iteration (to make sure that the
13855+   right(left)most item of the left(right) neighbor on the parent level is of the same
13856+   type, and to set the appropriate coord). */
13857+static int scan_by_coord(flush_scan * scan)
13858+{
13859+ int ret = 0;
13860+ int scan_this_coord;
13861+ lock_handle next_lock;
13862+ load_count next_load;
13863+ coord_t next_coord;
13864+ jnode *child;
13865+ item_plugin *iplug;
13866+
13867+ init_lh(&next_lock);
13868+ init_load_count(&next_load);
13869+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13870+
13871+ /* set initial item id */
13872+ iplug = item_plugin_by_coord(&scan->parent_coord);
13873+
13874+	for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13875+ if (scan_this_coord) {
13876+			/* Here we expect that the unit is scannable. It might
13877+			 * not be so due to a race with extent->tail conversion. */
13878+ if (iplug->f.scan == NULL) {
13879+ scan->stop = 1;
13880+ ret = -E_REPEAT;
13881+ /* skip the check at the end. */
13882+ goto race;
13883+ }
13884+
13885+ ret = iplug->f.scan(scan);
13886+ if (ret != 0)
13887+ goto exit;
13888+
13889+			if (reiser4_scan_finished(scan)) {
13890+ checkchild(scan);
13891+ break;
13892+ }
13893+ } else {
13894+ /* the same race against truncate as above is possible
13895+ * here, it seems */
13896+
13897+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13898+ the first coordinate. */
13899+ assert("jmacd-1231",
13900+ item_is_internal(&scan->parent_coord));
13901+ }
13902+
13903+ if (iplug->f.utmost_child == NULL
13904+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13905+			/* stop at this coord and continue on the parent level */
13906+			ret = scan_set_current(scan,
13907+					       ZJNODE(zref(scan->parent_coord.node)),
13908+					       1, NULL);
13911+ if (ret != 0)
13912+ goto exit;
13913+ break;
13914+ }
13915+
13916+ /* Either way, the invariant is that scan->parent_coord is set to the
13917+ parent of scan->node. Now get the next unit. */
13918+ coord_dup(&next_coord, &scan->parent_coord);
13919+ coord_sideof_unit(&next_coord, scan->direction);
13920+
13921+ /* If off-the-end of the twig, try the next twig. */
13922+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
13923+ /* We take the write lock because we may start flushing from this
13924+ * coordinate. */
13925+ ret = neighbor_in_slum(next_coord.node,
13926+ &next_lock,
13927+ scan->direction,
13928+ ZNODE_WRITE_LOCK,
13929+ 1 /* check dirty */,
13930+					       0 /* don't go through upper
13931+ levels */);
13932+ if (ret == -E_NO_NEIGHBOR) {
13933+ scan->stop = 1;
13934+ ret = 0;
13935+ break;
13936+ }
13937+
13938+ if (ret != 0) {
13939+ goto exit;
13940+ }
13941+
13942+ ret = incr_load_count_znode(&next_load, next_lock.node);
13943+ if (ret != 0) {
13944+ goto exit;
13945+ }
13946+
13947+ coord_init_sideof_unit(&next_coord, next_lock.node,
13948+ sideof_reverse(scan->direction));
13949+ }
13950+
13951+ iplug = item_plugin_by_coord(&next_coord);
13952+
13953+ /* Get the next child. */
13954+ ret =
13955+ iplug->f.utmost_child(&next_coord,
13956+ sideof_reverse(scan->direction),
13957+ &child);
13958+ if (ret != 0)
13959+ goto exit;
13960+		/* If the next child is not in memory, or item_utmost_child
13961+		   failed (most probably due to a race with unlink), stop
13962+		   here. */
13963+ if (child == NULL || IS_ERR(child)) {
13964+ scan->stop = 1;
13965+ checkchild(scan);
13966+ break;
13967+ }
13968+
13969+ assert("nikita-2374", jnode_is_unformatted(child)
13970+ || jnode_is_znode(child));
13971+
13972+ /* See if it is dirty, part of the same atom. */
13973+		if (!reiser4_scan_goto(scan, child)) {
13974+ checkchild(scan);
13975+ break;
13976+ }
13977+
13978+ /* If so, make this child current. */
13979+ ret = scan_set_current(scan, child, 1, &next_coord);
13980+ if (ret != 0)
13981+ goto exit;
13982+
13983+		/* Now continue. If the child is formatted we release the
13984+		   parent lock and return to formatted scanning. */
13985+ if (jnode_is_znode(child))
13986+ break;
13987+
13988+ /* Otherwise, repeat the above loop with next_coord. */
13989+ if (next_load.node != NULL) {
13990+ done_lh(&scan->parent_lock);
13991+ move_lh(&scan->parent_lock, &next_lock);
13992+ move_load_count(&scan->parent_load, &next_load);
13993+ }
13994+ }
13995+
13996+	assert("jmacd-6233",
13997+	       reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
13998+ exit:
13999+ checkchild(scan);
14000+ race: /* skip the above check */
14001+ if (jnode_is_znode(scan->node)) {
14002+ done_lh(&scan->parent_lock);
14003+ done_load_count(&scan->parent_load);
14004+ }
14005+
14006+ done_load_count(&next_load);
14007+ done_lh(&next_lock);
14008+ return ret;
14009+}
14010+
14011+/* FLUSH POS HELPERS */
14012+
14013+/* Initialize the fields of a flush_position. */
14014+static void pos_init(flush_pos_t * pos)
14015+{
14016+ memset(pos, 0, sizeof *pos);
14017+
14018+ pos->state = POS_INVALID;
14019+ coord_init_invalid(&pos->coord, NULL);
14020+ init_lh(&pos->lock);
14021+ init_load_count(&pos->load);
14022+
14023+	reiser4_blocknr_hint_init(&pos->preceder);
14024+}
14025+
14026+/* The flush loop inside squalloc periodically checks pos_valid to
14027+ determine when "enough flushing" has been performed. This will return true until one
14028+ of the following conditions is met:
14029+
14030+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14031+ parameter, meaning we have flushed as many blocks as the kernel requested. When
14032+ flushing to commit, this parameter is NULL.
14033+
14034+ 2. pos_stop() is called because squalloc discovers that the "next" node in the
14035+   flush order is either non-existent, not dirty, or not in the same atom.
14036+*/
14037+
14038+static int pos_valid(flush_pos_t * pos)
14039+{
14040+ return pos->state != POS_INVALID;
14041+}
14042+
14043+/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14044+static void pos_done(flush_pos_t * pos)
14045+{
14046+ pos_stop(pos);
14047+	reiser4_blocknr_hint_done(&pos->preceder);
14048+ if (convert_data(pos))
14049+ free_convert_data(pos);
14050+}
14051+
14052+/* Reset the point and parent. Called during flush subroutines to terminate the
14053+ squalloc loop. */
14054+static int pos_stop(flush_pos_t * pos)
14055+{
14056+ pos->state = POS_INVALID;
14057+ done_lh(&pos->lock);
14058+ done_load_count(&pos->load);
14059+ coord_init_invalid(&pos->coord, NULL);
14060+
14061+ if (pos->child) {
14062+ jput(pos->child);
14063+ pos->child = NULL;
14064+ }
14065+
14066+ return 0;
14067+}
14068+
14069+/* Return the flush_position's block allocator hint. */
14070+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14071+{
14072+ return &pos->preceder;
14073+}
14074+
14075+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14076+{
14077+ return pos->fq;
14078+}
14079+
14080+/* Make Linus happy.
14081+ Local variables:
14082+ c-indentation-style: "K&R"
14083+ mode-name: "LC"
14084+ c-basic-offset: 8
14085+ tab-width: 8
14086+ fill-column: 90
14087+ LocalWords: preceder
14088+ End:
14089+*/
14090diff -urN linux-2.6.22.orig/fs/reiser4/flush.h linux-2.6.22/fs/reiser4/flush.h
14091--- linux-2.6.22.orig/fs/reiser4/flush.h 1970-01-01 03:00:00.000000000 +0300
14092+++ linux-2.6.22/fs/reiser4/flush.h 2007-07-29 00:25:34.864693371 +0400
14093@@ -0,0 +1,295 @@
14094+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14095+
14096+/* DECLARATIONS: */
14097+
14098+#if !defined(__REISER4_FLUSH_H__)
14099+#define __REISER4_FLUSH_H__
14100+
14101+#include "plugin/cluster.h"
14102+
14103+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14104+ single level of the tree. A flush-scan is used for counting the number of adjacent
14105+ nodes to flush, which is used to determine whether we should relocate, and it is also
14106+ used to find a starting point for flush. A flush-scan object can scan in both right
14107+ and left directions via the scan_left() and scan_right() interfaces. The
14108+ right- and left-variations are similar but perform different functions. When scanning
14109+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14110+ When scanning right we are simply counting the number of adjacent, dirty nodes. */
14111+struct flush_scan {
14112+
14113+ /* The current number of nodes scanned on this level. */
14114+ unsigned count;
14115+
14116+ /* There may be a maximum number of nodes for a scan on any single level. When
14117+ going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14118+ unsigned max_count;
14119+
14120+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14121+ sideof direction;
14122+
14123+	/* Initially @stop is set to false, then set to true once some condition
14124+	   stops the search (e.g., we found a clean node before reaching max_count
14125+	   or we found a node belonging to another atom). */
14126+ int stop;
14127+
14128+ /* The current scan position. If @node is non-NULL then its reference count has
14129+ been incremented to reflect this reference. */
14130+ jnode *node;
14131+
14132+ /* A handle for zload/zrelse of current scan position node. */
14133+ load_count node_load;
14134+
14135+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14136+ node is locked using this lock handle. The endpoint needs to be locked for
14137+ transfer to the flush_position object after scanning finishes. */
14138+ lock_handle node_lock;
14139+
14140+ /* When the position is unformatted, its parent, coordinate, and parent
14141+ zload/zrelse handle. */
14142+ lock_handle parent_lock;
14143+ coord_t parent_coord;
14144+ load_count parent_load;
14145+
14146+ /* The block allocator preceder hint. Sometimes flush_scan determines what the
14147+ preceder is and if so it sets it here, after which it is copied into the
14148+ flush_position. Otherwise, the preceder is computed later. */
14149+ reiser4_block_nr preceder_blk;
14150+};
14151+
14152+struct convert_item_info {
14153+ dc_item_stat d_cur; /* disk cluster state of the current item */
14154+ dc_item_stat d_next; /* disk cluster state of the next slum item */
14155+ struct inode *inode;
14156+ flow_t flow;
14157+};
14158+
14159+struct convert_info {
14160+	int count;		/* for squalloc terminating */
14161+	item_plugin *iplug;	/* current item plugin */
14162+	struct convert_item_info *itm;	/* current item info */
14163+	struct cluster_handle clust;	/* transform cluster */
14164+};
14165+
14166+typedef enum flush_position_state {
14167+ POS_INVALID, /* Invalid or stopped pos, do not continue slum
14168+ * processing */
14169+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14170+ * leaf level */
14171+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14172+ * to traverse unformatted nodes */
14173+ POS_TO_LEAF, /* pos is being moved to leaf level */
14174+ POS_TO_TWIG, /* pos is being moved to twig level */
14175+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14176+ * rightmost unit of the current twig */
14177+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14178+} flushpos_state_t;
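
Editor's note: a small designated-initializer name table makes this state
machine easy to trace in debugging output. A hypothetical helper (a sketch,
not part of the patch):

static const char *flushpos_state_name(flushpos_state_t state)
{
	static const char *names[] = {
		[POS_INVALID]     = "invalid",
		[POS_ON_LEAF]     = "on-leaf",
		[POS_ON_EPOINT]   = "on-extent-point",
		[POS_TO_LEAF]     = "to-leaf",
		[POS_TO_TWIG]     = "to-twig",
		[POS_END_OF_TWIG] = "end-of-twig",
		[POS_ON_INTERNAL] = "on-internal",
	};
	return names[state];
}
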
14179+
14180+/* An encapsulation of the current flush point and all the parameters that are passed
14181+ through the entire squeeze-and-allocate stage of the flush routine. A single
14182+ flush_position object is constructed after left- and right-scanning finishes. */
14183+struct flush_position {
14184+ flushpos_state_t state;
14185+
14186+ coord_t coord; /* coord to traverse unformatted nodes */
14187+ lock_handle lock; /* current lock we hold */
14188+ load_count load; /* load status for current locked formatted node */
14189+
14190+ jnode *child; /* for passing a reference to unformatted child
14191+ * across pos state changes */
14192+
14193+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14194+ int leaf_relocate; /* True if enough leaf-level nodes were
14195+ * found to suggest a relocate policy. */
14196+	int alloc_cnt;		/* The number of nodes allocated during squeeze and allocate. */
14197+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14198+ flush_queue_t *fq;
14199+ long *nr_written; /* number of nodes submitted to disk */
14200+ int flags; /* a copy of jnode_flush flags argument */
14201+
14202+ znode *prev_twig; /* previous parent pointer value, used to catch
14203+ * processing of new twig node */
14204+	struct convert_info *sq;	/* convert info */
14205+
14206+ unsigned long pos_in_unit; /* for extents only. Position
14207+ within an extent unit of first
14208+ jnode of slum */
14209+ long nr_to_write; /* number of unformatted nodes to handle on flush */
14210+};
14211+
14212+static inline int item_convert_count(flush_pos_t * pos)
14213+{
14214+ return pos->sq->count;
14215+}
14216+static inline void inc_item_convert_count(flush_pos_t * pos)
14217+{
14218+ pos->sq->count++;
14219+}
14220+static inline void set_item_convert_count(flush_pos_t * pos, int count)
14221+{
14222+ pos->sq->count = count;
14223+}
14224+static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14225+{
14226+ return pos->sq->iplug;
14227+}
14228+
14229+static inline struct convert_info *convert_data(flush_pos_t * pos)
14230+{
14231+ return pos->sq;
14232+}
14233+
14234+static inline struct convert_item_info *item_convert_data(flush_pos_t * pos)
14235+{
14236+ assert("edward-955", convert_data(pos));
14237+ return pos->sq->itm;
14238+}
14239+
14240+static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos)
14241+{
14242+ return &pos->sq->clust.tc;
14243+}
14244+
14245+static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos,
14246+						tfm_stream_id id)
14247+{
14248+ assert("edward-854", pos->sq != NULL);
14249+	return get_tfm_stream(tfm_cluster_sq(pos), id);
14250+}
14251+
14252+static inline int chaining_data_present(flush_pos_t * pos)
14253+{
14254+ return convert_data(pos) && item_convert_data(pos);
14255+}
14256+
14257+/* Returns true if the next node contains the next item of the disk cluster,
14258+   so the item convert data should be moved to the right slum neighbor.
14259+*/
14260+static inline int should_chain_next_node(flush_pos_t * pos)
14261+{
14262+ int result = 0;
14263+
14264+ assert("edward-1007", chaining_data_present(pos));
14265+
14266+ switch (item_convert_data(pos)->d_next) {
14267+ case DC_CHAINED_ITEM:
14268+ result = 1;
14269+ break;
14270+ case DC_AFTER_CLUSTER:
14271+ break;
14272+ default:
14273+ impossible("edward-1009", "bad state of next slum item");
14274+ }
14275+ return result;
14276+}
14277+
14278+/* update item state in a disk cluster to assign conversion mode */
14279+static inline void
14280+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14281+{
14282+
14283+ assert("edward-1010", chaining_data_present(pos));
14284+
14285+ if (this_node == 0) {
14286+ /* next item is on the right neighbor */
14287+ assert("edward-1011",
14288+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14289+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14290+ assert("edward-1012",
14291+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14292+
14293+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14294+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14295+ } else {
14296+ /* next item is on the same node */
14297+ assert("edward-1013",
14298+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14299+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14300+ assert("edward-1227",
14301+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14302+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
14303+
14304+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14305+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14306+ }
14307+}
14308+
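/* Editorial summary, derived from the assertions in move_chaining_data()
 * above (a sketch, not authoritative documentation): the chaining states of
 * the current and next disk-cluster items move as follows.
 *
 *   next item on the right neighbor (this_node == 0):
 *     d_cur:  DC_FIRST_ITEM or DC_CHAINED_ITEM     -> DC_CHAINED_ITEM
 *     d_next: DC_CHAINED_ITEM                      -> DC_INVALID_STATE
 *
 *   next item on the same node (this_node != 0):
 *     d_cur:  DC_FIRST_ITEM or DC_CHAINED_ITEM     -> DC_AFTER_CLUSTER
 *     d_next: DC_AFTER_CLUSTER or DC_INVALID_STATE -> DC_INVALID_STATE
 */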
14309+static inline int should_convert_node(flush_pos_t * pos, znode * node)
14310+{
14311+ return znode_convertible(node);
14312+}
14313+
14314+/* true if there is attached convert item info */
14315+static inline int should_convert_next_node(flush_pos_t * pos)
14316+{
14317+ return convert_data(pos) && item_convert_data(pos);
14318+}
14319+
14320+#define SQUALLOC_THRESHOLD 256
14321+
14322+static inline int should_terminate_squalloc(flush_pos_t * pos)
14323+{
14324+ return convert_data(pos) &&
14325+ !item_convert_data(pos) &&
14326+ item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14327+}
14328+
14329+#if 1
14330+#define check_convert_info(pos) \
14331+do { \
14332+ if (unlikely(should_convert_next_node(pos))){ \
14333+ warning("edward-1006", "unprocessed chained data"); \
14334+ printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \
14335+ item_convert_data(pos)->d_cur, \
14336+ item_convert_data(pos)->d_next, \
14337+ item_convert_data(pos)->flow.length); \
14338+ printk("inode %llu, size = %llu, cluster %lu\n", \
14339+ (unsigned long long)get_inode_oid \
14340+ (item_convert_data(pos)->inode), \
14341+ i_size_read(item_convert_data(pos)->inode), \
14342+ convert_data(pos)->clust.index); \
14343+ } \
14344+} while (0)
14345+#else
14346+#define check_convert_info(pos)
14347+#endif /* REISER4_DEBUG */
14348+
14349+void free_convert_data(flush_pos_t * pos);
14350+/* used in extent.c */
14351+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14352+ const coord_t * parent);
14353+int reiser4_scan_finished(flush_scan * scan);
14354+int reiser4_scanning_left(flush_scan * scan);
14355+int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14356+txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14357+int reiser4_alloc_extent(flush_pos_t *flush_pos);
14358+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14359+ reiser4_key *stop_key);
14360+extern int reiser4_init_fqs(void);
14361+extern void reiser4_done_fqs(void);
14362+
14363+#if REISER4_DEBUG
14364+
14365+extern void reiser4_check_fq(const txn_atom *atom);
14366+extern atomic_t flush_cnt;
14367+
14368+#define check_preceder(blk) \
14369+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14370+extern void check_pos(flush_pos_t * pos);
14371+#else
14372+#define check_preceder(b) noop
14373+#define check_pos(pos) noop
14374+#endif
14375+
14376+/* __REISER4_FLUSH_H__ */
14377+#endif
14378+
14379+/* Make Linus happy.
14380+ Local variables:
14381+ c-indentation-style: "K&R"
14382+ mode-name: "LC"
14383+ c-basic-offset: 8
14384+ tab-width: 8
14385+ fill-column: 90
14386+ LocalWords: preceder
14387+ End:
14388+*/
14389diff -urN linux-2.6.22.orig/fs/reiser4/flush_queue.c linux-2.6.22/fs/reiser4/flush_queue.c
14390--- linux-2.6.22.orig/fs/reiser4/flush_queue.c 1970-01-01 03:00:00.000000000 +0300
14391+++ linux-2.6.22/fs/reiser4/flush_queue.c 2007-07-29 00:25:34.864693371 +0400
14392@@ -0,0 +1,680 @@
14393+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14394+
14395+#include "debug.h"
14396+#include "super.h"
14397+#include "txnmgr.h"
14398+#include "jnode.h"
14399+#include "znode.h"
14400+#include "page_cache.h"
14401+#include "wander.h"
14402+#include "vfs_ops.h"
14403+#include "writeout.h"
14404+#include "flush.h"
14405+
14406+#include <linux/bio.h>
14407+#include <linux/mm.h>
14408+#include <linux/pagemap.h>
14409+#include <linux/blkdev.h>
14410+#include <linux/writeback.h>
14411+
14412+/* A flush queue object is an accumulator for keeping jnodes prepared
14413+ by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14414+ kept on the flush queue until memory pressure or atom commit asks
14415+ flush queues to write some or all of their jnodes. */
14416+
14417+/*
14418+ LOCKING:
14419+
14420+ fq->guard spin lock protects the fq->atom pointer and nothing else. The
14421+ fq->prepped list is protected by the atom spin lock and uses the following
14422+ locking:
14423+
14424+ two ways to protect fq->prepped list for read-only list traversal:
14425+
14426+ 1. atom spin-lock atom.
14427+ 2. fq is IN_USE, atom->nr_running_queues increased.
14428+
14429+ and one for list modification:
14430+
14431+ 1. atom is spin-locked and one condition is true: fq is IN_USE or
14432+ atom->nr_running_queues == 0.
14433+
14434+ The deadlock-safe order for flush queues and atoms is: first lock atom, then
14435+ lock flush queue, then lock jnode.
14436+*/
14437+
14438+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14439+#define fq_ready(fq) (!fq_in_use(fq))
14440+
14441+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14442+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14443+
14444+/* get lock on atom from locked flush queue object */
14445+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14446+{
14447+ /* This code is similar to jnode_get_atom(), look at it for the
14448+ * explanation. */
14449+ txn_atom *atom;
14450+
14451+ assert_spin_locked(&(fq->guard));
14452+
14453+ while (1) {
14454+ atom = fq->atom;
14455+ if (atom == NULL)
14456+ break;
14457+
14458+ if (spin_trylock_atom(atom))
14459+ break;
14460+
14461+ atomic_inc(&atom->refcount);
14462+ spin_unlock(&(fq->guard));
14463+ spin_lock_atom(atom);
14464+ spin_lock(&(fq->guard));
14465+
14466+ if (fq->atom == atom) {
14467+ atomic_dec(&atom->refcount);
14468+ break;
14469+ }
14470+
14471+ spin_unlock(&(fq->guard));
14472+ atom_dec_and_unlock(atom);
14473+ spin_lock(&(fq->guard));
14474+ }
14475+
14476+ return atom;
14477+}
14478+
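/* A minimal user-space analogy of the lock-coupling technique used by
 * atom_locked_by_fq_nolock() above (and by jnode_get_atom(), which the
 * comment refers to): while holding the inner lock, try-lock the outer
 * object; on contention, pin the outer object with a reference, drop the
 * inner lock, take both locks in the deadlock-safe order, and re-validate
 * the association. This sketch uses pthread mutexes and hypothetical names
 * (struct owner / struct obj); it illustrates the pattern, it is not
 * reiser4 code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct owner {
	pthread_mutex_t lock;
	atomic_int refcount;
};

struct obj {
	pthread_mutex_t guard;	/* protects ->owner and nothing else */
	struct owner *owner;
};

/* Returns o->owner locked (or NULL), with o->guard still held on return. */
static struct owner *owner_locked_by_obj(struct obj *o)
{
	struct owner *w;

	for (;;) {
		w = o->owner;
		if (w == NULL)
			break;
		if (pthread_mutex_trylock(&w->lock) == 0)
			break;	/* got it without violating lock order */

		atomic_fetch_add(&w->refcount, 1);	/* keep w alive */
		pthread_mutex_unlock(&o->guard);
		pthread_mutex_lock(&w->lock);	/* safe order: owner first */
		pthread_mutex_lock(&o->guard);

		if (o->owner == w) {
			atomic_fetch_sub(&w->refcount, 1);
			break;	/* association unchanged: done */
		}
		/* o was re-assigned while we blocked: release and retry */
		pthread_mutex_unlock(&o->guard);
		pthread_mutex_unlock(&w->lock);
		atomic_fetch_sub(&w->refcount, 1);
		pthread_mutex_lock(&o->guard);
	}
	return w;
}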
14479+txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14480+{
14481+ txn_atom *atom;
14482+
14483+ spin_lock(&(fq->guard));
14484+ atom = atom_locked_by_fq_nolock(fq);
14485+ spin_unlock(&(fq->guard));
14486+ return atom;
14487+}
14488+
14489+static void init_fq(flush_queue_t * fq)
14490+{
14491+ memset(fq, 0, sizeof *fq);
14492+
14493+ atomic_set(&fq->nr_submitted, 0);
14494+
14495+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14496+
14497+	init_waitqueue_head(&fq->wait);
14498+ spin_lock_init(&fq->guard);
14499+}
14500+
14501+/* slab for flush queues */
14502+static struct kmem_cache *fq_slab;
14503+
14504+/**
14505+ * reiser4_init_fqs - create flush queue cache
14506+ *
14507+ * Initializes slab cache of flush queues. It is part of reiser4 module
14508+ * initialization.
14509+ */
14510+int reiser4_init_fqs(void)
14511+{
14512+ fq_slab = kmem_cache_create("fq",
14513+ sizeof(flush_queue_t),
14514+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14515+ if (fq_slab == NULL)
14516+ return RETERR(-ENOMEM);
14517+ return 0;
14518+}
14519+
14520+/**
14521+ * reiser4_done_fqs - delete flush queue cache
14522+ *
14523+ * This is called on reiser4 module unloading or system shutdown.
14524+ */
14525+void reiser4_done_fqs(void)
14526+{
14527+ destroy_reiser4_cache(&fq_slab);
14528+}
14529+
14530+/* create new flush queue object */
14531+static flush_queue_t *create_fq(gfp_t gfp)
14532+{
14533+ flush_queue_t *fq;
14534+
14535+ fq = kmem_cache_alloc(fq_slab, gfp);
14536+ if (fq)
14537+ init_fq(fq);
14538+
14539+ return fq;
14540+}
14541+
14542+/* adjust atom's and flush queue's counters of queued nodes */
14543+static void count_enqueued_node(flush_queue_t * fq)
14544+{
14545+ ON_DEBUG(fq->atom->num_queued++);
14546+}
14547+
14548+static void count_dequeued_node(flush_queue_t * fq)
14549+{
14550+ assert("zam-993", fq->atom->num_queued > 0);
14551+ ON_DEBUG(fq->atom->num_queued--);
14552+}
14553+
14554+/* attach flush queue object to the atom */
14555+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14556+{
14557+ assert_spin_locked(&(atom->alock));
14558+ list_add(&fq->alink, &atom->flush_queues);
14559+ fq->atom = atom;
14560+ ON_DEBUG(atom->nr_flush_queues++);
14561+}
14562+
14563+static void detach_fq(flush_queue_t * fq)
14564+{
14565+ assert_spin_locked(&(fq->atom->alock));
14566+
14567+ spin_lock(&(fq->guard));
14568+ list_del_init(&fq->alink);
14569+ assert("vs-1456", fq->atom->nr_flush_queues > 0);
14570+ ON_DEBUG(fq->atom->nr_flush_queues--);
14571+ fq->atom = NULL;
14572+ spin_unlock(&(fq->guard));
14573+}
14574+
14575+/* destroy flush queue object */
14576+static void done_fq(flush_queue_t * fq)
14577+{
14578+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14579+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14580+
14581+ kmem_cache_free(fq_slab, fq);
14582+}
14583+
14584+/* mark a jnode as queued on a flush queue and account for it */
14585+static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14586+{
14587+ JF_SET(node, JNODE_FLUSH_QUEUED);
14588+ count_enqueued_node(fq);
14589+}
14590+
14591+/* Putting jnode into the flush queue. Both atom and jnode should be
14592+ spin-locked. */
14593+void queue_jnode(flush_queue_t * fq, jnode * node)
14594+{
14595+ assert_spin_locked(&(node->guard));
14596+ assert("zam-713", node->atom != NULL);
14597+ assert_spin_locked(&(node->atom->alock));
14598+ assert("zam-716", fq->atom != NULL);
14599+ assert("zam-717", fq->atom == node->atom);
14600+ assert("zam-907", fq_in_use(fq));
14601+
14602+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14603+ assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14604+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14605+ assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14606+
14607+ mark_jnode_queued(fq, node);
14608+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14609+
14610+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14611+ FQ_LIST, 1));
14612+}
14613+
14614+/* repeatable process for waiting for i/o completion on a flush queue object */
14615+static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14616+{
14617+ assert("zam-738", fq->atom != NULL);
14618+ assert_spin_locked(&(fq->atom->alock));
14619+ assert("zam-736", fq_in_use(fq));
14620+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14621+
14622+ if (atomic_read(&fq->nr_submitted) != 0) {
14623+ struct super_block *super;
14624+
14625+ spin_unlock_atom(fq->atom);
14626+
14627+		assert("nikita-3013", reiser4_schedulable());
14628+
14629+ super = reiser4_get_current_sb();
14630+
14631+ /* FIXME: this is instead of blk_run_queues() */
14632+		blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14633+
14634+ if (!(super->s_flags & MS_RDONLY))
14635+			wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14636+
14637+ /* Ask the caller to re-acquire the locks and call this
14638+ function again. Note: this technique is commonly used in
14639+ the txnmgr code. */
14640+ return -E_REPEAT;
14641+ }
14642+
14643+ *nr_io_errors += atomic_read(&fq->nr_errors);
14644+ return 0;
14645+}
14646+
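/* A minimal stand-alone sketch of the -E_REPEAT convention used by wait_io()
 * above: a helper that had to drop the caller's locks returns -E_REPEAT, and
 * the caller re-acquires the locks and retries until the helper reports a
 * final result. E_REPEAT's value, take_locks() and do_step() are hypothetical
 * stand-ins for illustration only. */
#define E_REPEAT 512	/* assumed value, for illustration only */

static int attempts;

static void take_locks(void)
{
	/* stand-in for, e.g., get_current_atom_locked() */
}

static int do_step(void)
{
	/* pretend the first two attempts had to drop the locks and retry */
	return (++attempts < 3) ? -E_REPEAT : 0;
}

static int repeat_until_done(void)
{
	int ret;

	do {
		take_locks();
		ret = do_step();
	} while (ret == -E_REPEAT);

	return ret;
}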
14647+/* wait on I/O completion, re-submit dirty nodes to write */
14648+static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14649+{
14650+ int ret;
14651+ txn_atom *atom = fq->atom;
14652+
14653+ assert("zam-801", atom != NULL);
14654+ assert_spin_locked(&(atom->alock));
14655+ assert("zam-762", fq_in_use(fq));
14656+
14657+ ret = wait_io(fq, nr_io_errors);
14658+ if (ret)
14659+ return ret;
14660+
14661+ detach_fq(fq);
14662+ done_fq(fq);
14663+
14664+	reiser4_atom_send_event(atom);
14665+
14666+ return 0;
14667+}
14668+
14669+/* wait for all i/o for the given atom to complete; actually do one iteration
14670+ of that and return -E_REPEAT if more iterations are needed */
14671+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14672+{
14673+ flush_queue_t *fq;
14674+
14675+ assert_spin_locked(&(atom->alock));
14676+
14677+ if (list_empty_careful(&atom->flush_queues))
14678+ return 0;
14679+
14680+ list_for_each_entry(fq, &atom->flush_queues, alink) {
14681+ if (fq_ready(fq)) {
14682+ int ret;
14683+
14684+ mark_fq_in_use(fq);
14685+ assert("vs-1247", fq->owner == NULL);
14686+ ON_DEBUG(fq->owner = current);
14687+ ret = finish_fq(fq, nr_io_errors);
14688+
14689+ if (*nr_io_errors)
14690+ reiser4_handle_error();
14691+
14692+ if (ret) {
14693+				reiser4_fq_put(fq);
14694+ return ret;
14695+ }
14696+
14697+ spin_unlock_atom(atom);
14698+
14699+ return -E_REPEAT;
14700+ }
14701+ }
14702+
14703+ /* All flush queues are in use; atom remains locked */
14704+ return -EBUSY;
14705+}
14706+
14707+/* wait for all i/o of the current atom */
14708+int current_atom_finish_all_fq(void)
14709+{
14710+ txn_atom *atom;
14711+ int nr_io_errors = 0;
14712+ int ret = 0;
14713+
14714+ do {
14715+ while (1) {
14716+ atom = get_current_atom_locked();
14717+ ret = finish_all_fq(atom, &nr_io_errors);
14718+ if (ret != -EBUSY)
14719+ break;
14720+			reiser4_atom_wait_event(atom);
14721+ }
14722+ } while (ret == -E_REPEAT);
14723+
14724+	/* we do not need the atom locked after this function finishes; SUCCESS
14725+	   and -EBUSY are the two return codes for which the atom remains
14726+	   locked after finish_all_fq */
14727+ if (!ret)
14728+ spin_unlock_atom(atom);
14729+
14730+ assert_spin_not_locked(&(atom->alock));
14731+
14732+ if (ret)
14733+ return ret;
14734+
14735+ if (nr_io_errors)
14736+ return RETERR(-EIO);
14737+
14738+ return 0;
14739+}
14740+
14741+/* change the node->atom field for all jnodes on the given list */
14742+static void
14743+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14744+{
14745+ jnode *cur;
14746+
14747+ list_for_each_entry(cur, list, capture_link) {
14748+ spin_lock_jnode(cur);
14749+ cur->atom = atom;
14750+ spin_unlock_jnode(cur);
14751+ }
14752+}
14753+
14754+/* support for atom fusion operation */
14755+void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14756+{
14757+ flush_queue_t *fq;
14758+
14759+ assert_spin_locked(&(to->alock));
14760+ assert_spin_locked(&(from->alock));
14761+
14762+ list_for_each_entry(fq, &from->flush_queues, alink) {
14763+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14764+ spin_lock(&(fq->guard));
14765+ fq->atom = to;
14766+ spin_unlock(&(fq->guard));
14767+ }
14768+
14769+ list_splice_init(&from->flush_queues, to->flush_queues.prev);
14770+
14771+#if REISER4_DEBUG
14772+ to->num_queued += from->num_queued;
14773+ to->nr_flush_queues += from->nr_flush_queues;
14774+ from->nr_flush_queues = 0;
14775+#endif
14776+}
14777+
14778+#if REISER4_DEBUG
14779+int atom_fq_parts_are_clean(txn_atom * atom)
14780+{
14781+ assert("zam-915", atom != NULL);
14782+ return list_empty_careful(&atom->flush_queues);
14783+}
14784+#endif
14785+/* Bio i/o completion routine for reiser4 write operations. */
14786+static int
14787+end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
14788+ int err)
14789+{
14790+ int i;
14791+ int nr_errors = 0;
14792+ flush_queue_t *fq;
14793+
14794+ assert("zam-958", bio->bi_rw & WRITE);
14795+
14796+ /* i/o op. is not fully completed */
14797+ if (bio->bi_size != 0)
14798+ return 1;
14799+
14800+ if (err == -EOPNOTSUPP)
14801+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14802+
14803+	/* we expect that bio->bi_private is set to NULL or to an fq object
14804+	 * which is used for synchronization and error counting. */
14805+ fq = bio->bi_private;
14806+ /* Check all elements of io_vec for correct write completion. */
14807+ for (i = 0; i < bio->bi_vcnt; i += 1) {
14808+ struct page *pg = bio->bi_io_vec[i].bv_page;
14809+
14810+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14811+ SetPageError(pg);
14812+ nr_errors++;
14813+ }
14814+
14815+ {
14816+ /* jnode WRITEBACK ("write is in progress bit") is
14817+ * atomically cleared here. */
14818+ jnode *node;
14819+
14820+ assert("zam-736", pg != NULL);
14821+ assert("zam-736", PagePrivate(pg));
14822+ node = jprivate(pg);
14823+
14824+ JF_CLR(node, JNODE_WRITEBACK);
14825+ }
14826+
14827+ end_page_writeback(pg);
14828+ page_cache_release(pg);
14829+ }
14830+
14831+ if (fq) {
14832+ /* count i/o error in fq object */
14833+ atomic_add(nr_errors, &fq->nr_errors);
14834+
14835+ /* If all write requests registered in this "fq" are done we up
14836+		 * the waiter. */
14837+		if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14838+			wake_up(&fq->wait);
14839+ }
14840+
14841+ bio_put(bio);
14842+ return 0;
14843+}
14844+
14845+/* Count the I/O requests which will be submitted by @bio in the given flush
14846+   queue @fq */
14847+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14848+{
14849+ bio->bi_private = fq;
14850+ bio->bi_end_io = end_io_handler;
14851+
14852+ if (fq)
14853+ atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14854+}
14855+
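/* A small stand-alone model of the nr_submitted handshake between
 * add_fq_to_bio() above and end_io_handler(): the submitter adds the number
 * of pages before issuing I/O, each completion subtracts its page count, and
 * the completion that brings the counter to zero wakes the waiter (compare
 * atomic_sub_and_test() in end_io_handler()). submit()/complete() are
 * hypothetical stand-ins, written with C11 atomics. */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int nr_submitted;

static void submit(int nr_pages)
{
	atomic_fetch_add(&nr_submitted, nr_pages);	/* before issuing I/O */
	/* ... issue the I/O here ... */
}

static bool complete(int nr_pages)
{
	/* atomic_fetch_sub returns the old value, so old == nr_pages means
	 * the counter just hit zero: time to wake up the waiter */
	return atomic_fetch_sub(&nr_submitted, nr_pages) == nr_pages;
}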
14856+/* Move all queued nodes out from @fq->prepped list. */
14857+static void release_prepped_list(flush_queue_t * fq)
14858+{
14859+ txn_atom *atom;
14860+
14861+ assert("zam-904", fq_in_use(fq));
14862+ atom = atom_locked_by_fq(fq);
14863+
14864+ while (!list_empty(ATOM_FQ_LIST(fq))) {
14865+ jnode *cur;
14866+
14867+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14868+ list_del_init(&cur->capture_link);
14869+
14870+ count_dequeued_node(fq);
14871+ spin_lock_jnode(cur);
14872+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14873+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14874+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14875+ JF_CLR(cur, JNODE_FLUSH_QUEUED);
14876+
14877+ if (JF_ISSET(cur, JNODE_DIRTY)) {
14878+ list_add_tail(&cur->capture_link,
14879+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14880+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14881+ DIRTY_LIST, 1));
14882+ } else {
14883+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14884+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14885+ CLEAN_LIST, 1));
14886+ }
14887+
14888+ spin_unlock_jnode(cur);
14889+ }
14890+
14891+ if (--atom->nr_running_queues == 0)
14892+		reiser4_atom_send_event(atom);
14893+
14894+ spin_unlock_atom(atom);
14895+}
14896+
14897+/* Submit write requests for nodes on the already filled flush queue @fq.
14898+
14899+ @fq: flush queue object which contains jnodes we can (and will) write.
14900+ @return: number of submitted blocks (>=0) if success, otherwise -- an error
14901+ code (<0). */
14902+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14903+{
14904+ int ret;
14905+ txn_atom *atom;
14906+
14907+ while (1) {
14908+ atom = atom_locked_by_fq(fq);
14909+ assert("zam-924", atom);
14910+ /* do not write fq in parallel. */
14911+ if (atom->nr_running_queues == 0
14912+ || !(flags & WRITEOUT_SINGLE_STREAM))
14913+ break;
14914+		reiser4_atom_wait_event(atom);
14915+ }
14916+
14917+ atom->nr_running_queues++;
14918+ spin_unlock_atom(atom);
14919+
14920+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14921+ release_prepped_list(fq);
14922+
14923+ return ret;
14924+}
14925+
14926+/* Getting a flush queue object for exclusive use by one thread. May require
14927+   several iterations, which is indicated by the -E_REPEAT return code.
14928+
14929+   This function does not contain code for obtaining an atom lock because an
14930+   atom lock is obtained in different ways in different parts of reiser4;
14931+   usually it is the current atom, but we also need to be able to get an fq
14932+   for the atom of a given jnode. */
14933+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
14934+{
14935+ flush_queue_t *fq;
14936+
14937+ assert_spin_locked(&(atom->alock));
14938+
14939+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
14940+ while (&atom->flush_queues != &fq->alink) {
14941+ spin_lock(&(fq->guard));
14942+
14943+ if (fq_ready(fq)) {
14944+ mark_fq_in_use(fq);
14945+ assert("vs-1246", fq->owner == NULL);
14946+ ON_DEBUG(fq->owner = current);
14947+ spin_unlock(&(fq->guard));
14948+
14949+ if (*new_fq)
14950+ done_fq(*new_fq);
14951+
14952+ *new_fq = fq;
14953+
14954+ return 0;
14955+ }
14956+
14957+ spin_unlock(&(fq->guard));
14958+
14959+ fq = list_entry(fq->alink.next, flush_queue_t, alink);
14960+ }
14961+
14962+ /* Use previously allocated fq object */
14963+ if (*new_fq) {
14964+ mark_fq_in_use(*new_fq);
14965+ assert("vs-1248", (*new_fq)->owner == 0);
14966+ ON_DEBUG((*new_fq)->owner = current);
14967+ attach_fq(atom, *new_fq);
14968+
14969+ return 0;
14970+ }
14971+
14972+ spin_unlock_atom(atom);
14973+
14974+ *new_fq = create_fq(gfp);
14975+
14976+ if (*new_fq == NULL)
14977+ return RETERR(-ENOMEM);
14978+
14979+ return RETERR(-E_REPEAT);
14980+}
14981+
14982+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
14983+{
14984+	return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
14985+}
14986+
14987+/* A wrapper around reiser4_fq_by_atom for getting a flush queue
14988+   object for the current atom; on success fq->atom remains locked. */
14989+flush_queue_t *get_fq_for_current_atom(void)
14990+{
14991+ flush_queue_t *fq = NULL;
14992+ txn_atom *atom;
14993+ int ret;
14994+
14995+ do {
14996+ atom = get_current_atom_locked();
14997+		ret = reiser4_fq_by_atom(atom, &fq);
14998+ } while (ret == -E_REPEAT);
14999+
15000+ if (ret)
15001+ return ERR_PTR(ret);
15002+ return fq;
15003+}
15004+
15005+/* Releasing flush queue object after exclusive use */
15006+void reiser4_fq_put_nolock(flush_queue_t *fq)
15007+{
15008+ assert("zam-747", fq->atom != NULL);
15009+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15010+ mark_fq_ready(fq);
15011+ assert("vs-1245", fq->owner == current);
15012+ ON_DEBUG(fq->owner = NULL);
15013+}
15014+
15015+void reiser4_fq_put(flush_queue_t * fq)
15016+{
15017+ txn_atom *atom;
15018+
15019+ spin_lock(&(fq->guard));
15020+ atom = atom_locked_by_fq_nolock(fq);
15021+
15022+ assert("zam-746", atom != NULL);
15023+
15024+ reiser4_fq_put_nolock(fq);
15025+ reiser4_atom_send_event(atom);
15026+
15027+ spin_unlock(&(fq->guard));
15028+ spin_unlock_atom(atom);
15029+}
15030+
15031+/* A part of atom object initialization related to the embedded flush queue
15032+ list head */
15033+
15034+void init_atom_fq_parts(txn_atom *atom)
15035+{
15036+ INIT_LIST_HEAD(&atom->flush_queues);
15037+}
15038+
15039+#if REISER4_DEBUG
15040+
15041+void reiser4_check_fq(const txn_atom *atom)
15042+{
15043+ /* check number of nodes on all atom's flush queues */
15044+ flush_queue_t *fq;
15045+ int count;
15046+ struct list_head *pos;
15047+
15048+ count = 0;
15049+ list_for_each_entry(fq, &atom->flush_queues, alink) {
15050+ spin_lock(&(fq->guard));
15051+		/* calculate the number of jnodes on the fq's list of prepped jnodes */
15052+ list_for_each(pos, ATOM_FQ_LIST(fq))
15053+ count++;
15054+ spin_unlock(&(fq->guard));
15055+ }
15056+ if (count != atom->fq)
15057+ warning("", "fq counter %d, real %d\n", atom->fq, count);
15058+
15059+}
15060+
15061+#endif
15062+
15063+/*
15064+ * Local variables:
15065+ * c-indentation-style: "K&R"
15066+ * mode-name: "LC"
15067+ * c-basic-offset: 8
15068+ * tab-width: 8
15069+ * fill-column: 79
15070+ * scroll-step: 1
15071+ * End:
15072+ */
15073diff -urN linux-2.6.22.orig/fs/reiser4/forward.h linux-2.6.22/fs/reiser4/forward.h
15074--- linux-2.6.22.orig/fs/reiser4/forward.h 1970-01-01 03:00:00.000000000 +0300
15075+++ linux-2.6.22/fs/reiser4/forward.h 2007-07-29 00:25:34.864693371 +0400
15076@@ -0,0 +1,252 @@
15077+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15078+
15079+/* Forward declarations. Thank you Kernighan. */
15080+
15081+#if !defined( __REISER4_FORWARD_H__ )
15082+#define __REISER4_FORWARD_H__
15083+
15084+#include <asm/errno.h>
15085+#include <linux/types.h>
15086+
15087+typedef struct zlock zlock;
15088+typedef struct lock_stack lock_stack;
15089+typedef struct lock_handle lock_handle;
15090+typedef struct znode znode;
15091+typedef struct flow flow_t;
15092+typedef struct coord coord_t;
15093+typedef struct tree_access_pointer tap_t;
15094+typedef struct reiser4_object_create_data reiser4_object_create_data;
15095+typedef union reiser4_plugin reiser4_plugin;
15096+typedef __u16 reiser4_plugin_id;
15097+typedef __u64 reiser4_plugin_groups;
15098+typedef struct item_plugin item_plugin;
15099+typedef struct jnode_plugin jnode_plugin;
15100+typedef struct reiser4_item_data reiser4_item_data;
15101+typedef union reiser4_key reiser4_key;
15102+typedef struct reiser4_tree reiser4_tree;
15103+typedef struct carry_cut_data carry_cut_data;
15104+typedef struct carry_kill_data carry_kill_data;
15105+typedef struct carry_tree_op carry_tree_op;
15106+typedef struct carry_tree_node carry_tree_node;
15107+typedef struct carry_plugin_info carry_plugin_info;
15108+typedef struct reiser4_journal reiser4_journal;
15109+typedef struct txn_atom txn_atom;
15110+typedef struct txn_handle txn_handle;
15111+typedef struct txn_mgr txn_mgr;
15112+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15113+typedef struct reiser4_context reiser4_context;
15114+typedef struct carry_level carry_level;
15115+typedef struct blocknr_set_entry blocknr_set_entry;
15116+/* super_block->s_fs_info points to this */
15117+typedef struct reiser4_super_info_data reiser4_super_info_data;
15118+/* next two objects are fields of reiser4_super_info_data */
15119+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15120+typedef struct reiser4_space_allocator reiser4_space_allocator;
15121+
15122+typedef struct flush_scan flush_scan;
15123+typedef struct flush_position flush_pos_t;
15124+
15125+typedef unsigned short pos_in_node_t;
15126+#define MAX_POS_IN_NODE 65535
15127+
15128+typedef struct jnode jnode;
15129+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15130+
15131+typedef struct uf_coord uf_coord_t;
15132+typedef struct hint hint_t;
15133+
15134+typedef struct ktxnmgrd_context ktxnmgrd_context;
15135+
15136+struct inode;
15137+struct page;
15138+struct file;
15139+struct dentry;
15140+struct super_block;
15141+
15142+/* return values of coord_by_key(). cbk == coord_by_key */
15143+typedef enum {
15144+ CBK_COORD_FOUND = 0,
15145+ CBK_COORD_NOTFOUND = -ENOENT,
15146+} lookup_result;
15147+
15148+/* results of lookup with directory file */
15149+typedef enum {
15150+ FILE_NAME_FOUND = 0,
15151+ FILE_NAME_NOTFOUND = -ENOENT,
15152+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15153+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15154+} file_lookup_result;
15155+
15156+/* behaviors of lookup. If the coord we are looking for is actually in the
15157+   tree, both coincide. */
15158+typedef enum {
15159+ /* search exactly for the coord with key given */
15160+ FIND_EXACT,
15161+ /* search for coord with the maximal key not greater than one
15162+ given */
15163+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15164+} lookup_bias;
15165+
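/* A toy illustration of the two lookup biases below over a sorted key array,
 * assuming integer keys (the real tree uses reiser4_key): FIND_EXACT fails
 * unless the key is present, while FIND_MAX_NOT_MORE_THAN settles for the
 * greatest key not exceeding the one asked for. toy_lookup() is a
 * hypothetical helper for illustration only. */
static int toy_lookup(const int *keys, int nr, int key, int exact)
{
	int i, best = -1;	/* index of match, -1 for "not found" */

	for (i = 0; i < nr; i++) {
		if (keys[i] == key)
			return i;	/* both biases accept an exact hit */
		if (keys[i] < key)
			best = i;	/* best "not more than" so far */
	}
	return exact ? -1 : best;
	/* keys = {10, 20}, key = 15: FIND_EXACT -> -1,
	 * FIND_MAX_NOT_MORE_THAN -> index of 10 */
}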
15166+typedef enum {
15167+	/* level number of the leaf level of the tree.
15168+	   The fake root has (tree_level=0). */
15169+	LEAF_LEVEL = 1,
15170+
15171+	/* level number of the level one above the leaf level of the tree.
15172+
15173+ It is supposed that internal tree used by reiser4 to store file
15174+ system data and meta data will have height 2 initially (when
15175+ created by mkfs).
15176+ */
15177+ TWIG_LEVEL = 2,
15178+} tree_level;
15179+
15180+/* The "real" maximum ztree height is the 0-origin size of any per-level
15181+ array, since the zero'th level is not used. */
15182+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15183+
15184+/* enumeration of possible mutual positions of an item and a coord. This enum
15185+   is the return type of the ->is_in_item() item plugin method (q.v.). */
15186+typedef enum {
15187+ /* coord is on the left of an item */
15188+ IP_ON_THE_LEFT,
15189+ /* coord is inside item */
15190+ IP_INSIDE,
15191+ /* coord is inside item, but to the right of the rightmost unit of
15192+ this item */
15193+ IP_RIGHT_EDGE,
15194+ /* coord is on the right of an item */
15195+ IP_ON_THE_RIGHT
15196+} interposition;
15197+
15198+/* type of lock to acquire on znode before returning it to caller */
15199+typedef enum {
15200+ ZNODE_NO_LOCK = 0,
15201+ ZNODE_READ_LOCK = 1,
15202+ ZNODE_WRITE_LOCK = 2,
15203+} znode_lock_mode;
15204+
15205+/* type of lock request */
15206+typedef enum {
15207+ ZNODE_LOCK_LOPRI = 0,
15208+ ZNODE_LOCK_HIPRI = (1 << 0),
15209+
15210+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15211+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15212+ return the value -E_REPEAT. */
15213+ ZNODE_LOCK_NONBLOCK = (1 << 1),
15214+ /* An option for longterm_lock_znode which prevents atom fusion */
15215+ ZNODE_LOCK_DONT_FUSE = (1 << 2)
15216+} znode_lock_request;
15217+
15218+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15219+
15220+/* used to specify direction of shift. These must be -1 and 1 */
15221+typedef enum {
15222+ SHIFT_LEFT = 1,
15223+ SHIFT_RIGHT = -1
15224+} shift_direction;
15225+
15226+typedef enum {
15227+ LEFT_SIDE,
15228+ RIGHT_SIDE
15229+} sideof;
15230+
15231+#define round_up( value, order ) \
15232+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15233+ ~( ( order ) - 1 ) ) )
15234+
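/* Worked example for the round_up() macro above, which relies on mask
 * arithmetic and therefore only works for power-of-two orders:
 *
 *   round_up(13, 8): (13 + 8 - 1) & ~(8 - 1) = 20 & ~7 = 16
 *   round_up(16, 8): (16 + 7) & ~7 = 23 & ~7 = 16   (already aligned)
 */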
15235+/* values returned by squalloc_right_neighbor and its auxiliary functions */
15236+typedef enum {
15237+ /* unit of internal item is moved */
15238+ SUBTREE_MOVED = 0,
15239+ /* nothing else can be squeezed into left neighbor */
15240+ SQUEEZE_TARGET_FULL = 1,
15241+ /* all content of node is squeezed into its left neighbor */
15242+ SQUEEZE_SOURCE_EMPTY = 2,
15243+ /* one more item is copied (this is only returned by
15244+ allocate_and_copy_extent to squalloc_twig)) */
15245+ SQUEEZE_CONTINUE = 3
15246+} squeeze_result;
15247+
15248+/* Do not change item ids. If you do, there will be a format change. */
15249+typedef enum {
15250+ STATIC_STAT_DATA_ID = 0x0,
15251+ SIMPLE_DIR_ENTRY_ID = 0x1,
15252+ COMPOUND_DIR_ID = 0x2,
15253+ NODE_POINTER_ID = 0x3,
15254+ EXTENT_POINTER_ID = 0x5,
15255+ FORMATTING_ID = 0x6,
15256+ CTAIL_ID = 0x7,
15257+ BLACK_BOX_ID = 0x8,
15258+ LAST_ITEM_ID = 0x9
15259+} item_id;
15260+
15261+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15262+ whether commit() was called or VM memory pressure was applied. */
15263+typedef enum {
15264+ /* submit flush queue to disk at jnode_flush completion */
15265+ JNODE_FLUSH_WRITE_BLOCKS = 1,
15266+
15267+ /* flush is called for commit */
15268+ JNODE_FLUSH_COMMIT = 2,
15269+ /* not implemented */
15270+ JNODE_FLUSH_MEMORY_FORMATTED = 4,
15271+
15272+ /* not implemented */
15273+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15274+} jnode_flush_flags;
15275+
15276+/* Flags for insert/paste carry operations. Currently they are only used in
15277+   the flushing code, but in the future they can be used to optimize for
15278+   repetitive accesses. */
15279+typedef enum {
15280+ /* carry is not allowed to shift data to the left when trying to find
15281+ free space */
15282+ COPI_DONT_SHIFT_LEFT = (1 << 0),
15283+ /* carry is not allowed to shift data to the right when trying to find
15284+ free space */
15285+ COPI_DONT_SHIFT_RIGHT = (1 << 1),
15286+ /* carry is not allowed to allocate new node(s) when trying to find
15287+ free space */
15288+ COPI_DONT_ALLOCATE = (1 << 2),
15289+	/* try to load the left neighbor if it is not in the cache */
15290+	COPI_LOAD_LEFT = (1 << 3),
15291+	/* try to load the right neighbor if it is not in the cache */
15292+ COPI_LOAD_RIGHT = (1 << 4),
15293+ /* shift insertion point to the left neighbor */
15294+ COPI_GO_LEFT = (1 << 5),
15295+ /* shift insertion point to the right neighbor */
15296+ COPI_GO_RIGHT = (1 << 6),
15297+ /* try to step back into original node if insertion into new node
15298+ fails after shifting data there. */
15299+ COPI_STEP_BACK = (1 << 7)
15300+} cop_insert_flag;
15301+
15302+typedef enum {
15303+ SAFE_UNLINK, /* safe-link for unlink */
15304+ SAFE_TRUNCATE /* safe-link for truncate */
15305+} reiser4_safe_link_t;
15306+
15307+/* this is to show which of the atom's lists a jnode is on */
15308+typedef enum {
15309+ NOT_CAPTURED,
15310+ DIRTY_LIST,
15311+ CLEAN_LIST,
15312+ FQ_LIST,
15313+ WB_LIST,
15314+ OVRWR_LIST
15315+} atom_list;
15316+
15317+/* __REISER4_FORWARD_H__ */
15318+#endif
15319+
15320+/* Make Linus happy.
15321+ Local variables:
15322+ c-indentation-style: "K&R"
15323+ mode-name: "LC"
15324+ c-basic-offset: 8
15325+ tab-width: 8
15326+ fill-column: 120
15327+ End:
15328+*/
15329diff -urN linux-2.6.22.orig/fs/reiser4/fsdata.c linux-2.6.22/fs/reiser4/fsdata.c
15330--- linux-2.6.22.orig/fs/reiser4/fsdata.c 1970-01-01 03:00:00.000000000 +0300
15331+++ linux-2.6.22/fs/reiser4/fsdata.c 2007-07-29 00:25:34.868694406 +0400
15332@@ -0,0 +1,808 @@
15333+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15334+ * reiser4/README */
15335+
15336+#include "fsdata.h"
15337+#include "inode.h"
15338+
15339+
15340+/* cache of dir_cursors */
15341+static struct kmem_cache *d_cursor_cache;
15342+static struct shrinker *d_cursor_shrinker;
15343+
15344+/* list of unused cursors */
15345+static LIST_HEAD(cursor_cache);
15346+
15347+/* number of cursors in the list of unused cursors */
15348+static unsigned long d_cursor_unused = 0;
15349+
15350+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15351+DEFINE_SPINLOCK(d_lock);
15352+
15353+static reiser4_file_fsdata *create_fsdata(struct file *file);
15354+static int file_is_stateless(struct file *file);
15355+static void free_fsdata(reiser4_file_fsdata *fsdata);
15356+static void kill_cursor(dir_cursor *);
15357+
15358+/**
15359+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15360+ * @nr: number of objects to free
15361+ * @mask: GFP mask
15362+ *
15363+ * Shrinks d_cursor_cache. Scans the LRU list of unused cursors, freeing the
15364+ * requested number. Returns the number of still freeable cursors.
15365+ */
15366+static int d_cursor_shrink(int nr, gfp_t mask)
15367+{
15368+ if (nr != 0) {
15369+ dir_cursor *scan;
15370+ int killed;
15371+
15372+ killed = 0;
15373+ spin_lock(&d_lock);
15374+ while (!list_empty(&cursor_cache)) {
15375+ scan = list_entry(cursor_cache.next, dir_cursor, alist);
15376+ assert("nikita-3567", scan->ref == 0);
15377+ kill_cursor(scan);
15378+ ++killed;
15379+ --nr;
15380+ if (nr == 0)
15381+ break;
15382+ }
15383+ spin_unlock(&d_lock);
15384+ }
15385+ return d_cursor_unused;
15386+}
15387+
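/* Editorial note: d_cursor_shrink() above follows the 2.6-era shrinker
 * contract, modeled stand-alone below (a sketch, not kernel code): the VM
 * first calls with nr == 0 to ask how much is freeable, then with nr > 0 to
 * free that many entries; either way the callback returns the number of
 * objects still freeable. model_shrink() is a hypothetical name. */
static int model_shrink(int nr, int *nr_cached)
{
	if (nr > 0)
		*nr_cached -= (nr < *nr_cached) ? nr : *nr_cached;
	return *nr_cached;
}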
15388+/**
15389+ * reiser4_init_d_cursor - create d_cursor cache
15390+ *
15391+ * Initializes slab cache of d_cursors. It is part of reiser4 module
15392+ * initialization.
15393+ */
15394+int reiser4_init_d_cursor(void)
15395+{
15396+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15397+ SLAB_HWCACHE_ALIGN, NULL, NULL);
15398+ if (d_cursor_cache == NULL)
15399+ return RETERR(-ENOMEM);
15400+
15401+ /*
15402+ * actually, d_cursors are "priceless", because there is no way to
15403+ * recover information stored in them. On the other hand, we don't
15404+ * want to consume all kernel memory by them. As a compromise, just
15405+ * assign higher "seeks" value to d_cursor cache, so that it will be
15406+ * shrunk only if system is really tight on memory.
15407+ */
15408+ d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15409+ d_cursor_shrink);
15410+ if (d_cursor_shrinker == NULL) {
15411+ destroy_reiser4_cache(&d_cursor_cache);
15412+ d_cursor_cache = NULL;
15413+ return RETERR(-ENOMEM);
15414+ }
15415+ return 0;
15416+}
15417+
15418+/**
15419+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15420+ *
15421+ * This is called on reiser4 module unloading or system shutdown.
15422+ */
15423+void reiser4_done_d_cursor(void)
15424+{
15425+ BUG_ON(d_cursor_shrinker == NULL);
15426+ remove_shrinker(d_cursor_shrinker);
15427+ d_cursor_shrinker = NULL;
15428+
15429+ destroy_reiser4_cache(&d_cursor_cache);
15430+}
15431+
15432+#define D_CURSOR_TABLE_SIZE (256)
15433+
15434+static inline unsigned long
15435+d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key)
15436+{
15437+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15438+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15439+}
15440+
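/* Worked example for d_cursor_hash() above: with D_CURSOR_TABLE_SIZE == 256
 * the bucket is just the low 8 bits of (oid + cid), e.g. oid = 0x1234,
 * cid = 0x0f gives (0x1234 + 0x0f) & 0xff = 0x43. The IS_POW assertion
 * guarantees the mask form is equivalent to a modulo. */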
15441+static inline int d_cursor_eq(const struct d_cursor_key *k1,
15442+ const struct d_cursor_key *k2)
15443+{
15444+ return k1->cid == k2->cid && k1->oid == k2->oid;
15445+}
15446+
15447+/*
15448+ * define functions to manipulate reiser4 super block's hash table of
15449+ * dir_cursors
15450+ */
15451+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15452+#define KFREE(ptr, size) kfree(ptr)
15453+TYPE_SAFE_HASH_DEFINE(d_cursor,
15454+ dir_cursor,
15455+ struct d_cursor_key,
15456+ key, hash, d_cursor_hash, d_cursor_eq);
15457+#undef KFREE
15458+#undef KMALLOC
15459+
15460+/**
15461+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15462+ * @super: super block to initialize
15463+ *
15464+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15465+ * of mount.
15466+ */
15467+int reiser4_init_super_d_info(struct super_block *super)
15468+{
15469+	struct d_cursor_info *p;
15470+
15471+ p = &get_super_private(super)->d_info;
15472+
15473+	INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15474+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15475+}
15476+
15477+/**
15478+ * reiser4_done_super_d_info - release per-super-block d_cursor resources
15479+ * @super: super block being umounted
15480+ *
15481+ * It is called on umount. Kills all directory cursors attached to the super block.
15482+ */
15483+void reiser4_done_super_d_info(struct super_block *super)
15484+{
15485+	struct d_cursor_info *d_info;
15486+ dir_cursor *cursor, *next;
15487+
15488+ d_info = &get_super_private(super)->d_info;
15489+ for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15490+ kill_cursor(cursor);
15491+
15492+ BUG_ON(d_info->tree.rnode != NULL);
15493+ d_cursor_hash_done(&d_info->table);
15494+}
15495+
15496+/**
15497+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15498+ * @cursor: cursor to free
15499+ *
15500+ * Removes the reiser4_file_fsdata attached to @cursor from the readdir list
15501+ * of reiser4_inode and frees that reiser4_file_fsdata. Removes @cursor from
15502+ * indices, hash table and list of unused cursors, and frees it.
15503+ */
15504+static void kill_cursor(dir_cursor *cursor)
15505+{
15506+ unsigned long index;
15507+
15508+ assert("nikita-3566", cursor->ref == 0);
15509+ assert("nikita-3572", cursor->fsdata != NULL);
15510+
15511+ index = (unsigned long)cursor->key.oid;
15512+ list_del_init(&cursor->fsdata->dir.linkage);
15513+ free_fsdata(cursor->fsdata);
15514+ cursor->fsdata = NULL;
15515+
15516+ if (list_empty_careful(&cursor->list))
15517+ /* this is last cursor for a file. Kill radix-tree entry */
15518+ radix_tree_delete(&cursor->info->tree, index);
15519+ else {
15520+ void **slot;
15521+
15522+ /*
15523+ * there are other cursors for the same oid.
15524+ */
15525+
15526+ /*
15527+		 * if the radix tree points to the cursor being removed, re-target
15528+ * radix tree slot to the next cursor in the (non-empty as was
15529+ * checked above) element of the circular list of all cursors
15530+ * for this oid.
15531+ */
15532+ slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15533+ assert("nikita-3571", *slot != NULL);
15534+ if (*slot == cursor)
15535+ *slot = list_entry(cursor->list.next, dir_cursor, list);
15536+ /* remove cursor from circular list */
15537+ list_del_init(&cursor->list);
15538+ }
15539+ /* remove cursor from the list of unused cursors */
15540+ list_del_init(&cursor->alist);
15541+ /* remove cursor from the hash table */
15542+ d_cursor_hash_remove(&cursor->info->table, cursor);
15543+ /* and free it */
15544+ kmem_cache_free(d_cursor_cache, cursor);
15545+ --d_cursor_unused;
15546+}
15547+
15548+/* possible actions that can be performed on all cursors for the given file */
15549+enum cursor_action {
15550+ /*
15551+ * load all detached state: this is called when stat-data is loaded
15552+ * from the disk to recover information about all pending readdirs
15553+ */
15554+ CURSOR_LOAD,
15555+ /*
15556+ * detach all state from inode, leaving it in the cache. This is called
15557+	 * when the inode is removed from memory by memory pressure
15558+ */
15559+ CURSOR_DISPOSE,
15560+ /*
15561+ * detach cursors from the inode, and free them. This is called when
15562+ * inode is destroyed
15563+ */
15564+ CURSOR_KILL
15565+};
15566+
15567+/*
15568+ * return d_cursor data for the file system @inode is in.
15569+ */
15570+static inline struct d_cursor_info *d_info(struct inode *inode)
15571+{
15572+ return &get_super_private(inode->i_sb)->d_info;
15573+}
15574+
15575+/*
15576+ * lookup d_cursor in the per-super-block radix tree.
15577+ */
15578+static inline dir_cursor *lookup(struct d_cursor_info * info,
15579+ unsigned long index)
15580+{
15581+ return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15582+}
15583+
15584+/*
15585+ * attach @cursor to the radix tree. There may be multiple cursors for the
15586+ * same oid, they are chained into circular list.
15587+ */
15588+static void bind_cursor(dir_cursor * cursor, unsigned long index)
15589+{
15590+ dir_cursor *head;
15591+
15592+ head = lookup(cursor->info, index);
15593+ if (head == NULL) {
15594+ /* this is the first cursor for this index */
15595+ INIT_LIST_HEAD(&cursor->list);
15596+ radix_tree_insert(&cursor->info->tree, index, cursor);
15597+ } else {
15598+ /* some cursor already exists. Chain ours */
15599+ list_add(&cursor->list, &head->list);
15600+ }
15601+}
15602+
15603+/*
15604+ * detach fsdata (if detachable) from file descriptor, and put cursor on the
15605+ * "unused" list. Called when file descriptor is not longer in active use.
15606+ */
15607+static void clean_fsdata(struct file *file)
15608+{
15609+ dir_cursor *cursor;
15610+ reiser4_file_fsdata *fsdata;
15611+
15612+ assert("nikita-3570", file_is_stateless(file));
15613+
15614+ fsdata = (reiser4_file_fsdata *) file->private_data;
15615+ if (fsdata != NULL) {
15616+ cursor = fsdata->cursor;
15617+ if (cursor != NULL) {
15618+ spin_lock(&d_lock);
15619+ --cursor->ref;
15620+ if (cursor->ref == 0) {
15621+ list_add_tail(&cursor->alist, &cursor_cache);
15622+ ++d_cursor_unused;
15623+ }
15624+ spin_unlock(&d_lock);
15625+ file->private_data = NULL;
15626+ }
15627+ }
15628+}
15629+
15630+/*
15631+ * global counter used to generate "client ids". These ids are encoded into
15632+ * high bits of fpos.
15633+ */
15634+static __u32 cid_counter = 0;
15635+#define CID_SHIFT (20)
15636+#define CID_MASK (0xfffffull)
15637+
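/* A small stand-alone sketch of the cookie layout implied by CID_SHIFT and
 * CID_MASK above: the client id lives in the bits above bit 20 of f_pos, the
 * directory position in the low 20 bits (compare reiser4_get_dir_fpos()
 * below). encode()/decode_pos() are hypothetical helpers for illustration. */
#include <stdint.h>

static uint64_t encode(uint32_t cid, uint64_t pos)
{
	return ((uint64_t)cid << 20) | (pos & 0xfffffull);
}

static uint64_t decode_pos(uint64_t cookie)
{
	return cookie & 0xfffffull;	/* the part that is a real position */
}
/* e.g. encode(3, 7) == 0x300007; decode_pos(0x300007) == 7 */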
15638+static void free_file_fsdata_nolock(struct file *);
15639+
15640+/**
15641+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15642+ * @cursor:
15643+ * @file:
15644+ * @inode:
15645+ *
15646+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15647+ * reiser4 super block's hash table and radix tree, and adds detachable
15648+ * readdir state to @file.
15650+ */
15651+static int insert_cursor(dir_cursor *cursor, struct file *file,
15652+ struct inode *inode)
15653+{
15654+ int result;
15655+ reiser4_file_fsdata *fsdata;
15656+
15657+ memset(cursor, 0, sizeof *cursor);
15658+
15659+ /* this is either first call to readdir, or rewind. Anyway, create new
15660+ * cursor. */
15661+ fsdata = create_fsdata(NULL);
15662+ if (fsdata != NULL) {
15663+		result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15664+		if (result == 0) {
15665+			struct d_cursor_info *info;
15666+ oid_t oid;
15667+
15668+ info = d_info(inode);
15669+ oid = get_inode_oid(inode);
15670+ /* cid occupies higher 12 bits of f->f_pos. Don't
15671+ * allow it to become negative: this confuses
15672+ * nfsd_readdir() */
15673+ cursor->key.cid = (++cid_counter) & 0x7ff;
15674+ cursor->key.oid = oid;
15675+ cursor->fsdata = fsdata;
15676+ cursor->info = info;
15677+ cursor->ref = 1;
15678+
15679+ spin_lock_inode(inode);
15680+ /* install cursor as @f's private_data, discarding old
15681+ * one if necessary */
15682+#if REISER4_DEBUG
15683+ if (file->private_data)
15684+ warning("", "file has fsdata already");
15685+#endif
15686+ clean_fsdata(file);
15687+ free_file_fsdata_nolock(file);
15688+ file->private_data = fsdata;
15689+ fsdata->cursor = cursor;
15690+ spin_unlock_inode(inode);
15691+ spin_lock(&d_lock);
15692+ /* insert cursor into hash table */
15693+ d_cursor_hash_insert(&info->table, cursor);
15694+ /* and chain it into radix-tree */
15695+ bind_cursor(cursor, (unsigned long)oid);
15696+ spin_unlock(&d_lock);
15697+ radix_tree_preload_end();
15698+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15699+ }
15700+ } else
15701+ result = RETERR(-ENOMEM);
15702+ return result;
15703+}
15704+
15705+/**
15706+ * process_cursors - do action on each cursor attached to inode
15707+ * @inode:
15708+ * @act: action to do
15709+ *
15710+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15711+ * and performs action specified by @act on each of cursors.
15712+ */
15713+static void process_cursors(struct inode *inode, enum cursor_action act)
15714+{
15715+ oid_t oid;
15716+ dir_cursor *start;
15717+ struct list_head *head;
15718+ reiser4_context *ctx;
15719+	struct d_cursor_info *info;
15720+
15721+ /* this can be called by
15722+ *
15723+ * kswapd->...->prune_icache->..reiser4_destroy_inode
15724+ *
15725+ * without reiser4_context
15726+ */
15727+	ctx = reiser4_init_context(inode->i_sb);
15728+ if (IS_ERR(ctx)) {
15729+ warning("vs-23", "failed to init context");
15730+ return;
15731+ }
15732+
15733+ assert("nikita-3558", inode != NULL);
15734+
15735+ info = d_info(inode);
15736+ oid = get_inode_oid(inode);
15737+ spin_lock_inode(inode);
15738+ head = get_readdir_list(inode);
15739+ spin_lock(&d_lock);
15740+	/* find any cursor for this oid: a reference to it is hanging off the
15741+	 * radix tree */
15742+ start = lookup(info, (unsigned long)oid);
15743+ if (start != NULL) {
15744+ dir_cursor *scan;
15745+ reiser4_file_fsdata *fsdata;
15746+
15747+ /* process circular list of cursors for this oid */
15748+ scan = start;
15749+ do {
15750+ dir_cursor *next;
15751+
15752+ next = list_entry(scan->list.next, dir_cursor, list);
15753+ fsdata = scan->fsdata;
15754+ assert("nikita-3557", fsdata != NULL);
15755+ if (scan->key.oid == oid) {
15756+ switch (act) {
15757+ case CURSOR_DISPOSE:
15758+ list_del_init(&fsdata->dir.linkage);
15759+ break;
15760+ case CURSOR_LOAD:
15761+ list_add(&fsdata->dir.linkage, head);
15762+ break;
15763+ case CURSOR_KILL:
15764+ kill_cursor(scan);
15765+ break;
15766+ }
15767+ }
15768+ if (scan == next)
15769+ /* last cursor was just killed */
15770+ break;
15771+ scan = next;
15772+ } while (scan != start);
15773+ }
15774+ spin_unlock(&d_lock);
15775+ /* check that we killed 'em all */
15776+ assert("nikita-3568",
15777+ ergo(act == CURSOR_KILL,
15778+ list_empty_careful(get_readdir_list(inode))));
15779+ assert("nikita-3569",
15780+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15781+ spin_unlock_inode(inode);
15782+ reiser4_exit_context(ctx);
15783+}
15784+
15785+/**
15786+ * reiser4_dispose_cursors - removes cursors from inode's list
15787+ * @inode: inode to dispose cursors of
15788+ *
15789+ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
15790+ * attached to cursor from inode's readdir list. This is called when inode is
15791+ * removed from the memory by memory pressure.
15792+ */
15793+void reiser4_dispose_cursors(struct inode *inode)
15794+{
15795+ process_cursors(inode, CURSOR_DISPOSE);
15796+}
15797+
15798+/**
15799+ * reiser4_load_cursors - attach cursors to inode
15800+ * @inode: inode to load cursors to
15801+ *
15802+ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
15803+ * attached to cursor to inode's readdir list. This is done when inode is
15804+ * loaded into memory.
15805+ */
15806+void reiser4_load_cursors(struct inode *inode)
15807+{
15808+ process_cursors(inode, CURSOR_LOAD);
15809+}
15810+
15811+/**
15812+ * reiser4_kill_cursors - kill all inode cursors
15813+ * @inode: inode to kill cursors of
15814+ *
15815+ * Frees all cursors for this inode. This is called when inode is destroyed.
15816+ */
15817+void reiser4_kill_cursors(struct inode *inode)
15818+{
15819+ process_cursors(inode, CURSOR_KILL);
15820+}
15821+
15822+/**
15823+ * file_is_stateless - check whether @file was created on demand by the NFS server
15824+ * @file:
15825+ *
15826+ * true, if the file descriptor @file was created by the NFS server on demand
15827+ * to serve one file system operation. This means that there may be "detached
15828+ * state" for the underlying inode.
15829+ */
15830+static int file_is_stateless(struct file *file)
15831+{
15832+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15833+}
15834+
15835+/**
15836+ * reiser4_get_dir_fpos - get directory fpos from a user-supplied cookie
15837+ * @dir:
15838+ *
15839+ * Calculates ->fpos from a user-supplied cookie. Normally it is dir->f_pos,
15840+ * but in the case of a stateless directory operation (readdir-over-nfs), the
15841+ * client id was encoded in the high bits of the cookie and should be masked off.
15842+ */
15843+loff_t reiser4_get_dir_fpos(struct file *dir)
15844+{
15845+ if (file_is_stateless(dir))
15846+ return dir->f_pos & CID_MASK;
15847+ else
15848+ return dir->f_pos;
15849+}
15850+
15851+/**
15852+ * reiser4_attach_fsdata - try to attach fsdata
15853+ * @file:
15854+ * @inode:
15855+ *
15856+ * Finds or creates cursor for readdir-over-nfs.
15857+ */
15858+int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15859+{
15860+ loff_t pos;
15861+ int result;
15862+ dir_cursor *cursor;
15863+
15864+ /*
15865+ * we are serialized by inode->i_mutex
15866+ */
15867+ if (!file_is_stateless(file))
15868+ return 0;
15869+
15870+ pos = file->f_pos;
15871+ result = 0;
15872+ if (pos == 0) {
15873+ /*
15874+ * first call to readdir (or rewind to the beginning of
15875+ * directory)
15876+ */
15877+ cursor = kmem_cache_alloc(d_cursor_cache,
15878+ reiser4_ctx_gfp_mask_get());
15879+ if (cursor != NULL)
15880+ result = insert_cursor(cursor, file, inode);
15881+ else
15882+ result = RETERR(-ENOMEM);
15883+ } else {
15884+ /* try to find existing cursor */
15885+		struct d_cursor_key key;
15886+
15887+ key.cid = pos >> CID_SHIFT;
15888+ key.oid = get_inode_oid(inode);
15889+ spin_lock(&d_lock);
15890+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15891+ if (cursor != NULL) {
15892+ /* cursor was found */
15893+ if (cursor->ref == 0) {
15894+ /* move it from unused list */
15895+ list_del_init(&cursor->alist);
15896+ --d_cursor_unused;
15897+ }
15898+ ++cursor->ref;
15899+ }
15900+ spin_unlock(&d_lock);
15901+ if (cursor != NULL) {
15902+ spin_lock_inode(inode);
15903+ assert("nikita-3556", cursor->fsdata->back == NULL);
15904+ clean_fsdata(file);
15905+ free_file_fsdata_nolock(file);
15906+ file->private_data = cursor->fsdata;
15907+ spin_unlock_inode(inode);
15908+ }
15909+ }
15910+ return result;
15911+}
15912+
15913+/**
15914+ * reiser4_detach_fsdata - detach fsdata from struct file
15915+ * @file:
15916+ *
15917+ * detach fsdata, if necessary
15918+ */
15919+void reiser4_detach_fsdata(struct file *file)
15920+{
15921+ struct inode *inode;
15922+
15923+ if (!file_is_stateless(file))
15924+ return;
15925+
15926+ inode = file->f_dentry->d_inode;
15927+ spin_lock_inode(inode);
15928+ clean_fsdata(file);
15929+ spin_unlock_inode(inode);
15930+}
15931+
15932+/* slab for reiser4_dentry_fsdata */
15933+static struct kmem_cache *dentry_fsdata_cache;
15934+
15935+/**
15936+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
15937+ *
15938+ * Initializes the slab cache of structures attached to dentry->d_fsdata. It is
15939+ * part of reiser4 module initialization.
15940+ */
15941+int reiser4_init_dentry_fsdata(void)
15942+{
15943+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
15944+ sizeof(struct reiser4_dentry_fsdata),
15945+ 0,
15946+ SLAB_HWCACHE_ALIGN |
15947+ SLAB_RECLAIM_ACCOUNT, NULL,
15948+ NULL);
44254afd
MT
15949+ if (dentry_fsdata_cache == NULL)
15950+ return RETERR(-ENOMEM);
15951+ return 0;
15952+}
15953+
15954+/**
15955+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
15956+ *
15957+ * This is called on reiser4 module unloading or system shutdown.
15958+ */
15959+void reiser4_done_dentry_fsdata(void)
15960+{
15961+ destroy_reiser4_cache(&dentry_fsdata_cache);
15962+}
15963+
15964+/**
15965+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
15966+ * @dentry: queried dentry
15967+ *
15968+ * Allocates if necessary and returns per-dentry data that we attach to each
15969+ * dentry.
15970+ */
15971+struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
15972+{
15973+ assert("nikita-1365", dentry != NULL);
15974+
15975+ if (dentry->d_fsdata == NULL) {
15976+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
15977+						    reiser4_ctx_gfp_mask_get());
15978+ if (dentry->d_fsdata == NULL)
15979+ return ERR_PTR(RETERR(-ENOMEM));
15980+ memset(dentry->d_fsdata, 0,
15981+ sizeof(struct reiser4_dentry_fsdata));
15982+ }
15983+ return dentry->d_fsdata;
15984+}
15985+
15986+/**
15987+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
15988+ * @dentry: dentry to free fsdata of
15989+ *
15990+ * Detaches and frees fs-specific dentry data
15991+ */
15992+void reiser4_free_dentry_fsdata(struct dentry *dentry)
15993+{
15994+ if (dentry->d_fsdata != NULL) {
15995+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
15996+ dentry->d_fsdata = NULL;
15997+ }
15998+}
15999+
16000+/* slab for reiser4_file_fsdata */
16001+static struct kmem_cache *file_fsdata_cache;
16002+
16003+/**
16004+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16005+ *
16006+ * Initializes slab cache of structures attached to file->private_data. It is
16007+ * part of reiser4 module initialization.
16008+ */
16009+int reiser4_init_file_fsdata(void)
16010+{
16011+ file_fsdata_cache = kmem_cache_create("file_fsdata",
16012+ sizeof(reiser4_file_fsdata),
16013+ 0,
16014+ SLAB_HWCACHE_ALIGN |
16015+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
16016+ if (file_fsdata_cache == NULL)
16017+ return RETERR(-ENOMEM);
16018+ return 0;
16019+}
16020+
16021+/**
16022+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16023+ *
16024+ * This is called on reiser4 module unloading or system shutdown.
16025+ */
16026+void reiser4_done_file_fsdata(void)
16027+{
16028+ destroy_reiser4_cache(&file_fsdata_cache);
16029+}
16030+
16031+/**
16032+ * create_fsdata - allocate and initialize reiser4_file_fsdata
16033+ * @file: what to create file_fsdata for, may be NULL
16034+ *
16035+ * Allocates and initializes reiser4_file_fsdata structure.
16036+ */
16037+static reiser4_file_fsdata *create_fsdata(struct file *file)
16038+{
16039+ reiser4_file_fsdata *fsdata;
16040+
16041+ fsdata = kmem_cache_alloc(file_fsdata_cache,
16042+ reiser4_ctx_gfp_mask_get());
16043+ if (fsdata != NULL) {
16044+ memset(fsdata, 0, sizeof *fsdata);
16045+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16046+ fsdata->back = file;
16047+ INIT_LIST_HEAD(&fsdata->dir.linkage);
16048+ }
16049+ return fsdata;
16050+}
16051+
16052+/**
16053+ * free_fsdata - free reiser4_file_fsdata
16054+ * @fsdata: object to free
16055+ *
16056+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
16057+ */
16058+static void free_fsdata(reiser4_file_fsdata *fsdata)
16059+{
16060+ BUG_ON(fsdata == NULL);
16061+ kmem_cache_free(file_fsdata_cache, fsdata);
16062+}
16063+
16064+/**
16065+ * reiser4_get_file_fsdata - get fs-specific file data
16066+ * @file: queried file
16067+ *
16068+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16069+ * to @file.
16070+ */
16071+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16072+{
16073+ assert("nikita-1603", file != NULL);
16074+
16075+ if (file->private_data == NULL) {
16076+ reiser4_file_fsdata *fsdata;
16077+ struct inode *inode;
16078+
16079+ fsdata = create_fsdata(file);
16080+ if (fsdata == NULL)
16081+ return ERR_PTR(RETERR(-ENOMEM));
16082+
16083+ inode = file->f_dentry->d_inode;
16084+ spin_lock_inode(inode);
16085+ if (file->private_data == NULL) {
16086+ file->private_data = fsdata;
16087+ fsdata = NULL;
16088+ }
16089+ spin_unlock_inode(inode);
16090+ if (fsdata != NULL)
16091+ /* other thread initialized ->fsdata */
16092+ kmem_cache_free(file_fsdata_cache, fsdata);
16093+ }
16094+ assert("nikita-2665", file->private_data != NULL);
16095+ return file->private_data;
16096+}
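reiser4_get_file_fsdata() above follows the classic optimistic-allocation
pattern: allocate without the spinlock held (allocation may sleep), publish
under the lock only if no other thread got there first, and free the loser's
copy. A reduced sketch of the same shape (illustrative only; the struct and
function names are hypothetical, not part of the patch):

	/* lazily attach a private object to a host, race-free */
	struct obj *get_private(struct host *h)
	{
		if (h->private == NULL) {
			struct obj *tmp = alloc_obj();	/* may sleep; no lock held */

			if (tmp == NULL)
				return NULL;
			spin_lock(&h->lock);
			if (h->private == NULL) {	/* still unset: we won */
				h->private = tmp;
				tmp = NULL;
			}
			spin_unlock(&h->lock);
			if (tmp != NULL)
				free_obj(tmp);		/* another thread won the race */
		}
		return h->private;
	}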
16097+
16098+/**
16099+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16100+ * @file: file to detach fsdata from
16101+ *
16102+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16103+ * readdir list and frees it unless it is linked to a d_cursor object.
16104+ */
16105+static void free_file_fsdata_nolock(struct file *file)
16106+{
16107+ reiser4_file_fsdata *fsdata;
16108+
16109+ assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16110+ fsdata = file->private_data;
16111+ if (fsdata != NULL) {
16112+ list_del_init(&fsdata->dir.linkage);
16113+ if (fsdata->cursor == NULL)
16114+ free_fsdata(fsdata);
16115+ }
16116+ file->private_data = NULL;
16117+}
16118+
16119+/**
16120+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16121+ * @file: file to free fsdata of
16122+ *
16123+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16124+ */
16125+void reiser4_free_file_fsdata(struct file *file)
16126+{
16127+ spin_lock_inode(file->f_dentry->d_inode);
16128+ free_file_fsdata_nolock(file);
16129+ spin_unlock_inode(file->f_dentry->d_inode);
16130+}
16131+
16132+/*
16133+ * Local variables:
16134+ * c-indentation-style: "K&R"
16135+ * mode-name: "LC"
16136+ * c-basic-offset: 8
16137+ * tab-width: 8
16138+ * fill-column: 79
16139+ * End:
16140+ */
16141diff -urN linux-2.6.22.orig/fs/reiser4/fsdata.h linux-2.6.22/fs/reiser4/fsdata.h
16142--- linux-2.6.22.orig/fs/reiser4/fsdata.h 1970-01-01 03:00:00.000000000 +0300
16143+++ linux-2.6.22/fs/reiser4/fsdata.h 2007-07-29 00:25:34.868694406 +0400
16144@@ -0,0 +1,205 @@
16145+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16146+ * reiser4/README */
16147+
16148+#if !defined( __REISER4_FSDATA_H__ )
16149+#define __REISER4_FSDATA_H__
16150+
16151+#include "debug.h"
16152+#include "kassign.h"
16153+#include "seal.h"
16154+#include "type_safe_hash.h"
16155+#include "plugin/file/file.h"
16156+#include "readahead.h"
16157+
16158+/*
16159+ * comment about reiser4_dentry_fsdata
16160+ *
16161+ *
16162+ */
16163+
16164+/*
16165+ * locking: fields of per file descriptor readdir_pos and ->f_pos are
16166+ * protected by ->i_mutex on inode. Under this lock the following invariant
16167+ * holds:
16168+ *
16169+ * file descriptor is "looking" at the entry_no-th directory entry from
16170+ * the beginning of directory. This entry has key dir_entry_key and is
16171+ * pos-th entry with duplicate-key sequence.
16172+ *
16173+ */
16174+
16175+/* logical position within directory */
16176+struct dir_pos {
16177+ /* key of directory entry (actually, part of a key sufficient to
16178+ identify directory entry) */
16179+ de_id dir_entry_key;
16180+ /* ordinal number of directory entry among all entries with the same
16181+ key. (Starting from 0.) */
16182+ unsigned pos;
16183+};
16184+
16185+struct readdir_pos {
16186+ /* f_pos corresponding to this readdir position */
16187+ __u64 fpos;
16188+ /* logical position within directory */
16189+ struct dir_pos position;
16190+ /* logical number of directory entry within
16191+ directory */
16192+ __u64 entry_no;
16193+};
16194+
16195+/*
16196+ * this is used to speed up lookups for directory entry: on initial call to
16197+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
16198+ * in struct dentry and reused later to avoid tree traversals.
16199+ */
16200+struct de_location {
16201+ /* seal covering directory entry */
16202+ seal_t entry_seal;
16203+ /* coord of directory entry */
16204+ coord_t entry_coord;
16205+ /* ordinal number of directory entry among all entries with the same
16206+ key. (Starting from 0.) */
16207+ int pos;
16208+};
16209+
16210+/**
16211+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16212+ *
16213+ * This is allocated dynamically and released in d_op->d_release()
16214+ *
16215+ * Currently it only contains cached location (hint) of directory entry, but
16216+ * it is expected that other information will be accumulated here.
16217+ */
16218+struct reiser4_dentry_fsdata {
16219+ /*
16220+ * here will go fields filled by ->lookup() to speed up the next
16221+ * create/unlink, like blocknr of znode with stat-data, or key of
16222+ * stat-data.
16223+ */
16224+ struct de_location dec;
16225+ int stateless; /* created through reiser4_decode_fh, needs special
16226+ * treatment in readdir. */
16227+};
16228+
16229+extern int reiser4_init_dentry_fsdata(void);
16230+extern void reiser4_done_dentry_fsdata(void);
16231+extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16232+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16233+
16234+/**
16235+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16236+ *
16237+ * This is allocated dynamically and released in inode->i_fop->release
16238+ */
16239+typedef struct reiser4_file_fsdata {
16240+ /*
16241+ * pointer back to the struct file which this reiser4_file_fsdata is
16242+ * part of
16243+ */
16244+ struct file *back;
16245+ /* detached cursor for stateless readdir. */
16246+ struct dir_cursor *cursor;
16247+ /*
16248+ * We need both directory and regular file parts here, because there
16249+ * are file system objects that are files and directories.
16250+ */
16251+ struct {
16252+ /*
16253+ * position in directory. It is updated each time directory is
16254+ * modified
16255+ */
16256+ struct readdir_pos readdir;
16257+ /* head of this list is reiser4_inode->lists.readdir_list */
16258+ struct list_head linkage;
16259+ } dir;
16260+ /* hints to speed up operations with regular files: read and write. */
16261+ struct {
16262+ hint_t hint;
16263+ } reg;
16264+ struct reiser4_file_ra_state ra1;
16265+
16266+} reiser4_file_fsdata;
16267+
16268+extern int reiser4_init_file_fsdata(void);
16269+extern void reiser4_done_file_fsdata(void);
16270+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16271+extern void reiser4_free_file_fsdata(struct file *);
16272+
16273+/*
16274+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16275+ * used to address a problem reiser4 has with readdir accesses via NFS. See
16276+ * plugin/file_ops_readdir.c for more details.
16277+ */
16278+struct d_cursor_key {
16279+ __u16 cid;
16280+ __u64 oid;
16281+};
16282+
16283+/*
16284+ * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16285+ * maintain hash table of dir_cursor-s in reiser4's super block
16286+ */
16287+typedef struct dir_cursor dir_cursor;
16288+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16289+
16290+struct dir_cursor {
16291+ int ref;
16292+ reiser4_file_fsdata *fsdata;
16293+
16294+ /* link to reiser4 super block hash table of cursors */
16295+ d_cursor_hash_link hash;
16296+
16297+ /*
16298+ * this is to link cursors to reiser4 super block's radix tree of
16299+ * cursors if there is more than one cursor with the same objectid
16300+ */
16301+ struct list_head list;
16302+ struct d_cursor_key key;
16303+ struct d_cursor_info *info;
16304+ /* list of unused cursors */
16305+ struct list_head alist;
16306+};
16307+
16308+extern int reiser4_init_d_cursor(void);
16309+extern void reiser4_done_d_cursor(void);
16310+
16311+extern int reiser4_init_super_d_info(struct super_block *);
16312+extern void reiser4_done_super_d_info(struct super_block *);
16313+
16314+extern loff_t reiser4_get_dir_fpos(struct file *);
16315+extern int reiser4_attach_fsdata(struct file *, struct inode *);
16316+extern void reiser4_detach_fsdata(struct file *);
16317+
16318+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16319+ more details */
16320+void reiser4_dispose_cursors(struct inode *inode);
16321+void reiser4_load_cursors(struct inode *inode);
16322+void reiser4_kill_cursors(struct inode *inode);
16323+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16324+ int offset, int adj);
16325+
16326+/*
16327+ * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16328+ * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16329+ */
16330+struct d_cursor_info {
16331+ d_cursor_hash_table table;
16332+ struct radix_tree_root tree;
16333+};
16334+
16335+/* spinlock protecting readdir cursors */
16336+extern spinlock_t d_lock;
16337+
16338+/* __REISER4_FSDATA_H__ */
16339+#endif
16340+
16341+/*
16342+ * Local variables:
16343+ * c-indentation-style: "K&R"
16344+ * mode-name: "LC"
16345+ * c-basic-offset: 8
16346+ * tab-width: 8
16347+ * fill-column: 120
16348+ * End:
16349+ */
16350diff -urN linux-2.6.22.orig/fs/reiser4/init_super.c linux-2.6.22/fs/reiser4/init_super.c
16351--- linux-2.6.22.orig/fs/reiser4/init_super.c 1970-01-01 03:00:00.000000000 +0300
16352+++ linux-2.6.22/fs/reiser4/init_super.c 2007-07-29 00:25:34.868694406 +0400
16353@@ -0,0 +1,752 @@
16354+/* Copyright by Hans Reiser, 2003 */
16355+
16356+#include "super.h"
16357+#include "inode.h"
16358+#include "plugin/plugin_set.h"
16359+
16360+#include <linux/swap.h>
16361+
16362+/**
16363+ * init_fs_info - allocate reiser4 specific super block
16364+ * @super: super block of filesystem
16365+ *
16366+ * Allocates and initializes reiser4_super_info_data, attaches it to
16367+ * super->s_fs_info, initializes structures maintaining d_cursor-s.
16368+ */
16369+int reiser4_init_fs_info(struct super_block *super)
16370+{
16371+ reiser4_super_info_data *sbinfo;
16372+
16373+ sbinfo = kmalloc(sizeof(reiser4_super_info_data),
16374+ reiser4_ctx_gfp_mask_get());
16375+ if (!sbinfo)
16376+ return RETERR(-ENOMEM);
16377+
16378+ super->s_fs_info = sbinfo;
16379+ super->s_op = NULL;
16380+ memset(sbinfo, 0, sizeof(*sbinfo));
16381+
16382+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16383+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16384+
16385+ mutex_init(&sbinfo->delete_mutex);
16386+ spin_lock_init(&(sbinfo->guard));
16387+
16388+ /* initialize per-super-block d_cursor resources */
16389+ reiser4_init_super_d_info(super);
16390+
16391+ return 0;
16392+}
16393+
16394+/**
16395+ * reiser4_done_fs_info - free reiser4 specific super block
16396+ * @super: super block of filesystem
16397+ *
16398+ * Performs some sanity checks, releases structures maintaining d_cursor-s,
16399+ * frees reiser4_super_info_data.
16400+ */
16401+void reiser4_done_fs_info(struct super_block *super)
16402+{
16403+ assert("zam-990", super->s_fs_info != NULL);
16404+
16405+ /* release per-super-block d_cursor resources */
16406+ reiser4_done_super_d_info(super);
16407+
16408+ /* make sure that there are no jnodes already */
16409+ assert("", list_empty(&get_super_private(super)->all_jnodes));
16410+ assert("", get_current_context()->trans->atom == NULL);
16411+ reiser4_check_block_counters(super);
16412+ kfree(super->s_fs_info);
16413+ super->s_fs_info = NULL;
16414+}
16415+
16416+/* type of option parseable by parse_option() */
16417+typedef enum {
16418+ /* value of option is arbitrary string */
16419+ OPT_STRING,
16420+
16421+ /*
16422+ * option specifies bit in a bitmask. When option is set - bit in
16423+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16424+ * dont_load_bitmap, atomic_write.
16425+ */
16426+ OPT_BIT,
16427+
16428+ /*
16429+ * value of option should conform to sprintf() format. Examples are
16430+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16431+ */
16432+ OPT_FORMAT,
16433+
16434+ /*
16435+ * option can take one of predefined values. Example is onerror=panic or
16436+ * onerror=remount-ro
16437+ */
16438+ OPT_ONEOF,
16439+} opt_type_t;
16440+
16441+#if 0
16442+struct opt_bitmask_bit {
44254afd
MT
16443+ const char *bit_name;
16444+ int bit_nr;
16445+};
16446+#endif
16447+
16448+/* description of option parseable by parse_option() */
16449+struct opt_desc {
16450+ /* option name.
16451+
16452+ parsed portion of string has a form "name=value".
16453+ */
16454+ const char *name;
16455+ /* type of option */
16456+ opt_type_t type;
16457+ union {
16458+ /* where to store value of string option (type == OPT_STRING) */
16459+ char **string;
16460+ /* description of bits for bit option (type == OPT_BIT) */
16461+ struct {
16462+ int nr;
16463+ void *addr;
16464+ } bit;
16465+ /* description of format and targets for format option (type
16466+ == OPT_FORMAT) */
16467+ struct {
16468+ const char *format;
16469+ int nr_args;
16470+ void *arg1;
16471+ void *arg2;
16472+ void *arg3;
16473+ void *arg4;
16474+ } f;
16475+ struct {
16476+ int *result;
16477+ const char *list[10];
16478+ } oneof;
16479+ struct {
16480+ void *addr;
16481+ int nr_bits;
16482+ //struct opt_bitmask_bit *bits;
16483+ } bitmask;
16484+ } u;
16485+};
16486+
16487+/**
16488+ * parse_option - parse one option
16489+ * @opt_string: starting point of parsing
16490+ * @opt: option description
16491+ *
16492+ * foo=bar,
16493+ * ^ ^ ^
16494+ * | | +-- replaced to '\0'
16495+ * | +-- val_start
16496+ * +-- opt_string
16497+ * Figures out option type and handles option correspondingly.
16498+ */
16499+static int parse_option(char *opt_string, struct opt_desc *opt)
16500+{
16501+ char *val_start;
16502+ int result;
16503+ const char *err_msg;
16504+
16505+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16506+
16507+ val_start = strchr(opt_string, '=');
16508+ if (val_start != NULL) {
16509+ *val_start = '\0';
16510+ ++val_start;
16511+ }
16512+
16513+ err_msg = NULL;
16514+ result = 0;
16515+ switch (opt->type) {
16516+ case OPT_STRING:
16517+ if (val_start == NULL) {
16518+ err_msg = "String arg missing";
16519+ result = RETERR(-EINVAL);
16520+ } else
16521+ *opt->u.string = val_start;
16522+ break;
16523+ case OPT_BIT:
16524+ if (val_start != NULL)
16525+ err_msg = "Value ignored";
16526+ else
16527+ set_bit(opt->u.bit.nr, opt->u.bit.addr);
16528+ break;
16529+ case OPT_FORMAT:
16530+ if (val_start == NULL) {
16531+ err_msg = "Formatted arg missing";
16532+ result = RETERR(-EINVAL);
16533+ break;
16534+ }
16535+ if (sscanf(val_start, opt->u.f.format,
16536+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16537+ opt->u.f.arg4) != opt->u.f.nr_args) {
16538+ err_msg = "Wrong conversion";
16539+ result = RETERR(-EINVAL);
16540+ }
16541+ break;
16542+ case OPT_ONEOF:
16543+ {
16544+ int i = 0;
16545+
16546+ if (val_start == NULL) {
16547+ err_msg = "Value is missing";
16548+ result = RETERR(-EINVAL);
16549+ break;
16550+ }
16551+ err_msg = "Wrong option value";
16552+ result = RETERR(-EINVAL);
16553+ while (opt->u.oneof.list[i]) {
16554+ if (!strcmp(opt->u.oneof.list[i], val_start)) {
16555+ result = 0;
16556+ err_msg = NULL;
16557+ *opt->u.oneof.result = i;
16558+ break;
16559+ }
16560+ i++;
16561+ }
16562+ break;
16563+ }
16564+ default:
16565+ wrong_return_value("nikita-2100", "opt -> type");
16566+ break;
16567+ }
16568+ if (err_msg != NULL) {
16569+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16570+ err_msg, opt->name, val_start ? "=" : "",
16571+ val_start ? : "");
16572+ }
16573+ return result;
16574+}
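For illustration (not part of the patch), this is how an OPT_ONEOF descriptor
resolves a value to an index into its list; note that parse_option() tokenizes
in place, so the buffer must be writable:

	int onerror;
	struct opt_desc desc = {
		.name = "onerror",
		.type = OPT_ONEOF,
		.u = { .oneof = { .result = &onerror,
				  .list = { "panic", "remount-ro", NULL } } }
	};
	char buf[] = "onerror=remount-ro";	/* '=' is overwritten with '\0' */

	if (parse_option(buf, &desc) == 0)
		/* onerror == 1, the index of "remount-ro" */;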
16575+
16576+/**
16577+ * parse_options - parse reiser4 mount options
16578+ * @opt_string: starting point
16579+ * @opts: array of option description
16580+ * @nr_opts: number of elements in @opts
16581+ *
16582+ * Parses comma separated list of reiser4 mount options.
16583+ */
16584+static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16585+{
16586+ int result;
16587+
16588+ result = 0;
16589+ while ((result == 0) && opt_string && *opt_string) {
16590+ int j;
16591+ char *next;
16592+
16593+ next = strchr(opt_string, ',');
16594+ if (next != NULL) {
16595+ *next = '\0';
16596+ ++next;
16597+ }
16598+ for (j = 0; j < nr_opts; ++j) {
16599+ if (!strncmp(opt_string, opts[j].name,
16600+ strlen(opts[j].name))) {
16601+ result = parse_option(opt_string, &opts[j]);
16602+ break;
16603+ }
16604+ }
16605+ if (j == nr_opts) {
16606+ warning("nikita-2307", "Unrecognized option: \"%s\"",
16607+ opt_string);
16608+ /* traditionally, -EINVAL is returned on wrong mount
16609+ option */
16610+ result = RETERR(-EINVAL);
16611+ }
16612+ opt_string = next;
16613+ }
16614+ return result;
16615+}
16616+
16617+#define NUM_OPT( label, fmt, addr ) \
16618+ { \
16619+ .name = ( label ), \
16620+ .type = OPT_FORMAT, \
16621+ .u = { \
16622+ .f = { \
16623+ .format = ( fmt ), \
16624+ .nr_args = 1, \
16625+ .arg1 = ( addr ), \
16626+ .arg2 = NULL, \
16627+ .arg3 = NULL, \
16628+ .arg4 = NULL \
16629+ } \
16630+ } \
16631+ }
16632+
16633+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16634+
16635+#define BIT_OPT(label, bitnr) \
16636+ { \
16637+ .name = label, \
16638+ .type = OPT_BIT, \
16639+ .u = { \
16640+ .bit = { \
16641+ .nr = bitnr, \
16642+ .addr = &sbinfo->fs_flags \
16643+ } \
16644+ } \
16645+ }
16646+
16647+#define MAX_NR_OPTIONS (30)
16648+
16649+/**
16650+ * reiser4_init_super_data - initialize reiser4 private super block
16651+ * @super: super block to initialize
16652+ * @opt_string: list of reiser4 mount options
16653+ *
16654+ * Sets various reiser4 parameters to default values. Parses mount options and
16655+ * overwrites default settings.
16656+ */
16657+int reiser4_init_super_data(struct super_block *super, char *opt_string)
16658+{
16659+ int result;
16660+ struct opt_desc *opts, *p;
16661+ reiser4_super_info_data *sbinfo = get_super_private(super);
16662+
16663+ /* initialize super, export, dentry operations */
16664+ sbinfo->ops.super = reiser4_super_operations;
16665+ sbinfo->ops.export = reiser4_export_operations;
16666+ sbinfo->ops.dentry = reiser4_dentry_operations;
16667+ super->s_op = &sbinfo->ops.super;
16668+ super->s_export_op = &sbinfo->ops.export;
16669+
16670+ /* initialize transaction manager parameters to default values */
16671+ sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16672+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16673+ sbinfo->tmgr.atom_min_size = 256;
16674+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16675+
16676+ /* initialize cbk cache parameter */
16677+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16678+
16679+ /* initialize flush parameters */
16680+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16681+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16682+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16683+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16684+
16685+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16686+
16687+ /* preliminary tree initializations */
16688+ sbinfo->tree.super = super;
16689+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16690+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16691+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16692+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16693+ rwlock_init(&(sbinfo->tree.tree_lock));
16694+ spin_lock_init(&(sbinfo->tree.epoch_lock));
16695+
16696+ /* initialize default readahead params */
16697+ sbinfo->ra_params.max = num_physpages / 4;
16698+ sbinfo->ra_params.flags = 0;
16699+
16700+ /* allocate memory for structure describing reiser4 mount options */
16701+ opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
16702+ reiser4_ctx_gfp_mask_get());
16703+ if (opts == NULL)
16704+ return RETERR(-ENOMEM);
16705+
16706+ /* initialize structure describing reiser4 mount options */
16707+ p = opts;
16708+
16709+#if REISER4_DEBUG
16710+# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \
16711+ warning ("zam-1046", "opt array is overloaded"); break; \
16712+ }
16713+#else
16714+# define OPT_ARRAY_CHECK noop
16715+#endif
16716+
16717+#define PUSH_OPT(...) \
16718+do { \
16719+ struct opt_desc o = __VA_ARGS__; \
16720+ OPT_ARRAY_CHECK; \
16721+ *p ++ = o; \
16722+} while (0)
16723+
16724+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16725+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16726+
16727+ /*
16728+ * tmgr.atom_max_size=N
16729+ * Atoms containing more than N blocks will be forced to commit. N is
16730+ * decimal.
16731+ */
16732+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16733+ /*
16734+ * tmgr.atom_max_age=N
16735+ * Atoms older than N seconds will be forced to commit. N is decimal.
16736+ */
16737+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16738+ /*
16739+ * tmgr.atom_min_size=N
16740+ * When committing an atom to free dirty pages, force an atom smaller
16741+ * than N blocks to fuse with another one.
16742+ */
16743+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16744+ /*
16745+ * tmgr.atom_max_flushers=N
16746+ * limit of concurrent flushers for one atom. 0 means no limit.
16747+ */
16748+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16749+ /*
16750+ * tree.cbk_cache_slots=N
16751+ * Number of slots in the cbk cache.
16752+ */
16753+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16754+ /*
16755+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16756+ * leaf-level blocks it will force them to be relocated.
16757+ */
16758+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16759+ /*
16760+ * If flush can find a block allocation closer than
16761+ * FLUSH_RELOCATE_DISTANCE from the preceder, it will relocate to that
16762+ * position.
16763+ */
16764+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16765+ /*
16766+ * If we have written this many or more blocks before encountering a busy
16767+ * jnode in the flush list, abort flushing in the hope that next time we
16768+ * are called this jnode will already be clean, and we will save some
16769+ * seeks.
16770+ */
16771+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16772+ /* The maximum number of nodes to scan left on a level during flush. */
16773+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16774+ /* preferred IO size */
16775+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16776+ /* carry flags used for insertion of new nodes */
16777+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16778+ /* carry flags used for insertion of new extents */
16779+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16780+ /* carry flags used for paste operations */
16781+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16782+ /* carry flags used for insert operations */
16783+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16784+
16785+#ifdef CONFIG_REISER4_BADBLOCKS
16786+ /*
16787+ * Alternative master superblock location in case its original
16788+ * location is not writeable/accessible. This is an offset in BYTES.
16789+ */
16790+ PUSH_SB_FIELD_OPT(altsuper, "%lu");
16791+#endif
16792+
16793+ /* turn on BSD-style gid assignment */
16794+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16795+ /* turn on 32 bit times */
16796+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16797+ /*
16798+ * Don't load all bitmap blocks at mount time, it is useful for
16799+ * machines with tiny RAM and large disks.
16800+ */
16801+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16802+ /* disable transaction commits during write() */
16803+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16804+ /* disable use of write barriers in the reiser4 log writer. */
16805+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16806+
16807+ PUSH_OPT(
16808+ {
16809+ /*
16810+ * tree traversal readahead parameters:
16811+ * -o readahead:MAXNUM:FLAGS
16812+ * MAXNUM - max number of nodes to request readahead for: -1UL
16813+ * will set it to max_sane_readahead()
16814+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
16815+ * CONTINUE_ON_PRESENT
16816+ */
16817+ .name = "readahead",
16818+ .type = OPT_FORMAT,
16819+ .u = {
16820+ .f = {
16821+ .format = "%u:%u",
16822+ .nr_args = 2,
16823+ .arg1 = &sbinfo->ra_params.max,
16824+ .arg2 = &sbinfo->ra_params.flags,
16825+ .arg3 = NULL,
16826+ .arg4 = NULL
16827+ }
16828+ }
16829+ }
16830+ );
16831+
16832+ /* What to do in case of fs error */
16833+ PUSH_OPT(
16834+ {
16835+ .name = "onerror",
16836+ .type = OPT_ONEOF,
16837+ .u = {
16838+ .oneof = {
16839+ .result = &sbinfo->onerror,
16840+ .list = {
16841+ "panic", "remount-ro", NULL
16842+ },
16843+ }
16844+ }
16845+ }
16846+ );
16847+
16848+ /* modify default settings to values set by mount options */
16849+ result = parse_options(opt_string, opts, p - opts);
16850+ kfree(opts);
16851+ if (result != 0)
16852+ return result;
16853+
16854+ /* correct settings to sanity values */
16855+ sbinfo->tmgr.atom_max_age *= HZ;
16856+ if (sbinfo->tmgr.atom_max_age <= 0)
16857+ /* overflow */
16858+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16859+
16860+ /* round optimal io size down to a multiple of 512 bytes */
16861+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16862+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16863+ if (sbinfo->optimal_io_size == 0) {
16864+ warning("nikita-2497", "optimal_io_size is too small");
16865+ return RETERR(-EINVAL);
16866+ }
16867+ return result;
16868+}
16869+
16870+/**
16871+ * reiser4_init_read_super - read reiser4 master super block
16872+ * @super: super block to fill
16873+ * @silent: if 0 - print warnings
16874+ *
16875+ * Reads reiser4 master super block either from predefined location or from
16876+ * location specified by altsuper mount option, initializes disk format plugin.
16877+ */
16878+int reiser4_init_read_super(struct super_block *super, int silent)
16879+{
16880+ struct buffer_head *super_bh;
16881+ struct reiser4_master_sb *master_sb;
16882+ reiser4_super_info_data *sbinfo = get_super_private(super);
16883+ unsigned long blocksize;
16884+
16885+ read_super_block:
16886+#ifdef CONFIG_REISER4_BADBLOCKS
16887+ if (sbinfo->altsuper)
16888+ /*
16889+ * read reiser4 master super block at position specified by
16890+ * mount option
16891+ */
16892+ super_bh = sb_bread(super,
16893+ (sector_t)(sbinfo->altsuper / super->s_blocksize));
16894+ else
16895+#endif
16896+ /* read reiser4 master super block at the 16th 4096-byte block */
16897+ super_bh = sb_bread(super,
16898+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16899+ if (!super_bh)
16900+ return RETERR(-EIO);
16901+
16902+ master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16903+ /* check reiser4 magic string */
16904+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16905+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
16906+ /* reiser4 master super block contains filesystem blocksize */
16907+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16908+
16909+ if (blocksize != PAGE_CACHE_SIZE) {
16910+ /*
16911+ * currently reiser4's blocksize must be equal to
16912+ * pagesize
16913+ */
16914+ if (!silent)
16915+ warning("nikita-2609",
16916+ "%s: wrong block size %ld\n", super->s_id,
16917+ blocksize);
16918+ brelse(super_bh);
16919+ return RETERR(-EINVAL);
16920+ }
16921+ if (blocksize != super->s_blocksize) {
16922+ /*
16923+ * filesystem uses different blocksize. Reread master
16924+ * super block with correct blocksize
16925+ */
16926+ brelse(super_bh);
16927+ if (!sb_set_blocksize(super, (int)blocksize))
16928+ return RETERR(-EINVAL);
16929+ goto read_super_block;
16930+ }
16931+
16932+ sbinfo->df_plug =
16933+ disk_format_plugin_by_id(
16934+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16935+ if (sbinfo->df_plug == NULL) {
16936+ if (!silent)
16937+ warning("nikita-26091",
16938+ "%s: unknown disk format plugin %d\n",
16939+ super->s_id,
16940+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16941+ brelse(super_bh);
16942+ return RETERR(-EINVAL);
16943+ }
16944+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
16945+ brelse(super_bh);
16946+ return 0;
16947+ }
16948+
16949+ /* there is no reiser4 on the device */
16950+ if (!silent)
16951+ warning("nikita-2608",
16952+ "%s: wrong master super block magic", super->s_id);
16953+ brelse(super_bh);
16954+ return RETERR(-EINVAL);
16955+}
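For orientation (an illustrative note, not part of the patch): the comment
above implies REISER4_MAGIC_OFFSET is 16 * 4096 == 65536 bytes, so with a
4096-byte s_blocksize sb_bread() is asked for logical block 65536 / 4096 == 16;
if the device is first probed with a smaller s_blocksize such as 1024, the same
byte offset maps to logical block 64, and the function rereads once the on-disk
block size is known:

	sector_t master_block = REISER4_MAGIC_OFFSET / super->s_blocksize;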
16956+
16957+static struct {
16958+ reiser4_plugin_type type;
16959+ reiser4_plugin_id id;
16960+} default_plugins[PSET_LAST] = {
16961+ [PSET_FILE] = {
16962+ .type = REISER4_FILE_PLUGIN_TYPE,
16963+ .id = UNIX_FILE_PLUGIN_ID
16964+ },
16965+ [PSET_DIR] = {
16966+ .type = REISER4_DIR_PLUGIN_TYPE,
16967+ .id = HASHED_DIR_PLUGIN_ID
16968+ },
16969+ [PSET_HASH] = {
16970+ .type = REISER4_HASH_PLUGIN_TYPE,
16971+ .id = R5_HASH_ID
16972+ },
16973+ [PSET_FIBRATION] = {
16974+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
16975+ .id = FIBRATION_DOT_O
16976+ },
16977+ [PSET_PERM] = {
16978+ .type = REISER4_PERM_PLUGIN_TYPE,
16979+ .id = NULL_PERM_ID
16980+ },
16981+ [PSET_FORMATTING] = {
16982+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
16983+ .id = SMALL_FILE_FORMATTING_ID
16984+ },
16985+ [PSET_SD] = {
16986+ .type = REISER4_ITEM_PLUGIN_TYPE,
16987+ .id = STATIC_STAT_DATA_ID
16988+ },
16989+ [PSET_DIR_ITEM] = {
16990+ .type = REISER4_ITEM_PLUGIN_TYPE,
16991+ .id = COMPOUND_DIR_ID
16992+ },
16993+ [PSET_CIPHER] = {
16994+ .type = REISER4_CIPHER_PLUGIN_TYPE,
16995+ .id = NONE_CIPHER_ID
16996+ },
16997+ [PSET_DIGEST] = {
16998+ .type = REISER4_DIGEST_PLUGIN_TYPE,
16999+ .id = SHA256_32_DIGEST_ID
17000+ },
17001+ [PSET_COMPRESSION] = {
17002+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17003+ .id = LZO1_COMPRESSION_ID
17004+ },
17005+ [PSET_COMPRESSION_MODE] = {
17006+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17007+ .id = CONVX_COMPRESSION_MODE_ID
17008+ },
17009+ [PSET_CLUSTER] = {
17010+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
17011+ .id = CLUSTER_64K_ID
17012+ },
17013+ [PSET_CREATE] = {
17014+ .type = REISER4_FILE_PLUGIN_TYPE,
17015+ .id = UNIX_FILE_PLUGIN_ID
17016+ }
17017+};
17018+
17019+/* access to default plugin table */
17020+reiser4_plugin *get_default_plugin(pset_member memb)
17021+{
17022+ return plugin_by_id(default_plugins[memb].type,
17023+ default_plugins[memb].id);
17024+}
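Usage is a plain table lookup; for example (illustrative only), the default
hash plugin registered in the table above would be obtained with:

	reiser4_plugin *hash = get_default_plugin(PSET_HASH);
	/* resolves to plugin_by_id(REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID) */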
17025+
17026+/**
17027+ * reiser4_init_root_inode - obtain inode of root directory
17028+ * @super: super block of filesystem
17029+ *
17030+ * Obtains inode of root directory (reading it from disk), initializes its
17031+ * plugin set if it was not initialized yet.
17032+ */
17033+int reiser4_init_root_inode(struct super_block *super)
17034+{
17035+ reiser4_super_info_data *sbinfo = get_super_private(super);
17036+ struct inode *inode;
17037+ int result = 0;
17038+
17039+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17040+ if (IS_ERR(inode))
17041+ return RETERR(PTR_ERR(inode));
17042+
17043+ super->s_root = d_alloc_root(inode);
17044+ if (!super->s_root) {
17045+ iput(inode);
17046+ return RETERR(-ENOMEM);
17047+ }
17048+
17049+ super->s_root->d_op = &sbinfo->ops.dentry;
17050+
17051+ if (!is_inode_loaded(inode)) {
17052+ pset_member memb;
17053+ plugin_set *pset;
17054+
17055+ pset = reiser4_inode_data(inode)->pset;
17056+ for (memb = 0; memb < PSET_LAST; ++memb) {
17057+
17058+ if (aset_get(pset, memb) != NULL)
17059+ continue;
17060+
17061+ result = grab_plugin_pset(inode, NULL, memb);
17062+ if (result != 0)
17063+ break;
17064+
17065+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17066+ }
17067+
17068+ if (result == 0) {
17069+ if (REISER4_DEBUG) {
17070+ for (memb = 0; memb < PSET_LAST; ++memb)
17071+ assert("nikita-3500",
17072+ aset_get(pset, memb) != NULL);
17073+ }
17074+ } else
17075+ warning("nikita-3448", "Cannot set plugins of root: %i",
17076+ result);
17077+ reiser4_iget_complete(inode);
17078+
17079+ /* As the default pset kept in the root dir may have been changed
17080+ (length is unknown), call update_sd. */
17081+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17082+ result = reiser4_grab_space(
17083+ inode_file_plugin(inode)->estimate.update(inode),
17084+ BA_CAN_COMMIT);
17085+
17086+ if (result == 0)
17087+ result = reiser4_update_sd(inode);
17088+
17089+ all_grabbed2free();
17090+ }
17091+ }
17092+
17093+ super->s_maxbytes = MAX_LFS_FILESIZE;
17094+ return result;
17095+}
17096+
17097+/*
17098+ * Local variables:
17099+ * c-indentation-style: "K&R"
17100+ * mode-name: "LC"
17101+ * c-basic-offset: 8
17102+ * tab-width: 8
17103+ * fill-column: 79
17104+ * End:
17105+ */
17106diff -urN linux-2.6.22.orig/fs/reiser4/inode.c linux-2.6.22/fs/reiser4/inode.c
17107--- linux-2.6.22.orig/fs/reiser4/inode.c 1970-01-01 03:00:00.000000000 +0300
17108+++ linux-2.6.22/fs/reiser4/inode.c 2007-07-29 00:25:34.872695441 +0400
17109@@ -0,0 +1,709 @@
17110+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17111+
17112+/* Inode specific operations. */
17113+
17114+#include "forward.h"
17115+#include "debug.h"
17116+#include "key.h"
17117+#include "kassign.h"
17118+#include "coord.h"
17119+#include "seal.h"
17120+#include "dscale.h"
17121+#include "plugin/item/item.h"
17122+#include "plugin/security/perm.h"
17123+#include "plugin/plugin.h"
17124+#include "plugin/object.h"
17125+#include "znode.h"
17126+#include "vfs_ops.h"
17127+#include "inode.h"
17128+#include "super.h"
17129+#include "reiser4.h"
17130+
17131+#include <linux/fs.h> /* for struct super_block, address_space */
17132+
17133+/* return reiser4 internal tree which inode belongs to */
17134+/* Audited by: green(2002.06.17) */
17135+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17136+{
17137+ assert("nikita-256", inode != NULL);
17138+ assert("nikita-257", inode->i_sb != NULL);
17139+ return reiser4_get_tree(inode->i_sb);
17140+}
17141+
17142+/* return reiser4-specific inode flags */
17143+static inline unsigned long *inode_flags(const struct inode *const inode)
17144+{
17145+ assert("nikita-2842", inode != NULL);
17146+ return &reiser4_inode_data(inode)->flags;
17147+}
17148+
17149+/* set reiser4-specific flag @f in @inode */
17150+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17151+{
17152+ assert("nikita-2248", inode != NULL);
17153+ set_bit((int)f, inode_flags(inode));
17154+}
17155+
17156+/* clear reiser4-specific flag @f in @inode */
17157+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17158+{
17159+ assert("nikita-2250", inode != NULL);
17160+ clear_bit((int)f, inode_flags(inode));
17161+}
17162+
17163+/* true if reiser4-specific flag @f is set in @inode */
17164+int reiser4_inode_get_flag(const struct inode *inode,
17165+ reiser4_file_plugin_flags f)
17166+{
17167+ assert("nikita-2251", inode != NULL);
17168+ return test_bit((int)f, inode_flags(inode));
17169+}
17170+
17171+/* convert oid to inode number */
17172+ino_t oid_to_ino(oid_t oid)
17173+{
17174+ return (ino_t) oid;
17175+}
17176+
17177+/* convert oid to user visible inode number */
17178+ino_t oid_to_uino(oid_t oid)
17179+{
17180+ /* reiser4 object is uniquely identified by oid which is 64 bit
17181+ quantity. Kernel in-memory inode is indexed (in the hash table) by
17182+ 32 bit i_ino field, but this is not a problem, because there is a
17183+ way to further distinguish inodes with identical inode numbers
17184+ (find_actor supplied to iget()).
17185+
17186+ But user space expects a unique 32 bit inode number. Obviously this
17187+ is impossible. The work-around is to hash the oid into a user-visible
17188+ inode number.
17189+ */
17190+ oid_t max_ino = (ino_t) ~ 0;
17191+
17192+ if (REISER4_INO_IS_OID || (oid <= max_ino))
17193+ return oid;
17194+ else
17195+ /* this is remotely similar to algorithm used to find next pid
17196+ to use for process: after wrap-around start from some
17197+ offset rather than from 0. Idea is that there are some long
17198+ living objects with which we don't want to collide.
17199+ */
17200+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17201+}
17202+
17203+/* check that "inode" is on reiser4 file-system */
17204+int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17205+{
17206+ return inode != NULL && is_reiser4_super(inode->i_sb);
17207+}
17208+
17209+/* Maximal length of a name that can be stored in directory @inode.
17210+
17211+ This is used in check during file creation and lookup. */
17212+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17213+{
17214+ assert("nikita-287", is_reiser4_inode(inode));
17215+ assert("nikita-1710", inode_dir_item_plugin(inode));
17216+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17217+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17218+ else
17219+ return 255;
17220+}
17221+
17222+#if REISER4_USE_COLLISION_LIMIT
17223+/* Maximal number of hash collisions for this directory. */
17224+int max_hash_collisions(const struct inode *dir /* inode queried */ )
17225+{
17226+ assert("nikita-1711", dir != NULL);
17227+ return reiser4_inode_data(dir)->plugin.max_collisions;
17228+}
17229+#endif /* REISER4_USE_COLLISION_LIMIT */
17230+
17231+/* Install file, inode, and address_space operation on @inode, depending on
17232+ its mode. */
17233+int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17234+ reiser4_object_create_data * data /* parameters to create
17235+ * object */ )
17236+{
17237+ reiser4_super_info_data *sinfo;
17238+ file_plugin *fplug;
17239+ dir_plugin *dplug;
17240+
17241+ fplug = inode_file_plugin(inode);
17242+ dplug = inode_dir_plugin(inode);
17243+
17244+ sinfo = get_super_private(inode->i_sb);
17245+
17246+ switch (inode->i_mode & S_IFMT) {
17247+ case S_IFSOCK:
17248+ case S_IFBLK:
17249+ case S_IFCHR:
17250+ case S_IFIFO:
17251+ {
17252+ dev_t rdev; /* to keep gcc happy */
17253+
17254+ assert("vs-46", fplug != NULL);
17255+ /* ugly hack with rdev */
17256+ if (data == NULL) {
17257+ rdev = inode->i_rdev;
17258+ inode->i_rdev = 0;
17259+ } else
17260+ rdev = data->rdev;
17261+ inode->i_blocks = 0;
17262+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17263+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17264+ /* initialize inode->i_fop and inode->i_rdev for block and char
17265+ devices */
17266+ init_special_inode(inode, inode->i_mode, rdev);
17267+ /* all address space operations are null */
17268+ inode->i_mapping->a_ops =
17269+ &file_plugins[fplug->h.id].as_ops;
17270+ break;
17271+ }
17272+ case S_IFLNK:
17273+ assert("vs-46", fplug != NULL);
17274+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17275+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17276+ inode->i_fop = NULL;
17277+ /* all address space operations are null */
17278+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17279+ break;
17280+ case S_IFDIR:
17281+ assert("vs-46", dplug != NULL);
17282+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17283+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17284+ inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17285+ inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17286+ inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17287+ break;
17288+ case S_IFREG:
17289+ assert("vs-46", fplug != NULL);
17290+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17291+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17292+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17293+ inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17294+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17295+ break;
17296+ default:
17297+ warning("nikita-291", "wrong file mode: %o for %llu",
17298+ inode->i_mode,
17299+ (unsigned long long)get_inode_oid(inode));
17300+ reiser4_make_bad_inode(inode);
17301+ return RETERR(-EINVAL);
17302+ }
17303+ return 0;
17304+}
17305+
17306+/* Initialize inode from disk data. Called with inode locked.
17307+ Return inode locked. */
17308+static int init_inode(struct inode *inode /* inode to initialise */ ,
17309+ coord_t * coord /* coord of stat data */ )
17310+{
17311+ int result;
17312+ item_plugin *iplug;
17313+ void *body;
17314+ int length;
17315+ reiser4_inode *state;
17316+
17317+ assert("nikita-292", coord != NULL);
17318+ assert("nikita-293", inode != NULL);
17319+
17320+ coord_clear_iplug(coord);
17321+ result = zload(coord->node);
17322+ if (result)
17323+ return result;
17324+ iplug = item_plugin_by_coord(coord);
17325+ body = item_body_by_coord(coord);
17326+ length = item_length_by_coord(coord);
17327+
17328+ assert("nikita-295", iplug != NULL);
17329+ assert("nikita-296", body != NULL);
17330+ assert("nikita-297", length > 0);
17331+
17332+ /* inode is under I_LOCK now */
17333+
17334+ state = reiser4_inode_data(inode);
17335+ /* call stat-data plugin method to load sd content into inode */
17336+ result = iplug->s.sd.init_inode(inode, body, length);
17337+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17338+ if (result == 0) {
17339+ result = setup_inode_ops(inode, NULL);
17340+ if (result == 0 && inode->i_sb->s_root &&
17341+ inode->i_sb->s_root->d_inode)
17342+ result = finish_pset(inode);
17343+ }
17344+ zrelse(coord->node);
17345+ return result;
17346+}
17347+
17348+/* read `inode' from the disk. This is what was previously in
17349+ reiserfs_read_inode2().
17350+
17351+ Must be called with inode locked. Return inode still locked.
17352+*/
17353+static int read_inode(struct inode *inode /* inode to read from disk */ ,
17354+ const reiser4_key * key /* key of stat data */ ,
17355+ int silent)
17356+{
17357+ int result;
17358+ lock_handle lh;
17359+ reiser4_inode *info;
17360+ coord_t coord;
17361+
17362+ assert("nikita-298", inode != NULL);
17363+ assert("nikita-1945", !is_inode_loaded(inode));
17364+
17365+ info = reiser4_inode_data(inode);
17366+ assert("nikita-300", info->locality_id != 0);
17367+
17368+ coord_init_zero(&coord);
17369+ init_lh(&lh);
17370+ /* locate stat-data in a tree and return znode locked */
17371+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17372+ assert("nikita-301", !is_inode_loaded(inode));
17373+ if (result == 0) {
17374+ /* use stat-data plugin to load sd into inode. */
17375+ result = init_inode(inode, &coord);
17376+ if (result == 0) {
17377+ /* initialize stat-data seal */
17378+ spin_lock_inode(inode);
17379+ reiser4_seal_init(&info->sd_seal, &coord, key);
17380+ info->sd_coord = coord;
17381+ spin_unlock_inode(inode);
17382+
17383+ /* call file plugin's method to initialize plugin
17384+ * specific part of inode */
17385+ if (inode_file_plugin(inode)->init_inode_data)
17386+ inode_file_plugin(inode)->init_inode_data(inode,
17387+ NULL,
17388+ 0);
17389+ /* load detached directory cursors for stateless
17390+ * directory readers (NFS). */
17391+ reiser4_load_cursors(inode);
17392+
17393+ /* Check the opened inode for consistency. */
17394+ result =
17395+ get_super_private(inode->i_sb)->df_plug->
17396+ check_open(inode);
17397+ }
17398+ }
17399+ /* lookup_sd() doesn't release coord because we want the znode
17400+ to stay read-locked while stat-data fields are accessed in
17401+ init_inode() */
17402+ done_lh(&lh);
17403+
17404+ if (result != 0)
17405+ reiser4_make_bad_inode(inode);
17406+ return result;
17407+}
17408+
17409+/* initialise new reiser4 inode being inserted into hash table. */
17410+static int init_locked_inode(struct inode *inode /* new inode */ ,
17411+ void *opaque /* key of stat data passed to the
17412+ * iget5_locked as cookie */ )
17413+{
17414+ reiser4_key *key;
17415+
17416+ assert("nikita-1995", inode != NULL);
17417+ assert("nikita-1996", opaque != NULL);
17418+ key = opaque;
17419+ set_inode_oid(inode, get_key_objectid(key));
17420+ reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17421+ return 0;
17422+}
17423+
17424+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17425+
17426+ This function is called by iget5_locked() to distinguish reiser4 inodes
17427+ having the same inode numbers. Such inodes can only exist due to some error
17428+ condition. One of them should be bad. Inodes with identical inode numbers
17429+ (objectids) are distinguished by their packing locality.
17430+
17431+*/
17432+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17433+ * check */ ,
17434+ void *opaque /* "cookie" passed to
17435+ * iget5_locked(). This is stat data
17436+ * key */ )
17437+{
17438+ reiser4_key *key;
17439+
17440+ key = opaque;
17441+ return
17442+ /* oid is unique, so first term is enough, actually. */
17443+ get_inode_oid(inode) == get_key_objectid(key) &&
17444+ /*
17445+ * also, locality should be checked, but locality is stored in
17446+ * the reiser4-specific part of the inode, and actor can be
17447+ * called against arbitrary inode that happened to be in this
17448+ * hash chain. Hence we first have to check that this is
17449+ * reiser4 inode at least. is_reiser4_inode() is probably too
17450+ * early to call, as inode may have ->i_op not yet
17451+ * initialised.
17452+ */
17453+ is_reiser4_super(inode->i_sb) &&
17454+ /*
17455+ * usually objectid is unique, but pseudo files use counter to
17456+ * generate objectid. All pseudo files are placed into special
17457+ * (otherwise unused) locality.
17458+ */
17459+ reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17460+}
17461+
17462+/* hook for kmem_cache_create */
17463+void loading_init_once(reiser4_inode * info)
17464+{
17465+ mutex_init(&info->loading);
17466+}
17467+
17468+/* for reiser4_alloc_inode */
17469+void loading_alloc(reiser4_inode * info)
17470+{
17471+ assert("vs-1717", !mutex_is_locked(&info->loading));
17472+}
17473+
17474+/* for reiser4_destroy */
17475+void loading_destroy(reiser4_inode * info)
17476+{
17477+ assert("vs-1717a", !mutex_is_locked(&info->loading));
17478+}
17479+
17480+static void loading_begin(reiser4_inode * info)
17481+{
17482+ mutex_lock(&info->loading);
17483+}
17484+
17485+static void loading_end(reiser4_inode * info)
17486+{
17487+ mutex_unlock(&info->loading);
17488+}
17489+
17490+/**
17491+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17492+ * @super: super block of filesystem
17493+ * @key: key of inode's stat-data
17494+ * @silent:
17495+ *
17496+ * This is our helper function a la iget(). It is called by
17497+ * lookup_common() and reiser4_read_super(). Return inode locked or error
17498+ * encountered.
17499+ */
17500+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17501+ int silent)
17502+{
17503+ struct inode *inode;
17504+ int result;
17505+ reiser4_inode *info;
17506+
17507+ assert("nikita-302", super != NULL);
17508+ assert("nikita-303", key != NULL);
17509+
17510+ result = 0;
17511+
17512+ /* call iget(). Our ->read_inode() is dummy, so this will either
17513+ find inode in cache or return uninitialised inode */
17514+ inode = iget5_locked(super,
17515+ (unsigned long)get_key_objectid(key),
17516+ reiser4_inode_find_actor,
17517+ init_locked_inode, (reiser4_key *) key);
17518+ if (inode == NULL)
17519+ return ERR_PTR(RETERR(-ENOMEM));
17520+ if (is_bad_inode(inode)) {
17521+ warning("nikita-304", "Bad inode found");
17522+ reiser4_print_key("key", key);
17523+ iput(inode);
17524+ return ERR_PTR(RETERR(-EIO));
17525+ }
17526+
17527+ info = reiser4_inode_data(inode);
17528+
17529+ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
17530+ loaded and initialized inode from just allocated inode. If
17531+ REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17532+ info->loading. The place in reiser4 which uses a not yet initialized inode
17533+ is the reiser4 repacker, see repacker-related functions in
17534+ plugin/item/extent.c */
17535+ if (!is_inode_loaded(inode)) {
17536+ loading_begin(info);
17537+ if (!is_inode_loaded(inode)) {
17538+ /* locking: iget5_locked returns locked inode */
17539+ assert("nikita-1941", !is_inode_loaded(inode));
17540+ assert("nikita-1949",
17541+ reiser4_inode_find_actor(inode,
17542+ (reiser4_key *) key));
17543+ /* now, inode has objectid as ->i_ino and locality in
17544+ reiser4-specific part. This is enough for
17545+ read_inode() to read stat data from the disk */
17546+ result = read_inode(inode, key, silent);
17547+ } else
17548+ loading_end(info);
17549+ }
17550+
17551+ if (inode->i_state & I_NEW)
17552+ unlock_new_inode(inode);
17553+
17554+ if (is_bad_inode(inode)) {
17555+ assert("vs-1717", result != 0);
17556+ loading_end(info);
17557+ iput(inode);
17558+ inode = ERR_PTR(result);
17559+ } else if (REISER4_DEBUG) {
17560+ reiser4_key found_key;
17561+
17562+ assert("vs-1717", result == 0);
17563+ build_sd_key(inode, &found_key);
17564+ if (!keyeq(&found_key, key)) {
17565+ warning("nikita-305", "Wrong key in sd");
17566+ reiser4_print_key("sought for", key);
17567+ reiser4_print_key("found", &found_key);
17568+ }
17569+ if (inode->i_nlink == 0) {
17570+ warning("nikita-3559", "Unlinked inode found: %llu\n",
17571+ (unsigned long long)get_inode_oid(inode));
17572+ }
17573+ }
17574+ return inode;
17575+}
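The caller-side contract implied above (an illustrative sketch, not part of
the patch): when reiser4_iget() returns a not-yet-loaded inode, the loading
mutex taken in loading_begin() is still held, and the caller must finish
initialization and then call reiser4_iget_complete(), which sets
REISER4_LOADED and releases the mutex -- this is exactly what
reiser4_init_root_inode() in init_super.c does:

	inode = reiser4_iget(super, &key, 0);
	if (!IS_ERR(inode)) {
		/* ... plugin-specific initialization ... */
		reiser4_iget_complete(inode);
	}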
17576+
17577+/* reiser4_iget() may return a not fully initialized inode; this function
17578+ * should be called once reiser4 inode initialization is complete. */
17579+void reiser4_iget_complete(struct inode *inode)
17580+{
17581+ assert("zam-988", is_reiser4_inode(inode));
17582+
17583+ if (!is_inode_loaded(inode)) {
17584+ reiser4_inode_set_flag(inode, REISER4_LOADED);
17585+ loading_end(reiser4_inode_data(inode));
17586+ }
17587+}
17588+
17589+void reiser4_make_bad_inode(struct inode *inode)
17590+{
17591+ assert("nikita-1934", inode != NULL);
17592+
17593+ /* clear LOADED bit */
17594+ reiser4_inode_clr_flag(inode, REISER4_LOADED);
17595+ make_bad_inode(inode);
17596+ return;
17597+}
17598+
17599+file_plugin *inode_file_plugin(const struct inode * inode)
17600+{
17601+ assert("nikita-1997", inode != NULL);
17602+ return reiser4_inode_data(inode)->pset->file;
17603+}
17604+
17605+dir_plugin *inode_dir_plugin(const struct inode * inode)
17606+{
17607+ assert("nikita-1998", inode != NULL);
17608+ return reiser4_inode_data(inode)->pset->dir;
17609+}
17610+
17611+formatting_plugin *inode_formatting_plugin(const struct inode * inode)
17612+{
17613+ assert("nikita-2000", inode != NULL);
17614+ return reiser4_inode_data(inode)->pset->formatting;
17615+}
17616+
17617+hash_plugin *inode_hash_plugin(const struct inode * inode)
17618+{
17619+ assert("nikita-2001", inode != NULL);
17620+ return reiser4_inode_data(inode)->pset->hash;
17621+}
17622+
17623+fibration_plugin *inode_fibration_plugin(const struct inode * inode)
17624+{
17625+ assert("nikita-2001", inode != NULL);
17626+ return reiser4_inode_data(inode)->pset->fibration;
17627+}
17628+
17629+cipher_plugin *inode_cipher_plugin(const struct inode * inode)
17630+{
17631+ assert("edward-36", inode != NULL);
17632+ return reiser4_inode_data(inode)->pset->cipher;
17633+}
17634+
17635+compression_plugin *inode_compression_plugin(const struct inode * inode)
17636+{
17637+ assert("edward-37", inode != NULL);
17638+ return reiser4_inode_data(inode)->pset->compression;
17639+}
17640+
17641+compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17642+ inode)
17643+{
17644+ assert("edward-1330", inode != NULL);
17645+ return reiser4_inode_data(inode)->pset->compression_mode;
17646+}
17647+
17648+cluster_plugin *inode_cluster_plugin(const struct inode * inode)
17649+{
17650+ assert("edward-1328", inode != NULL);
17651+ return reiser4_inode_data(inode)->pset->cluster;
17652+}
17653+
17654+file_plugin *inode_create_plugin(const struct inode * inode)
17655+{
17656+ assert("edward-1329", inode != NULL);
17657+ return reiser4_inode_data(inode)->pset->create;
17658+}
17659+
17660+digest_plugin *inode_digest_plugin(const struct inode * inode)
17661+{
17662+ assert("edward-86", inode != NULL);
17663+ return reiser4_inode_data(inode)->pset->digest;
17664+}
17665+
17666+item_plugin *inode_sd_plugin(const struct inode * inode)
17667+{
17668+ assert("vs-534", inode != NULL);
17669+ return reiser4_inode_data(inode)->pset->sd;
17670+}
17671+
17672+item_plugin *inode_dir_item_plugin(const struct inode * inode)
17673+{
17674+ assert("vs-534", inode != NULL);
17675+ return reiser4_inode_data(inode)->pset->dir_item;
17676+}
17677+
17678+file_plugin *child_create_plugin(const struct inode * inode)
17679+{
17680+ assert("edward-1329", inode != NULL);
17681+ return reiser4_inode_data(inode)->hset->create;
17682+}
17683+
17684+void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17685+{
17686+ reiser4_inode *state;
17687+
17688+ assert("nikita-2716", inode != NULL);
17689+ assert("nikita-2717", ext < LAST_SD_EXTENSION);
17690+ assert("nikita-3491", spin_inode_is_locked(inode));
17691+
17692+ state = reiser4_inode_data(inode);
17693+ state->extmask |= 1 << ext;
17694+ /* force re-calculation of stat-data length on next call to
17695+ update_sd(). */
17696+	reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17697+}
17698+
17699+void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17700+{
17701+ reiser4_inode *state;
17702+
17703+ assert("vpf-1926", inode != NULL);
17704+ assert("vpf-1927", ext < LAST_SD_EXTENSION);
17705+ assert("vpf-1928", spin_inode_is_locked(inode));
17706+
17707+ state = reiser4_inode_data(inode);
17708+ state->extmask &= ~(1 << ext);
17709+ /* force re-calculation of stat-data length on next call to
17710+ update_sd(). */
17711+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17712+}
17713+
17714+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17715+{
17716+ assert("edward-1287", inode != NULL);
17717+ if (!dscale_fit(old, new))
17718+		reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17719+ return;
17720+}
17721+
17722+void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17723+{
17724+ assert("nikita-2875", inode != NULL);
17725+ spin_lock_inode(inode);
17726+ inode_check_scale_nolock(inode, old, new);
17727+ spin_unlock_inode(inode);
17728+}
17729+
17730+/*
17731+ * initialize ->ordering field of inode. This field defines how file stat-data
17732+ * and body are ordered within a tree with respect to other objects within the
17733+ * same parent directory.
17734+ */
17735+void
17736+init_inode_ordering(struct inode *inode,
17737+ reiser4_object_create_data * crd, int create)
17738+{
17739+ reiser4_key key;
17740+
17741+ if (create) {
17742+ struct inode *parent;
17743+
17744+ parent = crd->parent;
17745+ assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17746+ inode_dir_plugin(parent)->build_entry_key(parent,
17747+ &crd->dentry->d_name,
17748+ &key);
17749+ } else {
17750+ coord_t *coord;
17751+
17752+ coord = &reiser4_inode_data(inode)->sd_coord;
17753+ coord_clear_iplug(coord);
17754+ /* safe to use ->sd_coord, because node is under long term
17755+ * lock */
17756+ WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17757+ }
17758+
17759+ set_inode_ordering(inode, get_key_ordering(&key));
17760+}
17761+
17762+znode *inode_get_vroot(struct inode *inode)
17763+{
17764+ reiser4_block_nr blk;
17765+ znode *result;
17766+
17767+ spin_lock_inode(inode);
17768+ blk = reiser4_inode_data(inode)->vroot;
17769+ spin_unlock_inode(inode);
17770+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17771+		result = zlook(reiser4_tree_by_inode(inode), &blk);
17772+ else
17773+ result = NULL;
17774+ return result;
17775+}
17776+
17777+void inode_set_vroot(struct inode *inode, znode *vroot)
17778+{
17779+ spin_lock_inode(inode);
17780+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17781+ spin_unlock_inode(inode);
17782+}
17783+
17784+#if REISER4_DEBUG
17785+
17786+void reiser4_inode_invariant(const struct inode *inode)
17787+{
17788+ assert("nikita-3077", spin_inode_is_locked(inode));
17789+}
17790+
17791+int inode_has_no_jnodes(reiser4_inode * r4_inode)
17792+{
17793+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17794+ r4_inode->nr_jnodes == 0;
17795+}
17796+
17797+#endif
17798+
17799+/* returns 0 if directory is empty (only contains dot and dotdot) and -ENOTEMPTY otherwise */
17800+/* FIXME: shouldn't it be dir plugin method? */
17801+int is_dir_empty(const struct inode *dir)
17802+{
17803+ assert("nikita-1976", dir != NULL);
17804+
17805+ /* rely on our method to maintain directory i_size being equal to the
17806+ number of entries. */
17807+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17808+}
17809+
17810+/* Make Linus happy.
17811+ Local variables:
17812+ c-indentation-style: "K&R"
17813+ mode-name: "LC"
17814+ c-basic-offset: 8
17815+ tab-width: 8
17816+ fill-column: 120
17817+ End:
17818+*/
17819diff -urN linux-2.6.22.orig/fs/reiser4/inode.h linux-2.6.22/fs/reiser4/inode.h
17820--- linux-2.6.22.orig/fs/reiser4/inode.h 1970-01-01 03:00:00.000000000 +0300
17821+++ linux-2.6.22/fs/reiser4/inode.h 2007-07-29 00:25:34.872695441 +0400
17822@@ -0,0 +1,449 @@
17823+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17824+
17825+/* Inode functions. */
17826+
17827+#if !defined( __REISER4_INODE_H__ )
17828+#define __REISER4_INODE_H__
17829+
17830+#include "forward.h"
17831+#include "debug.h"
17832+#include "key.h"
17833+#include "seal.h"
17834+#include "plugin/plugin.h"
17835+#include "plugin/file/cryptcompress.h"
17836+#include "plugin/file/file.h"
17837+#include "plugin/dir/dir.h"
17838+#include "plugin/plugin_set.h"
17839+#include "plugin/security/perm.h"
17840+#include "vfs_ops.h"
17841+#include "jnode.h"
17842+#include "fsdata.h"
17843+
17844+#include <linux/types.h> /* for __u?? , ino_t */
17845+#include <linux/fs.h> /* for struct super_block, struct
17846+ * rw_semaphore, etc */
17847+#include <linux/spinlock.h>
17848+#include <asm/types.h>
17849+
17850+/* reiser4-specific inode flags. They are "transient" and are not
17851+ supposed to be stored on disk. Used to trace "state" of
17852+ inode
17853+*/
17854+typedef enum {
17855+ /* this is light-weight inode, inheriting some state from its
17856+ parent */
17857+ REISER4_LIGHT_WEIGHT = 0,
17858+ /* stat data wasn't yet created */
17859+ REISER4_NO_SD = 1,
17860+ /* internal immutable flag. Currently is only used
17861+ to avoid race condition during file creation.
17862+ See comment in create_object(). */
17863+ REISER4_IMMUTABLE = 2,
17864+ /* inode was read from storage */
17865+ REISER4_LOADED = 3,
17866+	/* this bit is set for symlinks. inode->i_private points to target
17867+ name of symlink. */
17868+ REISER4_GENERIC_PTR_USED = 4,
17869+ /* set if size of stat-data item for this inode is known. If this is
17870+ * set we can avoid recalculating size of stat-data on each update. */
17871+ REISER4_SDLEN_KNOWN = 5,
17872+ /* reiser4_inode->crypt points to the crypto stat */
17873+ REISER4_CRYPTO_STAT_LOADED = 6,
17874+ /* cryptcompress_inode_data points to the secret key */
17875+ REISER4_SECRET_KEY_INSTALLED = 7,
17876+ /* File (possibly) has pages corresponding to the tail items, that
17877+ * were created by ->readpage. It is set by mmap_unix_file() and
17878+ * sendfile_unix_file(). This bit is inspected by write_unix_file and
17879+ * kill-hook of tail items. It is never cleared once set. This bit is
17880+ * modified and inspected under i_mutex. */
17881+ REISER4_HAS_MMAP = 8,
17882+	REISER4_PART_MIXED = 9,
17883+ REISER4_PART_IN_CONV = 10,
17884+ /* This flag indicates that file plugin conversion is in progress */
17885+ REISER4_FILE_CONV_IN_PROGRESS = 11
17886+} reiser4_file_plugin_flags;
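
These transient flags are tested and updated through the reiser4_inode_set_flag() / reiser4_inode_clr_flag() / reiser4_inode_get_flag() wrappers declared later in this header. A minimal sketch of the common pattern (illustrative only, mirroring the REISER4_LOADED handling in inode.c above):

	if (!reiser4_inode_get_flag(inode, REISER4_LOADED)) {
		/* ... bring the inode's stat-data in ... */
		reiser4_inode_set_flag(inode, REISER4_LOADED);
	}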
17887+
17888+/* state associated with each inode.
17889+ reiser4 inode.
17890+
17891+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17892+ be of the same size. File-system allocates inodes by itself through
17893+ s_op->allocate_inode() method. So, it is possible to adjust size of inode
17894+ at the time of its creation.
17895+
17896+ Invariants involving parts of this data-type:
17897+
17898+ [inode->eflushed]
17899+
17900+*/
17901+
17902+typedef struct reiser4_inode reiser4_inode;
17903+/* return pointer to reiser4-specific part of inode */
17904+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17905+ /* inode queried */ );
17906+
17907+#if BITS_PER_LONG == 64
17908+
17909+#define REISER4_INO_IS_OID (1)
17910+typedef struct {;
17911+} oid_hi_t;
17912+
17913+/* BITS_PER_LONG == 64 */
17914+#else
17915+
17916+#define REISER4_INO_IS_OID (0)
17917+typedef __u32 oid_hi_t;
17918+
17919+/* BITS_PER_LONG == 64 */
17920+#endif
17921+
17922+struct reiser4_inode {
17923+ /* spin lock protecting fields of this structure. */
17924+ spinlock_t guard;
17925+ /* main plugin set that control the file
17926+ (see comments in plugin/plugin_set.c) */
17927+	plugin_set *pset;
17928+ /* plugin set for inheritance
17929+ (see comments in plugin/plugin_set.c) */
17930+ plugin_set *hset;
17931+ /* high 32 bits of object id */
17932+ oid_hi_t oid_hi;
17933+ /* seal for stat-data */
17934+ seal_t sd_seal;
17935+ /* locality id for this file */
17936+ oid_t locality_id;
17937+#if REISER4_LARGE_KEY
17938+ __u64 ordering;
17939+#endif
17940+ /* coord of stat-data in sealed node */
17941+ coord_t sd_coord;
17942+	/* bit-mask of stat-data extensions used by this file */
17943+ __u64 extmask;
17944+ /* bitmask of non-default plugins for this inode */
17945+ __u16 plugin_mask;
17946+ /* bitmask of set heir plugins for this inode. */
17947+ __u16 heir_mask;
17948+ union {
17949+ struct list_head readdir_list;
17950+ struct list_head not_used;
17951+ } lists;
17952+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
17953+ unsigned long flags;
17954+ union {
17955+ /* fields specific to unix_file plugin */
17956+ struct unix_file_info unix_file_info;
17957+ /* fields specific to cryptcompress file plugin */
17958+ struct cryptcompress_info cryptcompress_info;
17959+ } file_plugin_data;
17960+
17961+ /* this semaphore is to serialize readers and writers of @pset->file
17962+ * when file plugin conversion is enabled
17963+ */
17964+ struct rw_semaphore conv_sem;
17965+
17966+	/* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
17967+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */
17968+ struct radix_tree_root jnodes_tree;
17969+#if REISER4_DEBUG
17970+ /* number of unformatted node jnodes of this file in jnode hash table */
17971+ unsigned long nr_jnodes;
17972+#endif
17973+
17974+ /* block number of virtual root for this object. See comment above
17975+ * fs/reiser4/search.c:handle_vroot() */
17976+ reiser4_block_nr vroot;
17977+	struct mutex loading;
17978+};
17979+
17980+void loading_init_once(reiser4_inode *);
17981+void loading_alloc(reiser4_inode *);
17982+void loading_destroy(reiser4_inode *);
17983+
17984+struct reiser4_inode_object {
17985+ /* private part */
17986+ reiser4_inode p;
17987+ /* generic fields not specific to reiser4, but used by VFS */
17988+ struct inode vfs_inode;
17989+};
17990+
17991+/* return pointer to the reiser4 specific portion of @inode */
17992+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17993+ /* inode queried */ )
17994+{
17995+ assert("nikita-254", inode != NULL);
17996+	return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
17997+}
17998+
17999+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18000+ r4_inode /* inode queried */
18001+ )
18002+{
18003+	return &container_of(r4_inode, struct reiser4_inode_object, p)->vfs_inode;
18004+}
18005+
18006+/*
18007+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18008+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18009+ * bits.
18010+ *
18011+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18012+ * of inode, otherwise whole oid is stored in i_ino.
18013+ *
18014+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18015+ */
18016+
18017+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18018+
18019+#if REISER4_INO_IS_OID
18020+
18021+static inline oid_t get_inode_oid(const struct inode *inode)
18022+{
18023+ return inode->i_ino;
18024+}
18025+
18026+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18027+{
18028+ inode->i_ino = oid;
18029+}
18030+
18031+/* REISER4_INO_IS_OID */
18032+#else
18033+
18034+static inline oid_t get_inode_oid(const struct inode *inode)
18035+{
18036+ return
18037+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18038+ inode->i_ino;
18039+}
18040+
18041+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18042+{
18043+ assert("nikita-2519", inode != NULL);
18044+ inode->i_ino = (ino_t) (oid);
18045+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18046+ assert("nikita-2521", get_inode_oid(inode) == (oid));
18047+}
18048+
18049+/* REISER4_INO_IS_OID */
18050+#endif
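
A worked example of the split these wrappers hide (illustrative only; assumes a 32-bit ino_t, so OID_HI_SHIFT == 32):

	oid_t oid = 0x0000000500000007ULL;
	/* set_inode_oid(inode, oid) stores:
	 *   inode->i_ino                       == 0x00000007 (low 32 bits)
	 *   reiser4_inode_data(inode)->oid_hi  == 0x00000005 (high 32 bits)
	 * get_inode_oid(inode) recombines them:
	 *   ((__u64)0x5 << 32) | 0x7 == oid */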
18051+
18052+static inline oid_t get_inode_locality(const struct inode *inode)
18053+{
18054+ return reiser4_inode_data(inode)->locality_id;
18055+}
18056+
18057+#if REISER4_LARGE_KEY
18058+static inline __u64 get_inode_ordering(const struct inode *inode)
18059+{
18060+ return reiser4_inode_data(inode)->ordering;
18061+}
18062+
18063+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18064+{
18065+ reiser4_inode_data(inode)->ordering = ordering;
18066+}
18067+
18068+#else
18069+
18070+#define get_inode_ordering(inode) (0)
18071+#define set_inode_ordering(inode, val) noop
18072+
18073+#endif
18074+
18075+/* return inode in which @uf_info is embedded */
18076+static inline struct inode *
18077+unix_file_info_to_inode(const struct unix_file_info * uf_info)
18078+{
18079+	return &container_of(uf_info, struct reiser4_inode_object,
18080+ p.file_plugin_data.unix_file_info)->vfs_inode;
18081+}
18082+
18083+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18084+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18085+
18086+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18087+
18088+#if REISER4_DEBUG
18089+extern void reiser4_inode_invariant(const struct inode *inode);
18090+extern int inode_has_no_jnodes(reiser4_inode *);
18091+#else
18092+#define reiser4_inode_invariant(inode) noop
18093+#endif
18094+
18095+static inline int spin_inode_is_locked(const struct inode *inode)
18096+{
18097+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18098+ return 1;
18099+}
18100+
18101+/**
18102+ * spin_lock_inode - lock reiser4_inode's embedded spinlock
18103+ * @inode: inode to lock
18104+ *
18105+ * In debug mode it checks that lower priority locks are not held and
18106+ * increments reiser4_context's lock counters on which lock ordering checking
18107+ * is based.
18108+ */
18109+static inline void spin_lock_inode(struct inode *inode)
18110+{
18111+ assert("", LOCK_CNT_NIL(spin_locked));
18112+ /* check lock ordering */
18113+ assert_spin_not_locked(&d_lock);
18114+
18115+ spin_lock(&reiser4_inode_data(inode)->guard);
18116+
18117+ LOCK_CNT_INC(spin_locked_inode);
18118+ LOCK_CNT_INC(spin_locked);
18119+
18120+	reiser4_inode_invariant(inode);
18121+}
18122+
18123+/**
18124+ * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18125+ * @inode: inode to unlock
18126+ *
18127+ * In debug mode it checks that spinlock is held and decrements
18128+ * reiser4_context's lock counters on which lock ordering checking is based.
18129+ */
18130+static inline void spin_unlock_inode(struct inode *inode)
18131+{
18132+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18133+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18134+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18135+
18136+	reiser4_inode_invariant(inode);
18137+
18138+ LOCK_CNT_DEC(spin_locked_inode);
18139+ LOCK_CNT_DEC(spin_locked);
18140+
18141+ spin_unlock(&reiser4_inode_data(inode)->guard);
18142+}
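
Helpers that assert spin_inode_is_locked(), such as inode_set_extension() and inode_clr_extension(), are meant to run inside this lock pair; inode_check_scale() in inode.c above is the canonical caller. A minimal sketch:

	spin_lock_inode(inode);
	inode_set_extension(inode, ext);	/* asserts the guard is held */
	spin_unlock_inode(inode);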
18143+
18144+extern znode *inode_get_vroot(struct inode *inode);
18145+extern void inode_set_vroot(struct inode *inode, znode * vroot);
18146+
18147+extern int reiser4_max_filename_len(const struct inode *inode);
18148+extern int max_hash_collisions(const struct inode *dir);
18149+extern void reiser4_unlock_inode(struct inode *inode);
18150+extern int is_reiser4_inode(const struct inode *inode);
18151+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18152+extern struct inode *reiser4_iget(struct super_block *super,
18153+ const reiser4_key * key, int silent);
18154+extern void reiser4_iget_complete(struct inode *inode);
18155+extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18156+extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18157+extern int reiser4_inode_get_flag(const struct inode *inode,
18158+ reiser4_file_plugin_flags f);
18159+
18160+/* has inode been initialized? */
18161+static inline int
18162+is_inode_loaded(const struct inode *inode /* inode queried */ )
18163+{
18164+ assert("nikita-1120", inode != NULL);
18165+	return reiser4_inode_get_flag(inode, REISER4_LOADED);
18166+}
18167+
18168+extern file_plugin *inode_file_plugin(const struct inode *inode);
18169+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18170+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18171+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18172+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18173+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18174+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18175+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18176+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18177+ *inode);
18178+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18179+extern file_plugin *inode_create_plugin(const struct inode *inode);
18180+extern item_plugin *inode_sd_plugin(const struct inode *inode);
18181+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18182+extern file_plugin *child_create_plugin(const struct inode *inode);
18183+
18184+extern void reiser4_make_bad_inode(struct inode *inode);
18185+
18186+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18187+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18188+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18189+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18190+
18191+#define INODE_SET_SIZE(i, value) \
18192+({ \
18193+ struct inode *__i; \
18194+ typeof(value) __v; \
18195+ \
18196+ __i = (i); \
18197+ __v = (value); \
18198+ inode_check_scale(__i, __i->i_size, __v); \
18199+ i_size_write(__i, __v); \
18200+})
18201+
18202+/*
18203+ * update field @field in inode @i to contain value @value.
18204+ */
18205+#define INODE_SET_FIELD(i, field, value) \
18206+({ \
18207+ struct inode *__i; \
18208+ typeof(value) __v; \
18209+ \
18210+ __i = (i); \
18211+ __v = (value); \
18212+ inode_check_scale(__i, __i->field, __v); \
18213+ __i->field = __v; \
18214+})
18215+
18216+#define INODE_INC_FIELD(i, field) \
18217+({ \
18218+ struct inode *__i; \
18219+ \
18220+ __i = (i); \
18221+ inode_check_scale(__i, __i->field, __i->field + 1); \
18222+ ++ __i->field; \
18223+})
18224+
18225+#define INODE_DEC_FIELD(i, field) \
18226+({ \
18227+ struct inode *__i; \
18228+ \
18229+ __i = (i); \
18230+ inode_check_scale(__i, __i->field, __i->field - 1); \
18231+ -- __i->field; \
18232+})
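
A usage sketch for these macros (illustrative only; new_size is a made-up variable): each one evaluates its arguments once and funnels the change through inode_check_scale(), so a stat-data length recalculation is scheduled whenever the new value needs a wider dscale encoding:

	INODE_SET_SIZE(inode, new_size);	/* checks scale, then i_size_write() */
	INODE_INC_FIELD(inode, i_nlink);	/* checks scale, then ++inode->i_nlink */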
18233+
18234+/* See comment before reiser4_readdir_common() for description. */
18235+static inline struct list_head *get_readdir_list(const struct inode *inode)
18236+{
18237+ return &reiser4_inode_data(inode)->lists.readdir_list;
18238+}
18239+
18240+extern void init_inode_ordering(struct inode *inode,
18241+ reiser4_object_create_data * crd, int create);
18242+
18243+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18244+{
18245+ return &reiser4_inode_data(inode)->jnodes_tree;
18246+}
18247+
18248+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18249+ * r4_inode)
18250+{
18251+ return &r4_inode->jnodes_tree;
18252+}
18253+
18254+#if REISER4_DEBUG
18255+extern void print_inode(const char *prefix, const struct inode *i);
18256+#endif
18257+
18258+int is_dir_empty(const struct inode *);
18259+
18260+/* __REISER4_INODE_H__ */
18261+#endif
18262+
18263+/* Make Linus happy.
18264+ Local variables:
18265+ c-indentation-style: "K&R"
18266+ mode-name: "LC"
18267+ c-basic-offset: 8
18268+ tab-width: 8
18269+ fill-column: 120
18270+ End:
18271+*/
18272diff -urN linux-2.6.22.orig/fs/reiser4/ioctl.h linux-2.6.22/fs/reiser4/ioctl.h
18273--- linux-2.6.22.orig/fs/reiser4/ioctl.h 1970-01-01 03:00:00.000000000 +0300
18274+++ linux-2.6.22/fs/reiser4/ioctl.h 2007-07-29 00:25:34.872695441 +0400
18275@@ -0,0 +1,41 @@
18276+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18277+ * reiser4/README */
18278+
18279+#if !defined( __REISER4_IOCTL_H__ )
18280+#define __REISER4_IOCTL_H__
18281+
18282+#include <linux/fs.h>
18283+
18284+/*
18285+ * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
18286+ * extents and fix it in this state. This is used by applications that rely on
18287+ *
18288+ * . files being block aligned, and
18289+ *
18290+ * . files never migrating on disk
18291+ *
18292+ * for example, boot loaders (LILO) need this.
18293+ *
18294+ * This ioctl should be used as
18295+ *
18296+ * result = ioctl(fd, REISER4_IOC_UNPACK);
18297+ *
18298+ * The file behind the fd descriptor will be converted to extents (if
18299+ * necessary), and its stat-data will be updated so that it will never be
18300+ * converted back into tails again.
18301+ */
18302+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
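
A slightly fuller userspace sketch than the one-liner in the comment above (illustrative only; the function name and error handling are made up):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>

	/* assumes REISER4_IOC_UNPACK as defined above */
	static int unpack_file(const char *path)
	{
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return -1;
		if (ioctl(fd, REISER4_IOC_UNPACK) < 0)
			perror("REISER4_IOC_UNPACK");
		close(fd);
		return 0;
	}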
18303+
18304+/* __REISER4_IOCTL_H__ */
18305+#endif
18306+
18307+/* Make Linus happy.
18308+ Local variables:
18309+ c-indentation-style: "K&R"
18310+ mode-name: "LC"
18311+ c-basic-offset: 8
18312+ tab-width: 8
18313+ fill-column: 120
18314+ scroll-step: 1
18315+ End:
18316+*/
18317diff -urN linux-2.6.22.orig/fs/reiser4/jnode.c linux-2.6.22/fs/reiser4/jnode.c
18318--- linux-2.6.22.orig/fs/reiser4/jnode.c 1970-01-01 03:00:00.000000000 +0300
18319+++ linux-2.6.22/fs/reiser4/jnode.c 2007-07-29 00:25:34.876696477 +0400
18320@@ -0,0 +1,1924 @@
18321+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18322+ * reiser4/README */
18323+/* Jnode manipulation functions. */
18324+/* Jnode is an entity used to track blocks with data and meta-data in reiser4.
18325+
18326+ In particular, jnodes are used to track transactional information
18327+ associated with each block. Each znode contains jnode as ->zjnode field.
18328+
18329+ Jnode stands for either Josh or Journal node.
18330+*/
18331+
18332+/*
18333+ * Taxonomy.
18334+ *
18335+ * Jnode represents block containing data or meta-data. There are jnodes
18336+ * for:
18337+ *
18338+ *     unformatted blocks (jnodes proper). There are plans, however, to
18339+ * have a handle per extent unit rather than per each unformatted
18340+ * block, because there are so many of them.
18341+ *
18342+ * For bitmaps. Each bitmap is actually represented by two jnodes--one
18343+ * for working and another for "commit" data, together forming bnode.
18344+ *
18345+ * For io-heads. These are used by log writer.
18346+ *
18347+ * For formatted nodes (znode). See comment at the top of znode.c for
18348+ * details specific to the formatted nodes (znodes).
18349+ *
18350+ * Node data.
18351+ *
18352+ * Jnode provides access to the data of node it represents. Data are
18353+ * stored in a page. Page is kept in a page cache. This means, that jnodes
18354+ * are highly interconnected with page cache and VM internals.
18355+ *
18356+ * jnode has a pointer to page (->pg) containing its data. Pointer to data
18357+ * themselves is cached in ->data field to avoid frequent calls to
18358+ * page_address().
18359+ *
18360+ * jnode and page are attached to each other by jnode_attach_page(). This
18361+ * function places pointer to jnode in set_page_private(), sets PG_private
18362+ * flag and increments page counter.
18363+ *
18364+ * Opposite operation is performed by page_clear_jnode().
18365+ *
18366+ * jnode->pg is protected by jnode spin lock, and page->private is
18367+ * protected by page lock. See comment at the top of page_cache.c for
18368+ * more.
18369+ *
18370+ * page can be detached from jnode for two reasons:
18371+ *
18372+ *     . jnode is removed from a tree (file is truncated, or formatted
18373+ * node is removed by balancing).
18374+ *
18375+ * . during memory pressure, VM calls ->releasepage() method
18376+ * (reiser4_releasepage()) to evict page from memory.
18377+ *
18378+ * (there, of course, is also umount, but this is special case we are not
18379+ * concerned with here).
18380+ *
18381+ * To protect jnode page from eviction, one calls jload() function that
18382+ * "pins" page in memory (loading it if necessary), increments
18383+ * jnode->d_count, and kmap()s page. Page is unpinned through call to
18384+ * jrelse().
18385+ *
18386+ * Jnode life cycle.
18387+ *
18388+ * jnode is created, placed in hash table, and, optionally, in per-inode
18389+ * radix tree. Page can be attached to jnode, pinned, released, etc.
18390+ *
18391+ * When jnode is captured into atom its reference counter is
18392+ * increased. While being part of an atom, jnode can be "early
18393+ * flushed". This means that as part of flush procedure, jnode is placed
18394+ * into "relocate set", and its page is submitted to the disk. After io
18395+ * completes, page can be detached, then loaded again, re-dirtied, etc.
18396+ *
18397+ *    A thread acquires a reference to a jnode by calling jref() and releases it by
18398+ * jput(). When last reference is removed, jnode is still retained in
18399+ * memory (cached) if it has page attached, _unless_ it is scheduled for
18400+ * destruction (has JNODE_HEARD_BANSHEE bit set).
18401+ *
18402+ * Tree read-write lock was used as "existential" lock for jnodes. That is,
18403+ * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18404+ * that is, tree lock protected unreferenced jnodes stored in the hash
18405+ * table, from recycling.
18406+ *
18407+ * This resulted in high contention on tree lock, because jref()/jput() is
18408+ * frequent operation. To ameliorate this problem, RCU is used: when jput()
18409+ * is just about to release last reference on jnode it sets JNODE_RIP bit
18410+ * on it, and then proceed with jnode destruction (removing jnode from hash
18411+ * table, cbk_cache, detaching page, etc.). All places that change jnode
18412+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18413+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18414+ * jnode_rip_check() function), and pretend that nothing was found in hash
18415+ * table if bit is set.
18416+ *
18417+ * jput defers actual return of jnode into slab cache to some later time
18418+ * (by call_rcu()), this guarantees that other threads can safely continue
18419+ * working with JNODE_RIP-ped jnode.
18420+ *
18421+ */
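
A sketch of the pinning protocol described above (illustrative only; error handling trimmed):

	jnode *node = jlookup(tree, oid, index);	/* takes an x-reference */
	if (node != NULL) {
		if (jload(node) == 0) {		/* d-reference: pins + kmaps the page */
			/* ... access node->data ... */
			jrelse(node);		/* unpin */
		}
		jput(node);			/* drop the x-reference */
	}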
18422+
18423+#include "reiser4.h"
18424+#include "debug.h"
18425+#include "dformat.h"
18426+#include "jnode.h"
18427+#include "plugin/plugin_header.h"
18428+#include "plugin/plugin.h"
18429+#include "txnmgr.h"
18430+/*#include "jnode.h"*/
18431+#include "znode.h"
18432+#include "tree.h"
18433+#include "tree_walk.h"
18434+#include "super.h"
18435+#include "inode.h"
18436+#include "page_cache.h"
18437+
18438+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18439+#include <linux/types.h>
18440+#include <linux/slab.h>
18441+#include <linux/pagemap.h>
18442+#include <linux/swap.h>
18443+#include <linux/fs.h> /* for struct address_space */
18444+#include <linux/writeback.h> /* for inode_lock */
18445+
18446+static struct kmem_cache *_jnode_slab = NULL;
18447+
18448+static void jnode_set_type(jnode * node, jnode_type type);
18449+static int jdelete(jnode * node);
18450+static int jnode_try_drop(jnode * node);
18451+
18452+#if REISER4_DEBUG
18453+static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18454+#endif
18455+
18456+/* true if valid page is attached to jnode */
18457+static inline int jnode_is_parsed(jnode * node)
18458+{
18459+ return JF_ISSET(node, JNODE_PARSED);
18460+}
18461+
18462+/* hash table support */
18463+
18464+/* compare two jnode keys for equality. Used by hash-table macros */
18465+static inline int jnode_key_eq(const struct jnode_key * k1,
18466+ const struct jnode_key * k2)
18467+{
18468+ assert("nikita-2350", k1 != NULL);
18469+ assert("nikita-2351", k2 != NULL);
18470+
18471+ return (k1->index == k2->index && k1->objectid == k2->objectid);
18472+}
18473+
18474+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18475+static inline __u32 jnode_key_hashfn(j_hash_table * table,
18476+ const struct jnode_key * key)
18477+{
18478+ assert("nikita-2352", key != NULL);
18479+ assert("nikita-3346", IS_POW(table->_buckets));
18480+
18481+	/* yes, this is a remarkably simple (if not stupid) hash function. */
18482+ return (key->objectid + key->index) & (table->_buckets - 1);
18483+}
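
(Because table->_buckets is asserted to be a power of two, masking with _buckets - 1 is just the sum taken modulo _buckets; with the 16384 buckets set up in jnodes_tree_init() below, the low 14 bits of objectid + index select the bucket.)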
18484+
18485+/* The hash table definition */
18486+#define KMALLOC(size) reiser4_vmalloc(size)
18487+#define KFREE(ptr, size) vfree(ptr)
18488+TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18489+ jnode_key_hashfn, jnode_key_eq);
44254afd
MT
18490+#undef KFREE
18491+#undef KMALLOC
18492+
18493+/* call this to initialise jnode hash table */
18494+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18495+{
18496+ assert("nikita-2359", tree != NULL);
18497+ return j_hash_init(&tree->jhash_table, 16384);
18498+}
18499+
18500+/* call this to destroy jnode hash table. This is called during umount. */
18501+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18502+{
18503+ j_hash_table *jtable;
18504+ jnode *node;
18505+ jnode *next;
18506+
18507+ assert("nikita-2360", tree != NULL);
18508+
18509+ /*
18510+ * Scan hash table and free all jnodes.
18511+ */
18512+ jtable = &tree->jhash_table;
18513+ if (jtable->_table) {
18514+ for_all_in_htable(jtable, j, node, next) {
18515+ assert("nikita-2361", !atomic_read(&node->x_count));
18516+ jdrop(node);
18517+ }
18518+
18519+ j_hash_done(&tree->jhash_table);
18520+ }
18521+ return 0;
18522+}
18523+
18524+/**
18525+ * init_jnodes - create jnode cache
18526+ *
18527+ * Initializes slab cache jnodes. It is part of reiser4 module initialization.
18528+ */
18529+int init_jnodes(void)
18530+{
18531+ assert("umka-168", _jnode_slab == NULL);
18532+
18533+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18534+ SLAB_HWCACHE_ALIGN |
18535+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
18536+ if (_jnode_slab == NULL)
18537+ return RETERR(-ENOMEM);
18538+
18539+ return 0;
18540+}
18541+
18542+/**
18543+ * done_jnodes - delete jnode cache
18544+ *
18545+ * This is called on reiser4 module unloading or system shutdown.
18546+ */
18547+void done_jnodes(void)
18548+{
18549+ destroy_reiser4_cache(&_jnode_slab);
18550+}
18551+
18552+/* Initialize a jnode. */
18553+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18554+{
18555+ assert("umka-175", node != NULL);
18556+
18557+ memset(node, 0, sizeof(jnode));
18558+ ON_DEBUG(node->magic = JMAGIC);
18559+ jnode_set_type(node, type);
18560+ atomic_set(&node->d_count, 0);
18561+ atomic_set(&node->x_count, 0);
18562+ spin_lock_init(&node->guard);
18563+ spin_lock_init(&node->load);
18564+ node->atom = NULL;
18565+ node->tree = tree;
18566+ INIT_LIST_HEAD(&node->capture_link);
18567+
18568+ ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18569+
18570+ INIT_RCU_HEAD(&node->rcu);
18571+
18572+#if REISER4_DEBUG
18573+ {
18574+ reiser4_super_info_data *sbinfo;
18575+
18576+ sbinfo = get_super_private(tree->super);
18577+ spin_lock_irq(&sbinfo->all_guard);
18578+ list_add(&node->jnodes, &sbinfo->all_jnodes);
18579+ spin_unlock_irq(&sbinfo->all_guard);
18580+ }
18581+#endif
18582+}
18583+
18584+#if REISER4_DEBUG
18585+/*
18586+ * Remove jnode from ->all_jnodes list.
18587+ */
18588+static void jnode_done(jnode * node, reiser4_tree * tree)
18589+{
18590+ reiser4_super_info_data *sbinfo;
18591+
18592+ sbinfo = get_super_private(tree->super);
18593+
18594+ spin_lock_irq(&sbinfo->all_guard);
18595+ assert("nikita-2422", !list_empty(&node->jnodes));
18596+ list_del_init(&node->jnodes);
18597+ spin_unlock_irq(&sbinfo->all_guard);
18598+}
18599+#endif
18600+
18601+/* return already existing jnode of page */
18602+jnode *jnode_by_page(struct page *pg)
18603+{
18604+ assert("nikita-2066", pg != NULL);
18605+ assert("nikita-2400", PageLocked(pg));
18606+ assert("nikita-2068", PagePrivate(pg));
18607+ assert("nikita-2067", jprivate(pg) != NULL);
18608+ return jprivate(pg);
18609+}
18610+
18611+/* exported functions to allocate/free jnode objects outside this file */
18612+jnode *jalloc(void)
18613+{
18614+	jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18615+ return jal;
18616+}
18617+
18618+/* return jnode back to the slab allocator */
18619+inline void jfree(jnode * node)
18620+{
18621+ assert("zam-449", node != NULL);
18622+
18623+ assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18624+ NODE_LIST(node) == NOT_CAPTURED));
18625+ assert("nikita-3222", list_empty(&node->jnodes));
18626+ assert("nikita-3221", jnode_page(node) == NULL);
18627+
18628+ /* not yet phash_jnode_destroy(node); */
18629+
18630+ kmem_cache_free(_jnode_slab, node);
18631+}
18632+
18633+/*
18634+ * This function is supplied as RCU callback. It actually frees jnode when
18635+ * last reference to it is gone.
18636+ */
18637+static void jnode_free_actor(struct rcu_head *head)
18638+{
18639+ jnode *node;
18640+ jnode_type jtype;
18641+
18642+ node = container_of(head, jnode, rcu);
18643+ jtype = jnode_get_type(node);
18644+
18645+ ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18646+
18647+ switch (jtype) {
18648+ case JNODE_IO_HEAD:
18649+ case JNODE_BITMAP:
18650+ case JNODE_UNFORMATTED_BLOCK:
18651+ jfree(node);
18652+ break;
18653+ case JNODE_FORMATTED_BLOCK:
18654+ zfree(JZNODE(node));
18655+ break;
18656+ case JNODE_INODE:
18657+ default:
18658+ wrong_return_value("nikita-3197", "Wrong jnode type");
18659+ }
18660+}
18661+
18662+/*
18663+ * Free a jnode. Post a callback to be executed later through RCU when all
18664+ * references to @node are released.
18665+ */
18666+static inline void jnode_free(jnode * node, jnode_type jtype)
18667+{
18668+ if (jtype != JNODE_INODE) {
18669+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18670+ call_rcu(&node->rcu, jnode_free_actor);
18671+ } else
18672+ jnode_list_remove(node);
18673+}
18674+
18675+/* allocate new unformatted jnode */
18676+static jnode *jnew_unformatted(void)
18677+{
18678+ jnode *jal;
18679+
18680+ jal = jalloc();
18681+ if (jal == NULL)
18682+ return NULL;
18683+
18684+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18685+ jal->key.j.mapping = NULL;
18686+ jal->key.j.index = (unsigned long)-1;
18687+ jal->key.j.objectid = 0;
18688+ return jal;
18689+}
18690+
18691+/* look for jnode with given mapping and offset within hash table */
18692+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18693+{
18694+	struct jnode_key jkey;
18695+ jnode *node;
18696+
18697+ assert("nikita-2353", tree != NULL);
18698+
18699+ jkey.objectid = objectid;
18700+ jkey.index = index;
18701+
18702+ /*
18703+ * hash table is _not_ protected by any lock during lookups. All we
18704+ * have to do is to disable preemption to keep RCU happy.
18705+ */
18706+
18707+ rcu_read_lock();
18708+ node = j_hash_find(&tree->jhash_table, &jkey);
18709+ if (node != NULL) {
18710+ /* protect @node from recycling */
18711+ jref(node);
18712+ assert("nikita-2955", jnode_invariant(node, 0, 0));
18713+ node = jnode_rip_check(tree, node);
18714+ }
18715+ rcu_read_unlock();
18716+ return node;
18717+}
18718+
18719+/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18720+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18721+{
18722+ assert("vs-1694", mapping->host != NULL);
18723+
18724+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18725+}
18726+
18727+jnode *jfind(struct address_space * mapping, unsigned long index)
18728+{
18729+ reiser4_tree *tree;
18730+ jnode *node;
18731+
18732+ assert("vs-1694", mapping->host != NULL);
18733+	tree = reiser4_tree_by_inode(mapping->host);
18734+
18735+ read_lock_tree(tree);
18736+ node = jfind_nolock(mapping, index);
18737+ if (node != NULL)
18738+ jref(node);
18739+ read_unlock_tree(tree);
18740+ return node;
18741+}
18742+
18743+static void inode_attach_jnode(jnode * node)
18744+{
18745+ struct inode *inode;
18746+ reiser4_inode *info;
18747+ struct radix_tree_root *rtree;
18748+
18749+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18750+ assert("zam-1043", node->key.j.mapping != NULL);
18751+ inode = node->key.j.mapping->host;
18752+ info = reiser4_inode_data(inode);
18753+ rtree = jnode_tree_by_reiser4_inode(info);
18754+ if (rtree->rnode == NULL) {
18755+ /* prevent inode from being pruned when it has jnodes attached
18756+ to it */
18757+ write_lock_irq(&inode->i_data.tree_lock);
18758+ inode->i_data.nrpages++;
18759+ write_unlock_irq(&inode->i_data.tree_lock);
18760+ }
18761+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18762+ check_me("zam-1045",
18763+ !radix_tree_insert(rtree, node->key.j.index, node));
18764+ ON_DEBUG(info->nr_jnodes++);
18765+}
18766+
18767+static void inode_detach_jnode(jnode * node)
18768+{
18769+ struct inode *inode;
18770+ reiser4_inode *info;
18771+ struct radix_tree_root *rtree;
18772+
18773+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18774+ assert("zam-1044", node->key.j.mapping != NULL);
18775+ inode = node->key.j.mapping->host;
18776+ info = reiser4_inode_data(inode);
18777+ rtree = jnode_tree_by_reiser4_inode(info);
18778+
18779+ assert("zam-1051", info->nr_jnodes != 0);
18780+ assert("zam-1052", rtree->rnode != NULL);
18781+ ON_DEBUG(info->nr_jnodes--);
18782+
18783+ /* delete jnode from inode's radix tree of jnodes */
18784+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18785+ if (rtree->rnode == NULL) {
18786+ /* inode can be pruned now */
18787+ write_lock_irq(&inode->i_data.tree_lock);
18788+ inode->i_data.nrpages--;
18789+ write_unlock_irq(&inode->i_data.tree_lock);
18790+ }
18791+}
18792+
18793+/* put jnode into the hash table (where it can be found by flush, which does not
18794+   know the mapping) and into the inode's tree of jnodes (where it can be found,
18795+   hopefully faster, in places where the mapping is known). Currently it is used by
18796+ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
18797+ created */
18798+static void
18799+hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18800+ unsigned long index)
18801+{
18802+ j_hash_table *jtable;
18803+
18804+ assert("vs-1446", jnode_is_unformatted(node));
18805+ assert("vs-1442", node->key.j.mapping == 0);
18806+ assert("vs-1443", node->key.j.objectid == 0);
18807+ assert("vs-1444", node->key.j.index == (unsigned long)-1);
18808+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18809+
18810+ node->key.j.mapping = mapping;
18811+ node->key.j.objectid = get_inode_oid(mapping->host);
18812+ node->key.j.index = index;
18813+
18814+ jtable = &jnode_get_tree(node)->jhash_table;
18815+
18816+ /* race with some other thread inserting jnode into the hash table is
18817+ * impossible, because we keep the page lock. */
18818+ /*
18819+ * following assertion no longer holds because of RCU: it is possible
18820+ * jnode is in the hash table, but with JNODE_RIP bit set.
18821+ */
18822+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18823+ j_hash_insert_rcu(jtable, node);
18824+ inode_attach_jnode(node);
18825+}
18826+
18827+static void unhash_unformatted_node_nolock(jnode * node)
18828+{
18829+ assert("vs-1683", node->key.j.mapping != NULL);
18830+ assert("vs-1684",
18831+ node->key.j.objectid ==
18832+ get_inode_oid(node->key.j.mapping->host));
18833+
18834+ /* remove jnode from hash-table */
18835+ j_hash_remove_rcu(&node->tree->jhash_table, node);
18836+ inode_detach_jnode(node);
18837+ node->key.j.mapping = NULL;
18838+ node->key.j.index = (unsigned long)-1;
18839+ node->key.j.objectid = 0;
18840+
18841+}
18842+
18843+/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18844+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18845+   reiser4_uncapture_jnode */
18846+void unhash_unformatted_jnode(jnode * node)
18847+{
18848+ assert("vs-1445", jnode_is_unformatted(node));
18849+
18850+ write_lock_tree(node->tree);
18851+ unhash_unformatted_node_nolock(node);
18852+ write_unlock_tree(node->tree);
18853+}
18854+
18855+/*
18856+ * search hash table for a jnode with given oid and index. If not found,
18857+ * allocate new jnode, insert it, and also insert into radix tree for the
18858+ * given inode/mapping.
18859+ */
18860+static jnode *find_get_jnode(reiser4_tree * tree,
18861+ struct address_space *mapping,
18862+ oid_t oid, unsigned long index)
18863+{
18864+ jnode *result;
18865+ jnode *shadow;
18866+ int preload;
18867+
18868+ result = jnew_unformatted();
18869+
18870+ if (unlikely(result == NULL))
18871+ return ERR_PTR(RETERR(-ENOMEM));
18872+
18873+	preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18874+ if (preload != 0)
18875+ return ERR_PTR(preload);
18876+
18877+ write_lock_tree(tree);
18878+ shadow = jfind_nolock(mapping, index);
18879+ if (likely(shadow == NULL)) {
18880+ /* add new jnode to hash table and inode's radix tree of jnodes */
18881+ jref(result);
18882+ hash_unformatted_jnode(result, mapping, index);
18883+ } else {
18884+ /* jnode is found in inode's radix tree of jnodes */
18885+ jref(shadow);
18886+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18887+ assert("vs-1498", shadow->key.j.mapping == mapping);
18888+ result = shadow;
18889+ }
18890+ write_unlock_tree(tree);
18891+
18892+ assert("nikita-2955",
18893+ ergo(result != NULL, jnode_invariant(result, 0, 0)));
18894+ radix_tree_preload_end();
18895+ return result;
18896+}
18897+
18898+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18899+ creates) jnode corresponding to page @pg. jnode is attached to page and
18900+ inserted into jnode hash-table. */
18901+static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18902+{
18903+ /*
18904+ * There are two ways to create jnode: starting with pre-existing page
18905+ * and without page.
18906+ *
18907+ * When page already exists, jnode is created
18908+ * (jnode_of_page()->do_jget()) under page lock. This is done in
18909+ * ->writepage(), or when capturing anonymous page dirtied through
18910+ * mmap.
18911+ *
18912+ * Jnode without page is created by index_extent_jnode().
18913+ *
18914+ */
18915+
18916+ jnode *result;
18917+ oid_t oid = get_inode_oid(pg->mapping->host);
18918+
18919+ assert("umka-176", pg != NULL);
18920+ assert("nikita-2394", PageLocked(pg));
18921+
18922+ result = jprivate(pg);
18923+ if (likely(result != NULL))
18924+ return jref(result);
18925+
18926+	tree = reiser4_tree_by_page(pg);
18927+
18928+ /* check hash-table first */
18929+ result = jfind(pg->mapping, pg->index);
18930+ if (unlikely(result != NULL)) {
18931+ spin_lock_jnode(result);
18932+ jnode_attach_page(result, pg);
18933+ spin_unlock_jnode(result);
18934+ result->key.j.mapping = pg->mapping;
18935+ return result;
18936+ }
18937+
18938+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */
18939+ reiser4_ctx_gfp_mask_force(GFP_NOFS);
18940+ result = find_get_jnode(tree, pg->mapping, oid, pg->index);
18941+ if (unlikely(IS_ERR(result)))
18942+ return result;
18943+ /* attach jnode to page */
18944+ spin_lock_jnode(result);
18945+ jnode_attach_page(result, pg);
18946+ spin_unlock_jnode(result);
18947+ return result;
18948+}
18949+
18950+/*
18951+ * return jnode for @pg, creating it if necessary.
18952+ */
18953+jnode *jnode_of_page(struct page * pg)
18954+{
18955+ jnode *result;
18956+
18957+ assert("umka-176", pg != NULL);
18958+ assert("nikita-2394", PageLocked(pg));
18959+
18960+	result = do_jget(reiser4_tree_by_page(pg), pg);
18961+
18962+ if (REISER4_DEBUG && !IS_ERR(result)) {
18963+ assert("nikita-3210", result == jprivate(pg));
18964+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
18965+ if (jnode_is_unformatted(jprivate(pg))) {
18966+ assert("nikita-2364",
18967+ jprivate(pg)->key.j.index == pg->index);
18968+ assert("nikita-2367",
18969+ jprivate(pg)->key.j.mapping == pg->mapping);
18970+ assert("nikita-2365",
18971+ jprivate(pg)->key.j.objectid ==
18972+ get_inode_oid(pg->mapping->host));
18973+ assert("vs-1200",
18974+ jprivate(pg)->key.j.objectid ==
18975+ pg->mapping->host->i_ino);
18976+ assert("nikita-2356",
18977+ jnode_is_unformatted(jnode_by_page(pg)));
18978+ }
18979+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
18980+ }
18981+ return result;
18982+}
18983+
18984+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
18985+ * page.*/
18986+void jnode_attach_page(jnode * node, struct page *pg)
18987+{
18988+ assert("nikita-2060", node != NULL);
18989+ assert("nikita-2061", pg != NULL);
18990+
18991+ assert("nikita-2050", jprivate(pg) == 0ul);
18992+ assert("nikita-2393", !PagePrivate(pg));
18993+ assert("vs-1741", node->pg == NULL);
18994+
18995+ assert("nikita-2396", PageLocked(pg));
18996+ assert_spin_locked(&(node->guard));
18997+
18998+ page_cache_get(pg);
18999+ set_page_private(pg, (unsigned long)node);
19000+ node->pg = pg;
19001+ SetPagePrivate(pg);
19002+}
19003+
19004+/* Dual to jnode_attach_page: break a binding between page and jnode */
19005+void page_clear_jnode(struct page *page, jnode * node)
19006+{
19007+ assert("nikita-2424", page != NULL);
19008+ assert("nikita-2425", PageLocked(page));
19009+ assert("nikita-2426", node != NULL);
19010+ assert_spin_locked(&(node->guard));
19011+ assert("nikita-2428", PagePrivate(page));
19012+
19013+ assert("nikita-3551", !PageWriteback(page));
19014+
19015+ JF_CLR(node, JNODE_PARSED);
19016+ set_page_private(page, 0ul);
19017+ ClearPagePrivate(page);
19018+ node->pg = NULL;
19019+ page_cache_release(page);
19020+}
19021+
19022+#if 0
19023+/* it is only used in one place to handle error */
19024+void
19025+page_detach_jnode(struct page *page, struct address_space *mapping,
19026+ unsigned long index)
19027+{
19028+ assert("nikita-2395", page != NULL);
19029+
19030+ lock_page(page);
19031+ if ((page->mapping == mapping) && (page->index == index)
19032+ && PagePrivate(page)) {
19033+ jnode *node;
19034+
19035+ node = jprivate(page);
19036+ spin_lock_jnode(node);
19037+ page_clear_jnode(page, node);
19038+ spin_unlock_jnode(node);
19039+ }
19040+ unlock_page(page);
19041+}
19042+#endif  /* 0 */
19043+
19044+/* return @node page locked.
19045+
19046+ Locking ordering requires that one first takes page lock and afterwards
19047+ spin lock on node attached to this page. Sometimes it is necessary to go in
19048+ the opposite direction. This is done through standard trylock-and-release
19049+ loop.
19050+*/
19051+static struct page *jnode_lock_page(jnode * node)
19052+{
19053+ struct page *page;
19054+
19055+ assert("nikita-2052", node != NULL);
19056+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19057+
19058+ while (1) {
19059+
19060+ spin_lock_jnode(node);
19061+ page = jnode_page(node);
19062+ if (page == NULL) {
19063+ break;
19064+ }
19065+
19066+ /* no need to page_cache_get( page ) here, because page cannot
19067+ be evicted from memory without detaching it from jnode and
19068+ this requires spin lock on jnode that we already hold.
19069+ */
19070+ if (!TestSetPageLocked(page)) {
19071+ /* We won a lock on jnode page, proceed. */
19072+ break;
19073+ }
19074+
19075+ /* Page is locked by someone else. */
19076+ page_cache_get(page);
19077+ spin_unlock_jnode(node);
19078+ wait_on_page_locked(page);
19079+ /* it is possible that page was detached from jnode and
19080+ returned to the free pool, or re-assigned while we were
19081+ waiting on locked bit. This will be rechecked on the next
19082+ loop iteration.
19083+ */
19084+ page_cache_release(page);
19085+
19086+ /* try again */
19087+ }
19088+ return page;
19089+}
19090+
19091+/*
19092+ * if JNODE_PARSED bit is not set, call ->parse() method of jnode to verify
19093+ * validity of jnode content.
19094+ */
19095+static inline int jparse(jnode * node)
19096+{
19097+ int result;
19098+
19099+ assert("nikita-2466", node != NULL);
19100+
19101+ spin_lock_jnode(node);
19102+ if (likely(!jnode_is_parsed(node))) {
19103+ result = jnode_ops(node)->parse(node);
19104+ if (likely(result == 0))
19105+ JF_SET(node, JNODE_PARSED);
19106+ } else
19107+ result = 0;
19108+ spin_unlock_jnode(node);
19109+ return result;
19110+}
19111+
19112+/* Lock a page attached to jnode, create and attach page to jnode if it had no
19113+ * one. */
19114+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19115+{
19116+ struct page *page;
19117+
19118+ spin_lock_jnode(node);
19119+ page = jnode_page(node);
19120+
19121+ if (page == NULL) {
19122+ spin_unlock_jnode(node);
19123+ page = find_or_create_page(jnode_get_mapping(node),
19124+ jnode_get_index(node), gfp_flags);
19125+ if (page == NULL)
19126+ return ERR_PTR(RETERR(-ENOMEM));
19127+ } else {
19128+ if (!TestSetPageLocked(page)) {
19129+ spin_unlock_jnode(node);
19130+ return page;
19131+ }
19132+ page_cache_get(page);
19133+ spin_unlock_jnode(node);
19134+ lock_page(page);
19135+ assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19136+ }
19137+
19138+ spin_lock_jnode(node);
19139+ if (!jnode_page(node))
19140+ jnode_attach_page(node, page);
19141+ spin_unlock_jnode(node);
19142+
19143+ page_cache_release(page);
19144+ assert("zam-894", jnode_page(node) == page);
19145+ return page;
19146+}
19147+
19148+/* Start read operation for jnode's page if page is not up-to-date. */
19149+static int jnode_start_read(jnode * node, struct page *page)
19150+{
19151+ assert("zam-893", PageLocked(page));
19152+
19153+ if (PageUptodate(page)) {
19154+ unlock_page(page);
19155+ return 0;
19156+ }
19157+	return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19158+}
19159+
19160+#if REISER4_DEBUG
19161+static void check_jload(jnode * node, struct page *page)
19162+{
19163+ if (jnode_is_znode(node)) {
19164+ node40_header *nh;
19165+ znode *z;
19166+
19167+ z = JZNODE(node);
19168+ if (znode_is_any_locked(z)) {
19169+ nh = (node40_header *) kmap(page);
19170+ /* this only works for node40-only file systems. For
19171+ * debugging. */
19172+ assert("nikita-3253",
19173+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19174+ kunmap(page);
19175+ }
19176+ assert("nikita-3565", znode_invariant(z));
19177+ }
19178+}
19179+#else
19180+#define check_jload(node, page) noop
19181+#endif
19182+
19183+/* prefetch jnode to speed up next call to jload. Call this when you are going
19184+ * to call jload() shortly. This will bring appropriate portion of jnode into
19185+ * CPU cache. */
19186+void jload_prefetch(jnode * node)
19187+{
19188+ prefetchw(&node->x_count);
19189+}
19190+
19191+/* load jnode's data into memory */
19192+int jload_gfp(jnode * node /* node to load */ ,
19193+ gfp_t gfp_flags /* allocation flags */ ,
19194+ int do_kmap /* true if page should be kmapped */ )
19195+{
19196+ struct page *page;
19197+ int result = 0;
19198+ int parsed;
19199+
19200+	assert("nikita-3010", reiser4_schedulable());
19201+
19202+ prefetchw(&node->pg);
19203+
19204+ /* taking d-reference implies taking x-reference. */
19205+ jref(node);
19206+
19207+ /*
19208+ * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19209+ * should be atomic, otherwise there is a race against
19210+ * reiser4_releasepage().
19211+ */
19212+ spin_lock(&(node->load));
19213+ add_d_ref(node);
19214+ parsed = jnode_is_parsed(node);
19215+ spin_unlock(&(node->load));
19216+
19217+ if (unlikely(!parsed)) {
19218+ page = jnode_get_page_locked(node, gfp_flags);
19219+ if (unlikely(IS_ERR(page))) {
19220+ result = PTR_ERR(page);
19221+ goto failed;
19222+ }
19223+
19224+ result = jnode_start_read(node, page);
19225+ if (unlikely(result != 0))
19226+ goto failed;
19227+
19228+ wait_on_page_locked(page);
19229+ if (unlikely(!PageUptodate(page))) {
19230+ result = RETERR(-EIO);
19231+ goto failed;
19232+ }
19233+
19234+ if (do_kmap)
19235+ node->data = kmap(page);
19236+
19237+ result = jparse(node);
19238+ if (unlikely(result != 0)) {
19239+ if (do_kmap)
19240+ kunmap(page);
19241+ goto failed;
19242+ }
19243+ check_jload(node, page);
19244+ } else {
19245+ page = jnode_page(node);
19246+ check_jload(node, page);
19247+ if (do_kmap)
19248+ node->data = kmap(page);
19249+ }
19250+
19251+ if (!is_writeout_mode())
19252+ /* We do not mark pages active if jload is called as a part of
19253+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19254+ * and write_logs() add no value to cached data, there is no
19255+ * sense to mark pages as active when they go to disk, it just
19256+ * confuses vm scanning routines because clean page could be
19257+ * moved out from inactive list as a result of this
19258+ * mark_page_accessed() call. */
19259+ mark_page_accessed(page);
19260+
19261+ return 0;
19262+
19263+ failed:
19264+ jrelse_tail(node);
19265+ return result;
19266+
19267+}
19268+
19269+/* start asynchronous reading for given jnode's page. */
19270+int jstartio(jnode * node)
19271+{
19272+ struct page *page;
19273+
19274+	page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19275+ if (IS_ERR(page))
19276+ return PTR_ERR(page);
19277+
19278+ return jnode_start_read(node, page);
19279+}
19280+
19281+/* Initialize a node by calling appropriate plugin instead of reading
19282+ * node from disk as in jload(). */
19283+int jinit_new(jnode * node, gfp_t gfp_flags)
19284+{
19285+ struct page *page;
19286+ int result;
19287+
19288+ jref(node);
19289+ add_d_ref(node);
19290+
19291+ page = jnode_get_page_locked(node, gfp_flags);
19292+ if (IS_ERR(page)) {
19293+ result = PTR_ERR(page);
19294+ goto failed;
19295+ }
19296+
19297+ SetPageUptodate(page);
19298+ unlock_page(page);
19299+
19300+ node->data = kmap(page);
19301+
19302+ if (!jnode_is_parsed(node)) {
19303+ jnode_plugin *jplug = jnode_ops(node);
19304+ spin_lock_jnode(node);
19305+ result = jplug->init(node);
19306+ spin_unlock_jnode(node);
19307+ if (result) {
19308+ kunmap(page);
19309+ goto failed;
19310+ }
19311+ JF_SET(node, JNODE_PARSED);
19312+ }
19313+
19314+ return 0;
19315+
19316+ failed:
19317+ jrelse(node);
19318+ return result;
19319+}
19320+
19321+/* release a reference to jnode acquired by jload(), decrement ->d_count */
19322+void jrelse_tail(jnode * node /* jnode to release references to */ )
19323+{
19324+ assert("nikita-489", atomic_read(&node->d_count) > 0);
19325+ atomic_dec(&node->d_count);
19326+ /* release reference acquired in jload_gfp() or jinit_new() */
19327+ jput(node);
19328+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
19329+ LOCK_CNT_DEC(d_refs);
19330+}
19331+
19332+/* drop reference to node data. When last reference is dropped, data are
19333+ unloaded. */
19334+void jrelse(jnode * node /* jnode to release references to */ )
19335+{
19336+ struct page *page;
19337+
19338+ assert("nikita-487", node != NULL);
19339+ assert_spin_not_locked(&(node->guard));
19340+
19341+ page = jnode_page(node);
19342+ if (likely(page != NULL)) {
19343+ /*
19344+ * it is safe not to lock jnode here, because at this point
19345+ * @node->d_count is greater than zero (if jrelse() is used
19346+ * correctly, that is). JNODE_PARSED may be not set yet, if,
19347+ * for example, we got here as a result of error handling path
19348+ * in jload(). Anyway, page cannot be detached by
19349+ * reiser4_releasepage(). truncate will invalidate page
19350+ * regardless, but this should not be a problem.
19351+ */
19352+ kunmap(page);
19353+ }
19354+ jrelse_tail(node);
19355+}
19356+
19357+/* called from jput() to wait for io completion */
19358+static void jnode_finish_io(jnode * node)
19359+{
19360+ struct page *page;
19361+
19362+ assert("nikita-2922", node != NULL);
19363+
19364+ spin_lock_jnode(node);
19365+ page = jnode_page(node);
19366+ if (page != NULL) {
19367+ page_cache_get(page);
19368+ spin_unlock_jnode(node);
19369+ wait_on_page_writeback(page);
19370+ page_cache_release(page);
19371+ } else
19372+ spin_unlock_jnode(node);
19373+}
19374+
19375+/*
19376+ * This is called by jput() when last reference to jnode is released. This is
19377+ * separate function, because we want fast path of jput() to be inline and,
19378+ * therefore, small.
19379+ */
19380+void jput_final(jnode * node)
19381+{
19382+ int r_i_p;
19383+
19384+ /* A fast check for keeping node in cache. We always keep node in cache
19385+ * if its page is present and node was not marked for deletion */
19386+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19387+ rcu_read_unlock();
19388+ return;
19389+ }
19390+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19391+ /*
19392+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19393+ * this case it is safe to access node after unlock.
19394+ */
19395+ rcu_read_unlock();
19396+ if (r_i_p) {
19397+ jnode_finish_io(node);
19398+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19399+ /* node is removed from the tree. */
19400+ jdelete(node);
19401+ else
19402+ jnode_try_drop(node);
19403+ }
19404+ /* if !r_i_p some other thread is already killing it */
19405+}
19406+
19407+int jwait_io(jnode * node, int rw)
19408+{
19409+ struct page *page;
19410+ int result;
19411+
19412+ assert("zam-447", node != NULL);
19413+ assert("zam-448", jnode_page(node) != NULL);
19414+
19415+ page = jnode_page(node);
19416+
19417+ result = 0;
19418+ if (rw == READ) {
19419+ wait_on_page_locked(page);
19420+ } else {
19421+ assert("nikita-2227", rw == WRITE);
19422+ wait_on_page_writeback(page);
19423+ }
19424+ if (PageError(page))
19425+ result = RETERR(-EIO);
19426+
19427+ return result;
19428+}
19429+
19430+/*
19431+ * jnode types and plugins.
19432+ *
19433+ * jnode by itself is a "base type". There are several different jnode
19434+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19435+ * has to do different things based on jnode type. In the standard reiser4 way
19436+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19437+ *
19438+ * Functions below deal with jnode types and define methods of jnode plugin.
19439+ *
19440+ */
19441+
19442+/* set jnode type. This is done during jnode initialization. */
19443+static void jnode_set_type(jnode * node, jnode_type type)
19444+{
19445+ static unsigned long type_to_mask[] = {
19446+ [JNODE_UNFORMATTED_BLOCK] = 1,
19447+ [JNODE_FORMATTED_BLOCK] = 0,
19448+ [JNODE_BITMAP] = 2,
19449+ [JNODE_IO_HEAD] = 6,
19450+ [JNODE_INODE] = 4
19451+ };
19452+
19453+ assert("zam-647", type < LAST_JNODE_TYPE);
19454+ assert("nikita-2815", !jnode_is_loaded(node));
19455+ assert("nikita-3386", node->state == 0);
19456+
19457+ node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19458+}
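For illustration, a self-contained, runnable sketch of the 3-bit type encoding written here and decoded by jnode_get_type() in jnode.h further down; JNODE_TYPE_1 is bit 13 per the jnode state enum, and the two tables mirror type_to_mask above and mask_to_type below:

	#include <assert.h>
	#include <stdio.h>

	enum { TYPE_SHIFT = 13 };	/* JNODE_TYPE_1 in the state word */

	/* encode table in jnode_type enum order (unformatted, formatted,
	 * bitmap, io head, inode) -- mirrors type_to_mask above */
	static const unsigned long type_to_mask[] = { 1, 0, 2, 6, 4 };
	/* decode table -- mirrors mask_to_type in jnode_get_type();
	 * 5 stands in for LAST_JNODE_TYPE (invalid) */
	static const int mask_to_type[] = { 1, 0, 2, 5, 4, 5, 3, 5 };

	int main(void)
	{
		int t;

		for (t = 0; t < 5; t++) {
			unsigned long state = type_to_mask[t] << TYPE_SHIFT;
			int decoded = mask_to_type[(state >> TYPE_SHIFT) & 7];

			printf("type %d -> state %#lx -> type %d\n",
			       t, state, decoded);
			assert(decoded == t);	/* encoding round-trips */
		}
		return 0;
	}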
19459+
19460+/* ->init() method of jnode plugin for jnodes that don't require plugin
19461+ * specific initialization. */
19462+static int init_noinit(jnode * node UNUSED_ARG)
19463+{
19464+ return 0;
19465+}
19466+
19467+/* ->parse() method of jnode plugin for jnodes that don't require plugin
19468+ * specific parsing. */
19469+static int parse_noparse(jnode * node UNUSED_ARG)
19470+{
19471+ return 0;
19472+}
19473+
19474+/* ->mapping() method for unformatted jnode */
19475+struct address_space *mapping_jnode(const jnode * node)
19476+{
19477+ struct address_space *map;
19478+
19479+ assert("nikita-2713", node != NULL);
19480+
19481+ /* mapping is stored in jnode */
19482+
19483+ map = node->key.j.mapping;
19484+ assert("nikita-2714", map != NULL);
19485+ assert("nikita-2897", is_reiser4_inode(map->host));
19486+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19487+ return map;
19488+}
19489+
19490+/* ->index() method for unformatted jnodes */
19491+unsigned long index_jnode(const jnode * node)
19492+{
19493+ /* index is stored in jnode */
19494+ return node->key.j.index;
19495+}
19496+
19497+/* ->remove() method for unformatted jnodes */
19498+static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19499+{
19500+ /* remove jnode from hash table and radix tree */
19501+ if (node->key.j.mapping)
19502+ unhash_unformatted_node_nolock(node);
19503+}
19504+
19505+/* ->mapping() method for znodes */
19506+static struct address_space *mapping_znode(const jnode * node)
19507+{
19508+ /* all znodes belong to fake inode */
19509+	return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19510+}
19511+
19512+/* ->index() method for znodes */
19513+static unsigned long index_znode(const jnode * node)
19514+{
19515+ unsigned long addr;
19516+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19517+
19518+ /* index of znode is just its address (shifted) */
19519+ addr = (unsigned long)node;
19520+ return (addr - PAGE_OFFSET) >> znode_shift_order;
19521+}
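For illustration, the address/index relation this exploits, as a self-contained sketch; the PAGE_OFFSET and shift values here are invented for the example, only the arithmetic matters. Distinct znodes yield distinct indices because, per the assertion above, a znode is larger than 1 << znode_shift_order bytes:

	#include <assert.h>

	#define EX_PAGE_OFFSET 0xc0000000UL	/* example value only */
	#define EX_SHIFT 9			/* example znode_shift_order */

	int main(void)
	{
		/* a kernel address 42 "slots" above EX_PAGE_OFFSET ... */
		unsigned long addr = EX_PAGE_OFFSET + (42UL << EX_SHIFT);
		/* ... maps to index 42, exactly as index_znode() computes */
		unsigned long index = (addr - EX_PAGE_OFFSET) >> EX_SHIFT;

		assert(index == 42);
		return 0;
	}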
19522+
19523+/* ->mapping() method for bitmap jnode */
19524+static struct address_space *mapping_bitmap(const jnode * node)
19525+{
19526+ /* all bitmap blocks belong to special bitmap inode */
19527+ return get_super_private(jnode_get_tree(node)->super)->bitmap->
19528+ i_mapping;
19529+}
19530+
19531+/* ->index() method for jnodes that are indexed by address */
19532+static unsigned long index_is_address(const jnode * node)
19533+{
19534+ unsigned long ind;
19535+
19536+ ind = (unsigned long)node;
19537+ return ind - PAGE_OFFSET;
19538+}
19539+
19540+/* resolve race with jput */
19541+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19542+{
19543+ /*
19544+ * This is used as part of RCU-based jnode handling.
19545+ *
19546+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19547+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19548+ * not protected during this, so concurrent thread may execute
19549+	 * zget-set-HEARD_BANSHEE-zput, or otherwise cause the jnode to be
19550+	 * freed in jput_final(). To avoid such races, jput_final() sets
19551+	 * JNODE_RIP on the jnode (under tree lock). All places that work with
19552+	 * unreferenced jnodes call this function. It checks for the JNODE_RIP
19553+	 * bit (first without taking the tree lock) and, if the bit is set,
19554+	 * releases the reference acquired by the current thread and returns NULL.
19555+ *
19556+ * As a result, if jnode is being concurrently freed, NULL is returned
19557+ * and caller should pretend that jnode wasn't found in the first
19558+ * place.
19559+ *
19560+ * Otherwise it's safe to release "rcu-read-lock" and continue with
19561+ * jnode.
19562+ */
19563+ if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19564+ read_lock_tree(tree);
19565+ if (JF_ISSET(node, JNODE_RIP)) {
19566+ dec_x_ref(node);
19567+ node = NULL;
19568+ }
19569+ read_unlock_tree(tree);
19570+ }
19571+ return node;
19572+}
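For illustration, the lookup pattern the comment above describes, as a hedged sketch of a jlookup()-style caller; hash_find_unreferenced() is hypothetical and stands for the elided hash-table lookup, while add_x_ref() and jnode_rip_check() are defined in jnode.h further down:

	jnode *lookup_sketch(reiser4_tree * tree, oid_t oid, unsigned long index)
	{
		jnode *node;

		rcu_read_lock();
		node = hash_find_unreferenced(tree, oid, index);
		if (node != NULL) {
			add_x_ref(node);	/* claim a reference first... */
			/* ...then resolve the race with jput_final() */
			node = jnode_rip_check(tree, node);
		}
		rcu_read_unlock();
		return node;	/* NULL: not found, or concurrently freed */
	}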
19573+
19574+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19575+{
19576+ struct inode *inode;
19577+ item_plugin *iplug;
19578+ loff_t off;
19579+
19580+ assert("nikita-3092", node != NULL);
19581+ assert("nikita-3093", key != NULL);
19582+ assert("nikita-3094", jnode_is_unformatted(node));
19583+
19584+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19585+ inode = mapping_jnode(node)->host;
19586+
19587+ if (node->parent_item_id != 0)
19588+ iplug = item_plugin_by_id(node->parent_item_id);
19589+ else
19590+ iplug = NULL;
19591+
19592+ if (iplug != NULL && iplug->f.key_by_offset)
19593+ iplug->f.key_by_offset(inode, off, key);
19594+ else {
19595+ file_plugin *fplug;
19596+
19597+ fplug = inode_file_plugin(inode);
19598+ assert("zam-1007", fplug != NULL);
19599+ assert("zam-1008", fplug->key_by_inode != NULL);
19600+
19601+ fplug->key_by_inode(inode, off, key);
19602+ }
19603+
19604+ return key;
19605+}
19606+
19607+/* ->parse() method for formatted nodes */
19608+static int parse_znode(jnode * node)
19609+{
19610+ return zparse(JZNODE(node));
19611+}
19612+
19613+/* ->delete() method for formatted nodes */
19614+static void delete_znode(jnode * node, reiser4_tree * tree)
19615+{
19616+ znode *z;
19617+
19618+ assert_rw_write_locked(&(tree->tree_lock));
19619+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19620+
19621+ z = JZNODE(node);
19622+ assert("vs-899", z->c_count == 0);
19623+
19624+ /* delete znode from sibling list. */
19625+ sibling_list_remove(z);
19626+
19627+ znode_remove(z, tree);
19628+}
19629+
19630+/* ->remove() method for formatted nodes */
19631+static int remove_znode(jnode * node, reiser4_tree * tree)
19632+{
19633+ znode *z;
19634+
19635+ assert_rw_write_locked(&(tree->tree_lock));
19636+ z = JZNODE(node);
19637+
19638+ if (z->c_count == 0) {
19639+ /* detach znode from sibling list. */
19640+ sibling_list_drop(z);
19641+ /* this is called with tree spin-lock held, so call
19642+ znode_remove() directly (rather than znode_lock_remove()). */
19643+ znode_remove(z, tree);
19644+ return 0;
19645+ }
19646+ return RETERR(-EBUSY);
19647+}
19648+
19649+/* ->init() method for formatted nodes */
19650+static int init_znode(jnode * node)
19651+{
19652+ znode *z;
19653+
19654+ z = JZNODE(node);
19655+ /* call node plugin to do actual initialization */
19656+ return z->nplug->init(z);
19657+}
19658+
19659+/* ->clone() method for formatted nodes */
19660+static jnode *clone_formatted(jnode * node)
19661+{
19662+ znode *clone;
19663+
19664+ assert("vs-1430", jnode_is_znode(node));
19665+	clone = zalloc(reiser4_ctx_gfp_mask_get());
19666+ if (clone == NULL)
19667+ return ERR_PTR(RETERR(-ENOMEM));
19668+ zinit(clone, NULL, current_tree);
19669+ jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19670+ /* ZJNODE(clone)->key.z is not initialized */
19671+ clone->level = JZNODE(node)->level;
19672+
19673+ return ZJNODE(clone);
19674+}
19675+
19676+/* jplug->clone for unformatted nodes */
19677+static jnode *clone_unformatted(jnode * node)
19678+{
19679+ jnode *clone;
19680+
19681+ assert("vs-1431", jnode_is_unformatted(node));
19682+ clone = jalloc();
19683+ if (clone == NULL)
19684+ return ERR_PTR(RETERR(-ENOMEM));
19685+
19686+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19687+ jnode_set_block(clone, jnode_get_block(node));
19688+
19689+ return clone;
19690+
19691+}
19692+
19693+/*
19694+ * Setup jnode plugin methods for various jnode types.
19695+ */
19696+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19697+ [JNODE_UNFORMATTED_BLOCK] = {
19698+ .h = {
19699+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19700+ .id = JNODE_UNFORMATTED_BLOCK,
19701+ .pops = NULL,
19702+ .label = "unformatted",
19703+ .desc = "unformatted node",
19704+ .linkage = {NULL, NULL}
19705+ },
19706+ .init = init_noinit,
19707+ .parse = parse_noparse,
19708+ .mapping = mapping_jnode,
19709+ .index = index_jnode,
19710+ .clone = clone_unformatted
19711+ },
19712+ [JNODE_FORMATTED_BLOCK] = {
19713+ .h = {
19714+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19715+ .id = JNODE_FORMATTED_BLOCK,
19716+ .pops = NULL,
19717+ .label = "formatted",
19718+ .desc = "formatted tree node",
19719+ .linkage = {NULL, NULL}
19720+ },
19721+ .init = init_znode,
19722+ .parse = parse_znode,
19723+ .mapping = mapping_znode,
19724+ .index = index_znode,
19725+ .clone = clone_formatted
19726+ },
19727+ [JNODE_BITMAP] = {
19728+ .h = {
19729+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19730+ .id = JNODE_BITMAP,
19731+ .pops = NULL,
19732+ .label = "bitmap",
19733+ .desc = "bitmap node",
19734+ .linkage = {NULL, NULL}
19735+ },
19736+ .init = init_noinit,
19737+ .parse = parse_noparse,
19738+ .mapping = mapping_bitmap,
19739+ .index = index_is_address,
19740+ .clone = NULL
19741+ },
19742+ [JNODE_IO_HEAD] = {
19743+ .h = {
19744+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19745+ .id = JNODE_IO_HEAD,
19746+ .pops = NULL,
19747+ .label = "io head",
19748+ .desc = "io head",
19749+ .linkage = {NULL, NULL}
19750+ },
19751+ .init = init_noinit,
19752+ .parse = parse_noparse,
19753+ .mapping = mapping_bitmap,
19754+ .index = index_is_address,
19755+ .clone = NULL
19756+ },
19757+ [JNODE_INODE] = {
19758+ .h = {
19759+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
19760+ .id = JNODE_INODE,
19761+ .pops = NULL,
19762+ .label = "inode",
19763+ .desc = "inode's builtin jnode",
19764+ .linkage = {NULL, NULL}
19765+ },
19766+ .init = NULL,
19767+ .parse = NULL,
19768+ .mapping = NULL,
19769+ .index = NULL,
19770+ .clone = NULL
19771+ }
19772+};
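For illustration, how this table is consumed: jnode_ops() (defined in jnode.h further down) picks the plugin slot for a node's type, so type-dependent behaviour becomes one indirect call. A hedged sketch in the spirit of jnode_get_index():

	/* sketch: type-dispatched lookup through the plugin table */
	static struct address_space *mapping_of(const jnode * node)
	{
		jnode_plugin *jplug = jnode_ops(node);

		/* JNODE_INODE leaves ->mapping NULL in the table above, so
		 * a caller must not ask for the mapping of an inode jnode */
		assert("example-1", jplug->mapping != NULL);
		return jplug->mapping(node);
	}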
19773+
19774+/*
19775+ * jnode destruction.
19776+ *
19777+ * Thread may use a jnode after it acquired a reference to it. References are
19778+ * counted in ->x_count field. Reference protects jnode from being
19779+ * recycled. This is different from protecting jnode data (that are stored in
19780+ * jnode page) from being evicted from memory. Data are protected by jload()
19781+ * and released by jrelse().
19782+ *
19783+ * If thread already possesses a reference to the jnode it can acquire another
19784+ * one through jref(). Initial reference is obtained (usually) by locating
19785+ * jnode in some indexing structure that depends on jnode type: formatted
19786+ * nodes are kept in global hash table, where they are indexed by block
19787+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19788+ * table, which is indexed by oid and offset within file, and in per-inode
19789+ * radix tree.
19790+ *
19791+ * Reference to jnode is released by jput(). If last reference is released,
19792+ * jput_final() is called. This function determines whether jnode has to be
19793+ * deleted (this happens when corresponding node is removed from the file
19794+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19795+ * should be just "removed" (deleted from memory).
19796+ *
19797+ * Jnode destruction is a signally delicate dance because of locking and RCU.
19798+ */
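For illustration, the reference discipline described above from a user's point of view, as a hedged sketch; process_data() is hypothetical, while jlookup(), jload(), jdata(), jrelse() and jput() are the functions declared in jnode.h further down:

	static void use_jnode_sketch(reiser4_tree * tree, oid_t oid,
				     unsigned long index)
	{
		jnode *node;

		node = jlookup(tree, oid, index);	/* x_count reference */
		if (node == NULL)
			return;
		if (jload(node) == 0) {			/* d_count reference */
			process_data(jdata(node));	/* hypothetical consumer */
			jrelse(node);			/* drop d_count */
		}
		jput(node);	/* drop x_count; last ref ends in jput_final() */
	}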
19799+
19800+/*
19801+ * Returns true if jnode cannot be removed right now. This check is called
19802+ * under tree lock. If it returns true, jnode is irrevocably committed to be
19803+ * deleted/removed.
19804+ */
19805+static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19806+{
19807+ /* if other thread managed to acquire a reference to this jnode, don't
19808+ * free it. */
19809+ if (atomic_read(&node->x_count) > 0)
19810+ return 1;
19811+ /* also, don't free znode that has children in memory */
19812+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19813+ return 1;
19814+ return 0;
19815+}
19816+
19817+/*
19818+ * this is called as part of removing jnode. Based on jnode type, call
19819+ * corresponding function that removes jnode from indices and returns it back
19820+ * to the appropriate slab (through RCU).
19821+ */
19822+static inline void
19823+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19824+{
19825+ switch (jtype) {
19826+ case JNODE_UNFORMATTED_BLOCK:
19827+ remove_jnode(node, tree);
19828+ break;
19829+ case JNODE_IO_HEAD:
19830+ case JNODE_BITMAP:
19831+ break;
19832+ case JNODE_INODE:
19833+ break;
19834+ case JNODE_FORMATTED_BLOCK:
19835+ remove_znode(node, tree);
19836+ break;
19837+ default:
19838+ wrong_return_value("nikita-3196", "Wrong jnode type");
19839+ }
19840+}
19841+
19842+/*
19843+ * this is called as part of deleting jnode. Based on jnode type, call
19844+ * corresponding function that removes jnode from indices and returns it back
19845+ * to the appropriate slab (through RCU).
19846+ *
19847+ * This differs from jnode_remove() only for formatted nodes---for them
19848+ * sibling list handling is different for removal and deletion.
19849+ */
19850+static inline void
19851+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
19852+{
19853+ switch (jtype) {
19854+ case JNODE_UNFORMATTED_BLOCK:
19855+ remove_jnode(node, tree);
19856+ break;
19857+ case JNODE_IO_HEAD:
19858+ case JNODE_BITMAP:
19859+ break;
19860+ case JNODE_FORMATTED_BLOCK:
19861+ delete_znode(node, tree);
19862+ break;
19863+ case JNODE_INODE:
19864+ default:
19865+ wrong_return_value("nikita-3195", "Wrong jnode type");
19866+ }
19867+}
19868+
19869+#if REISER4_DEBUG
19870+/*
19871+ * remove jnode from the debugging list of all jnodes hanging off super-block.
19872+ */
19873+void jnode_list_remove(jnode * node)
19874+{
19875+ reiser4_super_info_data *sbinfo;
19876+
19877+ sbinfo = get_super_private(jnode_get_tree(node)->super);
19878+
19879+ spin_lock_irq(&sbinfo->all_guard);
19880+ assert("nikita-2422", !list_empty(&node->jnodes));
19881+ list_del_init(&node->jnodes);
19882+ spin_unlock_irq(&sbinfo->all_guard);
19883+}
19884+#endif
19885+
19886+/*
19887+ * this is called by jput_final() to remove jnode when last reference to it is
19888+ * released.
19889+ */
19890+static int jnode_try_drop(jnode * node)
19891+{
19892+ int result;
19893+ reiser4_tree *tree;
19894+ jnode_type jtype;
19895+
19896+ assert("nikita-2491", node != NULL);
19897+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19898+
19899+ tree = jnode_get_tree(node);
19900+ jtype = jnode_get_type(node);
19901+
19902+ spin_lock_jnode(node);
19903+ write_lock_tree(tree);
19904+ /*
19905+ * if jnode has a page---leave it alone. Memory pressure will
19906+ * eventually kill page and jnode.
19907+ */
19908+ if (jnode_page(node) != NULL) {
19909+ write_unlock_tree(tree);
19910+ spin_unlock_jnode(node);
19911+ JF_CLR(node, JNODE_RIP);
19912+ return RETERR(-EBUSY);
19913+ }
19914+
19915+ /* re-check ->x_count under tree lock. */
19916+ result = jnode_is_busy(node, jtype);
19917+ if (result == 0) {
19918+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19919+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19920+
19921+ spin_unlock_jnode(node);
19922+ /* no page and no references---despatch him. */
19923+ jnode_remove(node, jtype, tree);
19924+ write_unlock_tree(tree);
19925+ jnode_free(node, jtype);
19926+ } else {
19927+ /* busy check failed: reference was acquired by concurrent
19928+ * thread. */
19929+ write_unlock_tree(tree);
19930+ spin_unlock_jnode(node);
19931+ JF_CLR(node, JNODE_RIP);
19932+ }
19933+ return result;
19934+}
19935+
19936+/* jdelete() -- Delete jnode from the tree and file system */
19937+static int jdelete(jnode * node /* jnode to finish with */ )
19938+{
19939+ struct page *page;
19940+ int result;
19941+ reiser4_tree *tree;
19942+ jnode_type jtype;
19943+
19944+ assert("nikita-467", node != NULL);
19945+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
19946+
19947+ jtype = jnode_get_type(node);
19948+
19949+ page = jnode_lock_page(node);
19950+ assert_spin_locked(&(node->guard));
19951+
19952+ tree = jnode_get_tree(node);
19953+
19954+ write_lock_tree(tree);
19955+ /* re-check ->x_count under tree lock. */
19956+ result = jnode_is_busy(node, jtype);
19957+ if (likely(!result)) {
19958+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19959+ assert("jmacd-511", atomic_read(&node->d_count) == 0);
19960+
19961+ /* detach page */
19962+ if (page != NULL) {
19963+ /*
19964+ * FIXME this is racy against jnode_extent_write().
19965+ */
19966+ page_clear_jnode(page, node);
19967+ }
19968+ spin_unlock_jnode(node);
19969+ /* goodbye */
19970+ jnode_delete(node, jtype, tree);
19971+ write_unlock_tree(tree);
19972+ jnode_free(node, jtype);
19973+ /* @node is no longer valid pointer */
19974+ if (page != NULL)
19975+			reiser4_drop_page(page);
19976+ } else {
19977+ /* busy check failed: reference was acquired by concurrent
19978+ * thread. */
19979+ JF_CLR(node, JNODE_RIP);
19980+ write_unlock_tree(tree);
19981+ spin_unlock_jnode(node);
19982+ if (page != NULL)
19983+ unlock_page(page);
19984+ }
19985+ return result;
19986+}
19987+
19988+/* drop jnode on the floor.
19989+
19990+ Return value:
19991+
19992+ -EBUSY: failed to drop jnode, because there are still references to it
19993+
19994+ 0: successfully dropped jnode
19995+
19996+*/
19997+static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
19998+{
19999+ struct page *page;
20000+ jnode_type jtype;
20001+ int result;
20002+
20003+ assert("zam-602", node != NULL);
20004+ assert_rw_not_read_locked(&(tree->tree_lock));
20005+ assert_rw_not_write_locked(&(tree->tree_lock));
20006+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20007+
20008+ jtype = jnode_get_type(node);
20009+
20010+ page = jnode_lock_page(node);
20011+ assert_spin_locked(&(node->guard));
20012+
20013+ write_lock_tree(tree);
20014+
20015+ /* re-check ->x_count under tree lock. */
20016+ result = jnode_is_busy(node, jtype);
20017+ if (!result) {
20018+ assert("nikita-2488", page == jnode_page(node));
20019+ assert("nikita-2533", atomic_read(&node->d_count) == 0);
20020+ if (page != NULL) {
20021+ assert("nikita-2126", !PageDirty(page));
20022+ assert("nikita-2127", PageUptodate(page));
20023+ assert("nikita-2181", PageLocked(page));
20024+ page_clear_jnode(page, node);
20025+ }
20026+ spin_unlock_jnode(node);
20027+ jnode_remove(node, jtype, tree);
20028+ write_unlock_tree(tree);
20029+ jnode_free(node, jtype);
20030+ if (page != NULL) {
20031+			reiser4_drop_page(page);
20032+ }
20033+ } else {
20034+ /* busy check failed: reference was acquired by concurrent
20035+ * thread. */
20036+ JF_CLR(node, JNODE_RIP);
20037+ write_unlock_tree(tree);
20038+ spin_unlock_jnode(node);
20039+ if (page != NULL)
20040+ unlock_page(page);
20041+ }
20042+ return result;
20043+}
20044+
20045+/* This function frees jnode "if possible". In particular, [dcx]_count has to
20046+ be 0 (where applicable). */
20047+void jdrop(jnode * node)
20048+{
20049+ jdrop_in_tree(node, jnode_get_tree(node));
20050+}
20051+
20052+/* IO head jnode implementation. IO heads are simple jnodes with limited
20053+   functionality (they are not in any hash table), used just for reading
20054+   from and writing to disk. */
20055+
20056+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20057+{
20058+ jnode *jal = jalloc();
20059+
20060+ if (jal != NULL) {
20061+ jnode_init(jal, current_tree, JNODE_IO_HEAD);
20062+ jnode_set_block(jal, block);
20063+ }
20064+
20065+ jref(jal);
20066+
20067+ return jal;
20068+}
20069+
20070+void reiser4_drop_io_head(jnode * node)
20071+{
20072+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20073+
20074+ jput(node);
20075+ jdrop(node);
20076+}
20077+
20078+/* pin jnode page in memory to protect jnode data from reiser4_releasepage() */
20079+void pin_jnode_data(jnode * node)
20080+{
20081+ assert("zam-671", jnode_page(node) != NULL);
20082+ page_cache_get(jnode_page(node));
20083+}
20084+
20085+/* make jnode data free-able again */
20086+void unpin_jnode_data(jnode * node)
20087+{
20088+ assert("zam-672", jnode_page(node) != NULL);
20089+ page_cache_release(jnode_page(node));
20090+}
20091+
20092+struct address_space *jnode_get_mapping(const jnode * node)
20093+{
20094+ assert("nikita-3162", node != NULL);
20095+ return jnode_ops(node)->mapping(node);
20096+}
20097+
20098+#if REISER4_DEBUG
20099+/* debugging aid: jnode invariant */
20100+int jnode_invariant_f(const jnode * node, char const **msg)
20101+{
20102+#define _ergo(ant, con) \
20103+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20104+#define _check(exp) ((*msg) = #exp, (exp))
20105+
20106+ return _check(node != NULL) &&
20107+ /* [jnode-queued] */
20108+ /* only relocated node can be queued, except that when znode
20109+ * is being deleted, its JNODE_RELOC bit is cleared */
20110+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20111+ JF_ISSET(node, JNODE_RELOC) ||
20112+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20113+ _check(node->jnodes.prev != NULL) &&
20114+ _check(node->jnodes.next != NULL) &&
20115+ /* [jnode-dirty] invariant */
20116+	       /* dirty jnode is part of an atom */
20117+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20118+ /* [jnode-oid] invariant */
20119+ /* for unformatted node ->objectid and ->mapping fields are
20120+ * consistent */
20121+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20122+ node->key.j.objectid ==
20123+ get_inode_oid(node->key.j.mapping->host)) &&
20124+ /* [jnode-atom-valid] invariant */
20125+ /* node atom has valid state */
20126+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20127+ /* [jnode-page-binding] invariant */
20128+ /* if node points to page, it points back to node */
20129+ _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20130+ /* [jnode-refs] invariant */
20131+ /* only referenced jnode can be loaded */
20132+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20133+
20134+}
20135+
20136+static const char *jnode_type_name(jnode_type type)
20137+{
20138+ switch (type) {
20139+ case JNODE_UNFORMATTED_BLOCK:
20140+ return "unformatted";
20141+ case JNODE_FORMATTED_BLOCK:
20142+ return "formatted";
20143+ case JNODE_BITMAP:
20144+ return "bitmap";
20145+ case JNODE_IO_HEAD:
20146+ return "io head";
20147+ case JNODE_INODE:
20148+ return "inode";
20149+ case LAST_JNODE_TYPE:
20150+ return "last";
20151+ default:{
20152+ static char unknown[30];
20153+
20154+ sprintf(unknown, "unknown %i", type);
20155+ return unknown;
20156+ }
20157+ }
20158+}
20159+
20160+#define jnode_state_name( node, flag ) \
20161+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20162+
20163+/* debugging aid: output human readable information about @node */
20164+static void info_jnode(const char *prefix /* prefix to print */ ,
20165+ const jnode * node /* node to print */ )
20166+{
20167+ assert("umka-068", prefix != NULL);
20168+
20169+ if (node == NULL) {
20170+ printk("%s: null\n", prefix);
20171+ return;
20172+ }
20173+
20174+ printk
20175+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20176+ " block: %s, d_count: %d, x_count: %d, "
20177+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20178+	 node->state,
20179+ jnode_state_name(node, JNODE_PARSED),
20180+ jnode_state_name(node, JNODE_HEARD_BANSHEE),
20181+ jnode_state_name(node, JNODE_LEFT_CONNECTED),
20182+ jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20183+ jnode_state_name(node, JNODE_ORPHAN),
20184+ jnode_state_name(node, JNODE_CREATED),
20185+ jnode_state_name(node, JNODE_RELOC),
20186+ jnode_state_name(node, JNODE_OVRWR),
20187+ jnode_state_name(node, JNODE_DIRTY),
20188+ jnode_state_name(node, JNODE_IS_DYING),
20189+ jnode_state_name(node, JNODE_RIP),
20190+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20191+ jnode_state_name(node, JNODE_WRITEBACK),
20192+ jnode_state_name(node, JNODE_NEW),
20193+ jnode_state_name(node, JNODE_DKSET),
20194+ jnode_state_name(node, JNODE_REPACK),
20195+ jnode_state_name(node, JNODE_CLUSTER_PAGE),
20196+ jnode_get_level(node), sprint_address(jnode_get_block(node)),
20197+ atomic_read(&node->d_count), atomic_read(&node->x_count),
20198+ jnode_page(node), node->atom, 0, 0,
20199+ jnode_type_name(jnode_get_type(node)));
20200+ if (jnode_is_unformatted(node)) {
20201+ printk("inode: %llu, index: %lu, ",
20202+ node->key.j.objectid, node->key.j.index);
20203+ }
20204+}
20205+
20206+/* debugging aid: check znode invariant and panic if it doesn't hold */
20207+static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20208+{
20209+ char const *failed_msg;
20210+ int result;
20211+ reiser4_tree *tree;
20212+
20213+ tree = jnode_get_tree(node);
20214+
20215+ assert("umka-063312", node != NULL);
20216+ assert("umka-064321", tree != NULL);
20217+
20218+ if (!jlocked && !tlocked)
20219+ spin_lock_jnode((jnode *) node);
20220+ if (!tlocked)
20221+ read_lock_tree(jnode_get_tree(node));
20222+ result = jnode_invariant_f(node, &failed_msg);
20223+ if (!result) {
20224+ info_jnode("corrupted node", node);
20225+ warning("jmacd-555", "Condition %s failed", failed_msg);
20226+ }
20227+ if (!tlocked)
20228+ read_unlock_tree(jnode_get_tree(node));
20229+ if (!jlocked && !tlocked)
20230+ spin_unlock_jnode((jnode *) node);
20231+ return result;
20232+}
20233+
20234+#endif /* REISER4_DEBUG */
20235+
20236+/* Make Linus happy.
20237+ Local variables:
20238+ c-indentation-style: "K&R"
20239+ mode-name: "LC"
20240+ c-basic-offset: 8
20241+ tab-width: 8
20242+ fill-column: 80
20243+ End:
20244+*/
20245diff -urN linux-2.6.22.orig/fs/reiser4/jnode.h linux-2.6.22/fs/reiser4/jnode.h
20246--- linux-2.6.22.orig/fs/reiser4/jnode.h 1970-01-01 03:00:00.000000000 +0300
20247+++ linux-2.6.22/fs/reiser4/jnode.h 2007-07-29 00:25:34.876696477 +0400
20248@@ -0,0 +1,702 @@
20249+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20250+ * reiser4/README */
20251+
20252+/* Declaration of jnode. See jnode.c for details. */
20253+
20254+#ifndef __JNODE_H__
20255+#define __JNODE_H__
20256+
20257+#include "forward.h"
20258+#include "type_safe_hash.h"
20259+#include "txnmgr.h"
20260+#include "key.h"
20261+#include "debug.h"
20262+#include "dformat.h"
20263+#include "page_cache.h"
20264+#include "context.h"
20265+
20266+#include "plugin/plugin.h"
20267+
20268+#include <linux/fs.h>
20269+#include <linux/mm.h>
20270+#include <linux/spinlock.h>
20271+#include <asm/atomic.h>
20272+#include <asm/bitops.h>
20273+#include <linux/list.h>
20274+#include <linux/rcupdate.h>
20275+
20276+/* declare hash table of jnodes (jnodes proper, that is, unformatted
20277+ nodes) */
20278+TYPE_SAFE_HASH_DECLARE(j, jnode);
20279+
20280+/* declare hash table of znodes */
20281+TYPE_SAFE_HASH_DECLARE(z, znode);
20282+
20283+struct jnode_key {
20284+ __u64 objectid;
20285+ unsigned long index;
20286+ struct address_space *mapping;
20287+};
20288+
20289+/*
20290+   Jnode is the "base class" of other nodes in reiser4. It also happens to
20291+ be exactly the node we use for unformatted tree nodes.
20292+
20293+ Jnode provides following basic functionality:
20294+
20295+ . reference counting and indexing.
20296+
20297+ . integration with page cache. Jnode has ->pg reference to which page can
20298+ be attached.
20299+
20300+ . interface to transaction manager. It is jnode that is kept in transaction
20301+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20302+ means, there should be special type of jnode for inode.)
20303+
20304+ Locking:
20305+
20306+ Spin lock: the following fields are protected by the per-jnode spin lock:
20307+
20308+ ->state
20309+ ->atom
20310+ ->capture_link
20311+
20312+ Following fields are protected by the global tree lock:
20313+
20314+ ->link
20315+ ->key.z (content of ->key.z is only changed in znode_rehash())
20316+ ->key.j
20317+
20318+ Atomic counters
20319+
20320+ ->x_count
20321+ ->d_count
20322+
20323+ ->pg, and ->data are protected by spin lock for unused jnode and are
20324+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20325+ is false).
20326+
20327+ ->tree is immutable after creation
20328+
20329+ Unclear
20330+
20331+ ->blocknr: should be under jnode spin-lock, but current interface is based
20332+ on passing of block address.
20333+
20334+ If you ever need to spin lock two nodes at once, do this in "natural"
20335+ memory order: lock znode with lower address first. (See lock_two_nodes().)
20336+
20337+ Invariants involving this data-type:
20338+
20339+ [jnode-dirty]
20340+ [jnode-refs]
20341+ [jnode-oid]
20342+ [jnode-queued]
20343+ [jnode-atom-valid]
20344+ [jnode-page-binding]
20345+*/
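For illustration, the "natural memory order" rule stated above, as a hedged sketch of what lock_two_nodes() presumably does (the real helper may differ; spin_lock_jnode() is defined later in this header). The sketch assumes the two nodes are distinct:

	static void lock_pair_sketch(jnode * a, jnode * b)
	{
		/* ascending-address order: two threads locking the same
		 * pair always acquire in the same order and cannot deadlock */
		if (a < b) {
			spin_lock_jnode(a);
			spin_lock_jnode(b);
		} else {
			spin_lock_jnode(b);
			spin_lock_jnode(a);
		}
	}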
20346+
20347+struct jnode {
20348+#if REISER4_DEBUG
20349+#define JMAGIC 0x52654973 /* "ReIs" */
20350+ int magic;
20351+#endif
20352+ /* FIRST CACHE LINE (16 bytes): data used by jload */
20353+
20354+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20355+ /* 0 */ unsigned long state;
20356+
20357+ /* lock, protecting jnode's fields. */
20358+ /* 4 */ spinlock_t load;
20359+
20360+ /* counter of references to jnode itself. Increased on jref().
20361+ Decreased on jput().
20362+ */
20363+ /* 8 */ atomic_t x_count;
20364+
20365+ /* counter of references to jnode's data. Pin data page(s) in
20366+ memory while this is greater than 0. Increased on jload().
20367+ Decreased on jrelse().
20368+ */
20369+ /* 12 */ atomic_t d_count;
20370+
20371+ /* SECOND CACHE LINE: data used by hash table lookups */
20372+
20373+ /* 16 */ union {
20374+ /* znodes are hashed by block number */
20375+ reiser4_block_nr z;
20376+ /* unformatted nodes are hashed by mapping plus offset */
20377+		struct jnode_key j;
20378+ } key;
20379+
20380+ /* THIRD CACHE LINE */
20381+
20382+ /* 32 */ union {
20383+ /* pointers to maintain hash-table */
20384+ z_hash_link z;
20385+ j_hash_link j;
20386+ } link;
20387+
20388+ /* pointer to jnode page. */
20389+ /* 36 */ struct page *pg;
20390+ /* pointer to node itself. This is page_address(node->pg) when page is
20391+ attached to the jnode
20392+ */
20393+ /* 40 */ void *data;
20394+
20395+ /* 44 */ reiser4_tree *tree;
20396+
20397+ /* FOURTH CACHE LINE: atom related fields */
20398+
20399+ /* 48 */ spinlock_t guard;
20400+
20401+ /* atom the block is in, if any */
20402+ /* 52 */ txn_atom *atom;
20403+
20404+ /* capture list */
20405+ /* 56 */ struct list_head capture_link;
20406+
20407+ /* FIFTH CACHE LINE */
20408+
20409+ /* 64 */ struct rcu_head rcu;
20410+ /* crosses cache line */
20411+
20412+ /* SIXTH CACHE LINE */
20413+
20414+ /* the real blocknr (where io is going to/from) */
20415+ /* 80 */ reiser4_block_nr blocknr;
20416+	/* Parent item type; unformatted and CRC nodes need it for offset => key conversion. */
20417+ /* NOTE: this parent_item_id looks like jnode type. */
20418+ /* 88 */ reiser4_plugin_id parent_item_id;
20419+ /* 92 */
20420+#if REISER4_DEBUG
20421+ /* list of all jnodes for debugging purposes. */
20422+ struct list_head jnodes;
20423+ /* how many times this jnode was written in one transaction */
20424+ int written;
20425+ /* this indicates which atom's list the jnode is on */
20426+ atom_list list;
20427+#endif
20428+} __attribute__ ((aligned(16)));
20429+
20430+/*
20431+ * jnode types. Enumeration of existing jnode types.
20432+ */
20433+typedef enum {
20434+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20435+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20436+ JNODE_BITMAP, /* bitmap */
20437+ JNODE_IO_HEAD, /* jnode representing a block in the
20438+ * wandering log */
20439+ JNODE_INODE, /* jnode embedded into inode */
20440+ LAST_JNODE_TYPE
20441+} jnode_type;
20442+
20443+/* jnode states */
20444+typedef enum {
20445+ /* jnode's page is loaded and data checked */
20446+ JNODE_PARSED = 0,
20447+ /* node was deleted, not all locks on it were released. This
20448+ node is empty and is going to be removed from the tree
20449+ shortly. */
20450+ JNODE_HEARD_BANSHEE = 1,
20451+ /* left sibling pointer is valid */
20452+ JNODE_LEFT_CONNECTED = 2,
20453+ /* right sibling pointer is valid */
20454+ JNODE_RIGHT_CONNECTED = 3,
20455+
20456+ /* znode was just created and doesn't yet have a pointer from
20457+ its parent */
20458+ JNODE_ORPHAN = 4,
20459+
20460+ /* this node was created by its transaction and has not been assigned
20461+ a block address. */
20462+ JNODE_CREATED = 5,
20463+
20464+ /* this node is currently relocated */
20465+ JNODE_RELOC = 6,
20466+ /* this node is currently wandered */
20467+ JNODE_OVRWR = 7,
20468+
20469+ /* this znode has been modified */
20470+ JNODE_DIRTY = 8,
20471+
20472+ /* znode lock is being invalidated */
20473+ JNODE_IS_DYING = 9,
20474+
20475+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20476+
20477+ /* jnode is queued for flushing. */
20478+ JNODE_FLUSH_QUEUED = 12,
20479+
20480+ /* In the following bits jnode type is encoded. */
20481+ JNODE_TYPE_1 = 13,
20482+ JNODE_TYPE_2 = 14,
20483+ JNODE_TYPE_3 = 15,
20484+
20485+ /* jnode is being destroyed */
20486+ JNODE_RIP = 16,
20487+
20488+ /* znode was not captured during locking (it might so be because
20489+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20490+ JNODE_MISSED_IN_CAPTURE = 17,
20491+
20492+ /* write is in progress */
20493+ JNODE_WRITEBACK = 18,
20494+
20495+ /* FIXME: now it is used by crypto-compress plugin only */
20496+ JNODE_NEW = 19,
20497+
20498+ /* delimiting keys are already set for this znode. */
20499+ JNODE_DKSET = 20,
20500+
20501+ /* when this bit is set page and jnode can not be disconnected */
20502+ JNODE_WRITE_PREPARED = 21,
20503+
20504+ JNODE_CLUSTER_PAGE = 22,
20505+ /* Jnode is marked for repacking, that means the reiser4 flush and the
20506+ * block allocator should process this node special way */
20507+ JNODE_REPACK = 23,
20508+ /* node should be converted by flush in squalloc phase */
20509+ JNODE_CONVERTIBLE = 24,
20510+ /*
20511+ * When jnode is dirtied for the first time in given transaction,
20512+	 * do_jnode_make_dirty() checks whether this jnode can possibly become
20513+	 * a member of the overwrite set. If so, this bit is set, and one block is
20514+ * reserved in the ->flush_reserved space of atom.
20515+ *
20516+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20517+ *
20518+ * (1) flush decides that we want this block to go into relocate
20519+ * set after all.
20520+ *
20521+ * (2) wandering log is allocated (by log writer)
20522+ *
20523+ * (3) extent is allocated
20524+ *
20525+ */
20526+ JNODE_FLUSH_RESERVED = 29
20527+} reiser4_jnode_state;
20528+
20529+/* Macros for accessing the jnode state. */
20530+
20531+static inline void JF_CLR(jnode * j, int f)
20532+{
20533+ assert("unknown-1", j->magic == JMAGIC);
20534+ clear_bit(f, &j->state);
20535+}
20536+static inline int JF_ISSET(const jnode * j, int f)
20537+{
20538+ assert("unknown-2", j->magic == JMAGIC);
20539+ return test_bit(f, &((jnode *) j)->state);
20540+}
20541+static inline void JF_SET(jnode * j, int f)
20542+{
20543+ assert("unknown-3", j->magic == JMAGIC);
20544+ set_bit(f, &j->state);
20545+}
20546+
20547+static inline int JF_TEST_AND_SET(jnode * j, int f)
20548+{
20549+ assert("unknown-4", j->magic == JMAGIC);
20550+ return test_and_set_bit(f, &j->state);
20551+}
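For illustration, the claim-once idiom JF_TEST_AND_SET() enables, as used by jput_final() in jnode.c above (a minimal sketch):

	static void destroy_once_sketch(jnode * node)
	{
		/* exactly one thread sees the 0 -> 1 transition and becomes
		 * the sole owner of destruction; all others back off */
		if (!JF_TEST_AND_SET(node, JNODE_RIP)) {
			/* ... tear the node down, as jput_final() does ... */
		}
	}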
20552+
20553+static inline void spin_lock_jnode(jnode *node)
20554+{
20555+ /* check that spinlocks of lower priorities are not held */
20556+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20557+ LOCK_CNT_NIL(spin_locked_txnh) &&
20558+ LOCK_CNT_NIL(spin_locked_zlock) &&
20559+ LOCK_CNT_NIL(rw_locked_dk) &&
20560+ LOCK_CNT_LT(spin_locked_jnode, 2)));
20561+
20562+ spin_lock(&(node->guard));
20563+
20564+ LOCK_CNT_INC(spin_locked_jnode);
20565+ LOCK_CNT_INC(spin_locked);
20566+}
20567+
20568+static inline void spin_unlock_jnode(jnode *node)
20569+{
20570+ assert_spin_locked(&(node->guard));
20571+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20572+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20573+
20574+ LOCK_CNT_DEC(spin_locked_jnode);
20575+ LOCK_CNT_DEC(spin_locked);
20576+
20577+ spin_unlock(&(node->guard));
20578+}
20579+
20580+static inline int jnode_is_in_deleteset(const jnode * node)
20581+{
20582+ return JF_ISSET(node, JNODE_RELOC);
20583+}
20584+
20585+extern int init_jnodes(void);
20586+extern void done_jnodes(void);
20587+
20588+/* Jnode routines */
20589+extern jnode *jalloc(void);
20590+extern void jfree(jnode * node) NONNULL;
20591+extern jnode *jclone(jnode *);
20592+extern jnode *jlookup(reiser4_tree * tree,
20593+ oid_t objectid, unsigned long ind) NONNULL;
20594+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20595+extern jnode *jnode_by_page(struct page *pg) NONNULL;
20596+extern jnode *jnode_of_page(struct page *pg) NONNULL;
20597+void jnode_attach_page(jnode * node, struct page *pg);
20598+
20599+void unhash_unformatted_jnode(jnode *);
20600+extern jnode *page_next_jnode(jnode * node) NONNULL;
20601+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20602+extern void jnode_make_dirty(jnode * node) NONNULL;
20603+extern void jnode_make_clean(jnode * node) NONNULL;
20604+extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20605+extern void jnode_make_wander(jnode *) NONNULL;
20606+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20607+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20608+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20609+
20610+/**
20611+ * jnode_get_block
20612+ * @node: jnode to query
20613+ *
20614+ */
20615+static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20616+{
20617+ assert("nikita-528", node != NULL);
20618+
20619+ return &node->blocknr;
20620+}
20621+
20622+/**
20623+ * jnode_set_block
20624+ * @node: jnode to update
20625+ * @blocknr: new block nr
20626+ */
20627+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20628+{
20629+ assert("nikita-2020", node != NULL);
20630+ assert("umka-055", blocknr != NULL);
20631+ node->blocknr = *blocknr;
20632+}
20633+
20634+
20635+/* block number for IO. Usually this is the same as jnode_get_block(), unless
20636+ * jnode was emergency flushed---then block number chosen by eflush is
20637+ * used. */
20638+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20639+{
20640+ assert("nikita-2768", node != NULL);
20641+ assert_spin_locked(&(node->guard));
20642+
20643+ return jnode_get_block(node);
20644+}
20645+
20646+/* Jnode flush interface. */
20647+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20648+extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20649+
20650+/* FIXME-VS: these are used in plugin/item/extent.c */
20651+
20652+/* does extent_get_block have to be called */
20653+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
20654+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20655+
20656+/* the node should be converted during flush squalloc phase */
20657+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
20658+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
20659+
20660+/* Macros to convert from jnode to znode, znode to jnode. These are macros
20661+ because C doesn't allow overloading of const prototypes. */
20662+#define ZJNODE(x) (& (x) -> zjnode)
20663+#define JZNODE(x) \
20664+({ \
20665+ typeof (x) __tmp_x; \
20666+ \
20667+ __tmp_x = (x); \
20668+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
20669+ (znode*) __tmp_x; \
20670+})
20671+
20672+extern int jnodes_tree_init(reiser4_tree * tree);
20673+extern int jnodes_tree_done(reiser4_tree * tree);
20674+
20675+#if REISER4_DEBUG
20676+
20677+extern int znode_is_any_locked(const znode * node);
20678+extern void jnode_list_remove(jnode * node);
20679+
20680+#else
20681+
20682+#define jnode_list_remove(node) noop
20683+
20684+#endif
20685+
20686+int znode_is_root(const znode * node) NONNULL;
20687+
20688+/* bump reference counter on @node */
20689+static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20690+{
20691+ assert("nikita-1911", node != NULL);
20692+
20693+ atomic_inc(&node->x_count);
20694+ LOCK_CNT_INC(x_refs);
20695+}
20696+
20697+static inline void dec_x_ref(jnode * node)
20698+{
20699+ assert("nikita-3215", node != NULL);
20700+ assert("nikita-3216", atomic_read(&node->x_count) > 0);
20701+
20702+ atomic_dec(&node->x_count);
20703+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20704+ LOCK_CNT_DEC(x_refs);
20705+}
20706+
20707+/* jref() - increase counter of references to jnode/znode (x_count) */
20708+static inline jnode *jref(jnode * node)
20709+{
20710+ assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20711+ add_x_ref(node);
20712+ return node;
20713+}
20714+
20715+/* get the page of jnode */
20716+static inline struct page *jnode_page(const jnode * node)
20717+{
20718+ return node->pg;
20719+}
20720+
20721+/* return pointer to jnode data */
20722+static inline char *jdata(const jnode * node)
20723+{
20724+ assert("nikita-1415", node != NULL);
20725+ assert("nikita-3198", jnode_page(node) != NULL);
20726+ return node->data;
20727+}
20728+
20729+static inline int jnode_is_loaded(const jnode * node)
20730+{
20731+ assert("zam-506", node != NULL);
20732+ return atomic_read(&node->d_count) > 0;
20733+}
20734+
20735+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20736+
20737+static inline void jnode_set_reloc(jnode * node)
20738+{
20739+ assert("nikita-2431", node != NULL);
20740+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20741+ JF_SET(node, JNODE_RELOC);
20742+}
20743+
20744+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20745+
20746+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20747+
20748+static inline int jload(jnode *node)
20749+{
20750+	return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20751+}
20752+
20753+extern int jinit_new(jnode *, gfp_t) NONNULL;
20754+extern int jstartio(jnode *) NONNULL;
20755+
20756+extern void jdrop(jnode *) NONNULL;
20757+extern int jwait_io(jnode *, int rw) NONNULL;
20758+
20759+void jload_prefetch(jnode *);
20760+
20761+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20762+extern void reiser4_drop_io_head(jnode * node) NONNULL;
20763+
20764+static inline reiser4_tree *jnode_get_tree(const jnode * node)
20765+{
20766+ assert("nikita-2691", node != NULL);
20767+ return node->tree;
20768+}
20769+
20770+extern void pin_jnode_data(jnode *);
20771+extern void unpin_jnode_data(jnode *);
20772+
20773+static inline jnode_type jnode_get_type(const jnode * node)
20774+{
20775+ static const unsigned long state_mask =
20776+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20777+
20778+ static jnode_type mask_to_type[] = {
20779+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20780+
20781+ /* 000 */
20782+ [0] = JNODE_FORMATTED_BLOCK,
20783+ /* 001 */
20784+ [1] = JNODE_UNFORMATTED_BLOCK,
20785+ /* 010 */
20786+ [2] = JNODE_BITMAP,
20787+ /* 011 */
20788+ [3] = LAST_JNODE_TYPE, /*invalid */
20789+ /* 100 */
20790+ [4] = JNODE_INODE,
20791+ /* 101 */
20792+ [5] = LAST_JNODE_TYPE,
20793+ /* 110 */
20794+ [6] = JNODE_IO_HEAD,
20795+ /* 111 */
20796+ [7] = LAST_JNODE_TYPE, /* invalid */
20797+ };
20798+
20799+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20800+}
20801+
20802+/* returns true if node is a znode */
20803+static inline int jnode_is_znode(const jnode * node)
20804+{
20805+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20806+}
20807+
20808+static inline int jnode_is_flushprepped(jnode * node)
20809+{
20810+ assert("jmacd-78212", node != NULL);
20811+ assert_spin_locked(&(node->guard));
20812+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20813+ JF_ISSET(node, JNODE_OVRWR);
20814+}
20815+
20816+/* Return true if @node has already been processed by the squeeze and allocate
20817+ process. This implies the block address has been finalized for the
20818+ duration of this atom (or it is clean and will remain in place). If this
20819+ returns true you may use the block number as a hint. */
20820+static inline int jnode_check_flushprepped(jnode * node)
20821+{
20822+ int result;
20823+
20824+ /* It must be clean or relocated or wandered. New allocations are set to relocate. */
20825+ spin_lock_jnode(node);
20826+ result = jnode_is_flushprepped(node);
20827+ spin_unlock_jnode(node);
20828+ return result;
20829+}
20830+
20831+/* returns true if node is unformatted */
20832+static inline int jnode_is_unformatted(const jnode * node)
20833+{
20834+ assert("jmacd-0123", node != NULL);
20835+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20836+}
20837+
20838+/* returns true if node represents a cluster cache page */
20839+static inline int jnode_is_cluster_page(const jnode * node)
20840+{
20841+ assert("edward-50", node != NULL);
20842+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20843+}
20844+
20845+/* returns true if node is an inode's builtin jnode */
20846+static inline int jnode_is_inode(const jnode * node)
20847+{
20848+ assert("vs-1240", node != NULL);
20849+ return jnode_get_type(node) == JNODE_INODE;
20850+}
20851+
20852+static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20853+{
20854+ assert("nikita-2367", type < LAST_JNODE_TYPE);
20855+ return jnode_plugin_by_id((reiser4_plugin_id) type);
20856+}
20857+
20858+static inline jnode_plugin *jnode_ops(const jnode * node)
20859+{
20860+ assert("nikita-2366", node != NULL);
20861+
20862+ return jnode_ops_of(jnode_get_type(node));
20863+}
20864+
20865+/* Get the index of a block. */
20866+static inline unsigned long jnode_get_index(jnode * node)
20867+{
20868+ return jnode_ops(node)->index(node);
20869+}
20870+
20871+/* return true if "node" is the root */
20872+static inline int jnode_is_root(const jnode * node)
20873+{
20874+ return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20875+}
20876+
20877+extern struct address_space *mapping_jnode(const jnode * node);
20878+extern unsigned long index_jnode(const jnode * node);
20879+
20880+static inline void jput(jnode * node);
20881+extern void jput_final(jnode * node);
20882+
20883+/* bump data counter on @node */
20884+static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20885+{
20886+ assert("nikita-1962", node != NULL);
20887+
20888+ atomic_inc(&node->d_count);
20889+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
20890+ LOCK_CNT_INC(d_refs);
20891+}
20892+
20893+/* jput() - decrement the x_count reference counter on a jnode/znode.
20894+
20895+   The count may drop to 0; the jnode then stays in cache until memory
20896+   pressure causes the eviction of its page. The c_count variable also
20897+   ensures that children are pressured out of memory before the parent.
20898+   The jnode remains hashed as long as the VM allows its page to stay in memory.
20899+*/
20900+static inline void jput(jnode * node)
20901+{
20902+ assert("jmacd-509", node != NULL);
20903+ assert("jmacd-510", atomic_read(&node->x_count) > 0);
20904+	assert("zam-926", reiser4_schedulable());
20905+ LOCK_CNT_DEC(x_refs);
20906+
20907+ rcu_read_lock();
20908+ /*
20909+ * we don't need any kind of lock here--jput_final() uses RCU.
20910+ */
20911+ if (unlikely(atomic_dec_and_test(&node->x_count))) {
20912+ jput_final(node);
20913+ } else
20914+ rcu_read_unlock();
20915+	assert("nikita-3473", reiser4_schedulable());
20916+}
20917+
20918+extern void jrelse(jnode * node);
20919+extern void jrelse_tail(jnode * node);
20920+
20921+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20922+
20923+/* resolve race with jput */
20924+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20925+{
20926+ if (unlikely(JF_ISSET(node, JNODE_RIP)))
20927+ node = jnode_rip_sync(tree, node);
20928+ return node;
20929+}
20930+
20931+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20932+
20933+#if REISER4_DEBUG
20934+extern int jnode_invariant_f(const jnode *node, char const **msg);
20935+#endif
20936+
20937+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
20938+
20939+/* __JNODE_H__ */
20940+#endif
20941+
20942+/* Make Linus happy.
20943+ Local variables:
20944+ c-indentation-style: "K&R"
20945+ mode-name: "LC"
20946+ c-basic-offset: 8
20947+ tab-width: 8
20948+ fill-column: 120
20949+ End:
20950+*/
20951diff -urN linux-2.6.22.orig/fs/reiser4/kassign.c linux-2.6.22/fs/reiser4/kassign.c
20952--- linux-2.6.22.orig/fs/reiser4/kassign.c 1970-01-01 03:00:00.000000000 +0300
20953+++ linux-2.6.22/fs/reiser4/kassign.c 2007-07-29 00:25:34.880697512 +0400
20954@@ -0,0 +1,661 @@
20955+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20956+ * reiser4/README */
20957+
20958+/* Key assignment policy implementation */
20959+
20960+/*
20961+ * In reiser4 every piece of file system data and meta-data has a key. Keys
20962+ * are used to store information in and retrieve it from reiser4 internal
20963+ * tree. In addition to this, keys define _ordering_ of all file system
20964+ * information: things having close keys are placed into the same or
20965+ * neighboring (in the tree order) nodes of the tree. As our block allocator
20966+ * tries to respect tree order (see flush.c), keys also define order in which
20967+ * things are laid out on the disk, and hence, affect performance directly.
20968+ *
20969+ * Obviously, assignment of keys to data and meta-data should be consistent
20970+ * across whole file system. Algorithm that calculates a key for a given piece
20971+ * of data or meta-data is referred to as "key assignment".
20972+ *
20973+ * Key assignment is too expensive to be implemented as a plugin (that is,
20974+ * with an ability to support different key assignment schemas in the same
20975+ * compiled kernel image). As a compromise, all key-assignment functions and
20976+ * data-structures are collected in this single file, so that modifications to
20977+ * key assignment algorithm can be localized. Additional changes may be
20978+ * required in key.[ch].
20979+ *
20980+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
20981+ * may guess, there is "Plan B" too.
20982+ *
20983+ */
20984+
20985+/*
20986+ * Additional complication with key assignment implementation is a requirement
20987+ * to support different key length.
20988+ */
20989+
20990+/*
20991+ * KEY ASSIGNMENT: PLAN A, LONG KEYS.
20992+ *
20993+ * DIRECTORY ITEMS
20994+ *
20995+ * | 60 | 4 | 7 |1| 56 | 64 | 64 |
20996+ * +--------------+---+---+-+-------------+------------------+-----------------+
20997+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
20998+ * +--------------+---+---+-+-------------+------------------+-----------------+
20999+ * | | | | |
21000+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21001+ *
21002+ * dirid objectid of directory this item is for
21003+ *
21004+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21005+ *
21006+ * H 1 if last 8 bytes of the key contain hash,
21007+ * 0 if last 8 bytes of the key contain prefix-3
21008+ *
21009+ * prefix-1 first 7 characters of file name.
21010+ * Padded by zeroes if name is not long enough.
21011+ *
21012+ * prefix-2 next 8 characters of the file name.
21013+ *
21014+ * prefix-3 next 8 characters of the file name.
21015+ *
21016+ * hash hash of the rest of file name (i.e., portion of file
21017+ * name not included into prefix-1 and prefix-2).
21018+ *
21019+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
21020+ * in the key. Such file names are called "short". They are distinguished by
21021+ * the H bit set to 0 in the key.
21022+ *
21023+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21024+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21025+ * key. Last 8 bytes of the key are occupied by hash of the remaining
21026+ * characters of the name.
21027+ *
21028+ * This key assignment reaches following important goals:
21029+ *
21030+ * (1) directory entries are sorted in approximately lexicographical
21031+ * order.
21032+ *
21033+ * (2) collisions (when multiple directory items have the same key), while
21034+ * principally unavoidable in a tree with fixed length keys, are rare.
21035+ *
21036+ * STAT DATA
21037+ *
21038+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21039+ * +--------------+---+-----------------+---+--------------+-----------------+
21040+ * | locality id | 1 | ordering | 0 | objectid | 0 |
21041+ * +--------------+---+-----------------+---+--------------+-----------------+
21042+ * | | | | |
21043+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21044+ *
21045+ * locality id object id of a directory where first name was created for
21046+ * the object
21047+ *
21048+ * ordering copy of second 8-byte portion of the key of directory
21049+ * entry for the first name of this object. Ordering has a form
21050+ * {
21051+ * fibration :7;
21052+ * h :1;
21053+ * prefix1 :56;
21054+ * }
21055+ * see description of key for directory entry above.
21056+ *
21057+ * objectid object id for this object
21058+ *
21059+ * This key assignment policy is designed to keep stat-data in the same order
21060+ * as corresponding directory items, thus speeding up readdir/stat types of
21061+ * workload.
21062+ *
21063+ * FILE BODY
21064+ *
21065+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21066+ * +--------------+---+-----------------+---+--------------+-----------------+
21067+ * | locality id | 4 | ordering | 0 | objectid | offset |
21068+ * +--------------+---+-----------------+---+--------------+-----------------+
21069+ * | | | | |
21070+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21071+ *
21072+ * locality id object id of a directory where first name was created for
21073+ * the object
21074+ *
21075+ * ordering the same as in the key of stat-data for this object
21076+ *
21077+ * objectid object id for this object
21078+ *
21079+ * offset logical offset from the beginning of this file.
21080+ * Measured in bytes.
21081+ *
21082+ *
21083+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21084+ *
21085+ * DIRECTORY ITEMS
21086+ *
21087+ * | 60 | 4 | 7 |1| 56 | 64 |
21088+ * +--------------+---+---+-+-------------+-----------------+
21089+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21090+ * +--------------+---+---+-+-------------+-----------------+
21091+ * | | | |
21092+ * | 8 bytes | 8 bytes | 8 bytes |
21093+ *
21094+ * dirid objectid of directory this item is for
21095+ *
21096+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21097+ *
21098+ * H 1 if last 8 bytes of the key contain hash,
21099+ * 0 if last 8 bytes of the key contain prefix-2
21100+ *
21101+ * prefix-1 first 7 characters of file name.
21102+ * Padded by zeroes if name is not long enough.
21103+ *
21104+ * prefix-2 next 8 characters of the file name.
21105+ *
21106+ * hash hash of the rest of file name (i.e., portion of file
21107+ * name not included into prefix-1).
21108+ *
21109+ * File names not longer than 15 (== 7 + 8) characters are completely encoded
21110+ * in the key. Such file names are called "short". They are distinguished by
21111+ * the H bit being 0 in the key.
21112+ *
21113+ * Other file names are "long". For a long name, the H bit is 1, and the
21114+ * first 7 characters are encoded in the prefix-1 portion of the key. The
21115+ * last 8 bytes of the key are occupied by a hash of the remaining
21115+ * characters of the name.
21116+ *
21117+ * STAT DATA
21118+ *
21119+ * | 60 | 4 | 4 | 60 | 64 |
21120+ * +--------------+---+---+--------------+-----------------+
21121+ * | locality id | 1 | 0 | objectid | 0 |
21122+ * +--------------+---+---+--------------+-----------------+
21123+ * | | | |
21124+ * | 8 bytes | 8 bytes | 8 bytes |
21125+ *
21126+ * locality id object id of a directory where first name was created for
21127+ * the object
21128+ *
21129+ * objectid object id for this object
21130+ *
21131+ * FILE BODY
21132+ *
21133+ * | 60 | 4 | 4 | 60 | 64 |
21134+ * +--------------+---+---+--------------+-----------------+
21135+ * | locality id | 4 | 0 | objectid | offset |
21136+ * +--------------+---+---+--------------+-----------------+
21137+ * | | | |
21138+ * | 8 bytes | 8 bytes | 8 bytes |
21139+ *
21140+ * locality id object id of a directory where first name was created for
21141+ * the object
21142+ *
21143+ * objectid object id for this object
21144+ *
21145+ * offset logical offset from the beginning of this file.
21146+ * Measured in bytes.
21147+ *
21148+ *
21149+ */
21150+
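
To make the Plan A rules above concrete, here is a small user-space sketch (an illustration only, not part of the patch; split_name and the sample names are made up) that prints which prefix fields of a long-key directory entry a given name would occupy, and when the tail gets hashed instead:

#include <stdio.h>
#include <string.h>

/* mirror of the long-key name thresholds described above */
static void split_name(const char *name)
{
	size_t len = strlen(name);

	printf("%-30s prefix-1=\"%.7s\"", name, name);
	if (len > 7)
		printf(" prefix-2=\"%.8s\"", name + 7);
	if (len > 15) {
		if (len <= 23)
			printf(" prefix-3=\"%.8s\" (H=0)", name + 15);
		else
			printf(" hash(\"%s\") (H=1)", name + 15);
	}
	printf("\n");
}

int main(void)
{
	split_name("passwd");                         /* short: fits entirely */
	split_name("linux-2.6.22.tar");               /* 16 chars: still short */
	split_name("very-long-file-name-here.txt");   /* 28 chars: tail hashed */
	return 0;
}
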
21151+#include "debug.h"
21152+#include "key.h"
21153+#include "kassign.h"
21154+#include "vfs_ops.h"
21155+#include "inode.h"
21156+#include "super.h"
21157+#include "dscale.h"
21158+
21159+#include <linux/types.h> /* for __u?? */
21160+#include <linux/fs.h> /* for struct super_block, etc */
21161+
21162+/* bitmask for H bit (see comment at the beginning of this file) */
21163+static const __u64 longname_mark = 0x0100000000000000ull;
21164+/* bitmask for F and H portions of the key. */
21165+static const __u64 fibration_mask = 0xff00000000000000ull;
21166+
21167+/* return true if name is not completely encoded in @key */
21168+int is_longname_key(const reiser4_key * key)
21169+{
21170+ __u64 highpart;
21171+
21172+ assert("nikita-2863", key != NULL);
21173+ if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21174+ reiser4_print_key("oops", key);
21175+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21176+
21177+ if (REISER4_LARGE_KEY)
21178+ highpart = get_key_ordering(key);
21179+ else
21180+ highpart = get_key_objectid(key);
21181+
21182+ return (highpart & longname_mark) ? 1 : 0;
21183+}
21184+
21185+/* return true if @name is too long to be completely encoded in the key */
21186+int is_longname(const char *name UNUSED_ARG, int len)
21187+{
21188+ if (REISER4_LARGE_KEY)
21189+ return len > 23;
21190+ else
21191+ return len > 15;
21192+}
21193+
21194+/* encode ascii string into __u64.
21195+
21196+ Put characters of @name into result (@str) one after another starting
21197+ from the @start_idx-th highest (arithmetically) byte. This produces
21198+ an endian-safe encoding. memcpy(3) will not do.
21199+
21200+*/
21201+static __u64 pack_string(const char *name /* string to encode */ ,
21202+ int start_idx /* highest byte in result from
21203+ * which to start encoding */ )
21204+{
21205+ unsigned i;
21206+ __u64 str;
21207+
21208+ str = 0;
21209+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21210+ str <<= 8;
21211+ str |= (unsigned char)name[i];
21212+ }
21213+ str <<= (sizeof str - i - start_idx) << 3;
21214+ return str;
21215+}
21216+
21217+/* opposite to pack_string(). Takes value produced by pack_string(), restores
21218+ * string encoded in it and stores result in @buf */
21219+char * reiser4_unpack_string(__u64 value, char *buf)
21220+{
21221+ do {
21222+ *buf = value >> (64 - 8);
21223+ if (*buf)
21224+ ++buf;
21225+ value <<= 8;
21226+ } while (value != 0);
21227+ *buf = 0;
21228+ return buf;
21229+}
21230+
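As a sanity check of the two helpers above, here is a minimal user-space round trip (local copies of pack_string() and reiser4_unpack_string(); the 0x0061620000000000 constant is simply the worked-out packing of "ab" with start_idx == 1, which leaves the top byte free for the F and H bits):

#include <assert.h>
#include <stdio.h>
#include <string.h>

typedef unsigned long long u64;

/* local copy of pack_string() above */
static u64 pack(const char *name, int start_idx)
{
	unsigned i;
	u64 str = 0;

	for (i = 0; i < sizeof str - start_idx && name[i]; ++i) {
		str <<= 8;
		str |= (unsigned char)name[i];
	}
	return str << ((sizeof str - i - start_idx) << 3);
}

/* local copy of reiser4_unpack_string() above */
static char *unpack(u64 value, char *buf)
{
	do {
		*buf = value >> 56;
		if (*buf)
			++buf;
		value <<= 8;
	} while (value != 0);
	*buf = 0;
	return buf;
}

int main(void)
{
	char buf[9];

	/* start_idx == 1 keeps the highest byte zero for the F/H bits */
	assert(pack("ab", 1) == 0x0061620000000000ull);
	unpack(pack("ab", 1), buf);
	assert(strcmp(buf, "ab") == 0);
	printf("round trip ok: %s\n", buf);
	return 0;
}
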
21231+/* obtain name encoded in @key and store it in @buf */
21232+char *extract_name_from_key(const reiser4_key * key, char *buf)
21233+{
21234+ char *c;
21235+
21236+ assert("nikita-2868", !is_longname_key(key));
21237+
21238+ c = buf;
21239+ if (REISER4_LARGE_KEY) {
21240+ c = reiser4_unpack_string(get_key_ordering(key) &
21241+ ~fibration_mask, c);
21242+ c = reiser4_unpack_string(get_key_fulloid(key), c);
21243+ } else
21244+ c = reiser4_unpack_string(get_key_fulloid(key) &
21245+ ~fibration_mask, c);
21246+ reiser4_unpack_string(get_key_offset(key), c);
21247+ return buf;
21248+}
21249+
21250+/**
21251+ * complete_entry_key - calculate entry key by name
21252+ * @dir: directory where entry is (or will be) in
21253+ * @name: name to calculate key of
21254+ * @len: length of name
21255+ * @result: place to store result in
21256+ *
21257+ * Sets fields of entry key @result which depend on file name.
21258+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21259+ * objectid and offset. Otherwise, objectid and offset are set.
21260+ */
21261+void complete_entry_key(const struct inode *dir, const char *name,
21262+ int len, reiser4_key *result)
21263+{
21264+#if REISER4_LARGE_KEY
21265+ __u64 ordering;
21266+ __u64 objectid;
21267+ __u64 offset;
21268+
21269+ assert("nikita-1139", dir != NULL);
21270+ assert("nikita-1142", result != NULL);
21271+ assert("nikita-2867", strlen(name) == len);
21272+
21273+ /*
21274+ * key allocation algorithm for directory entries in case of large
21275+ * keys:
21276+ *
21277+ * If the name is not longer than 7 + 8 + 8 = 23 characters, put the first
21278+ * 7 characters into the ordering field of the key, the next 8 characters
21279+ * (if any) into the objectid field, and the next 8 (if any) into the
21280+ * offset field.
21281+ *
21282+ * If the file name is longer than 23 characters, put the first 7 characters
21283+ * into the key's ordering, the next 8 into objectid, and a hash of the
21284+ * remaining characters into the offset field.
21285+ *
21286+ * To distinguish the above cases, in the latter case set the otherwise
21287+ * unused high bit in the ordering field.
21288+ */
21289+
21290+ /* [0-6] characters to ordering */
21291+ ordering = pack_string(name, 1);
21292+ if (len > 7) {
21293+ /* [7-14] characters to objectid */
21294+ objectid = pack_string(name + 7, 0);
21295+ if (len > 15) {
21296+ if (len <= 23) {
21297+ /* [15-23] characters to offset */
21298+ offset = pack_string(name + 15, 0);
21299+ } else {
21300+ /* note in a key the fact that offset contains hash. */
21301+ ordering |= longname_mark;
21302+
21303+ /* offset is the hash of the file name's tail. */
21304+ offset = inode_hash_plugin(dir)->hash(name + 15,
21305+ len - 15);
21306+ }
21307+ } else {
21308+ offset = 0ull;
21309+ }
21310+ } else {
21311+ objectid = 0ull;
21312+ offset = 0ull;
21313+ }
21314+
21315+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21316+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21317+
21318+ set_key_ordering(result, ordering);
21319+ set_key_fulloid(result, objectid);
21320+ set_key_offset(result, offset);
21321+ return;
21322+
21323+#else
21324+ __u64 objectid;
21325+ __u64 offset;
21326+
21327+ assert("nikita-1139", dir != NULL);
21328+ assert("nikita-1142", result != NULL);
21329+ assert("nikita-2867", strlen(name) == len);
21330+
21331+ /*
21332+ * key allocation algorithm for directory entries in case of not large
21333+ * keys:
21334+ *
21335+ * If the name is not longer than 7 + 8 = 15 characters, put the first 7
21336+ * characters into the objectid field of the key and the next 8 characters
21337+ * (if any) into the offset field.
21338+ *
21339+ * If the file name is longer than 15 characters, put the first 7 characters
21340+ * into the key's objectid, and a hash of the remaining characters into the
21341+ * offset field.
21342+ *
21343+ * To distinguish the above cases, in the latter case set the otherwise
21344+ * unused high bit in the objectid field.
21345+ */
21346+
21347+ /* [0-6] characters to objectid */
21348+ objectid = pack_string(name, 1);
21349+ if (len > 7) {
21350+ if (len <= 15) {
21351+ /* [7-14] characters to offset */
21352+ offset = pack_string(name + 7, 0);
21353+ } else {
21354+ /* note in a key the fact that offset contains hash. */
21355+ objectid |= longname_mark;
21356+
21357+ /* offset is the hash of the file name's tail. */
21358+ offset = inode_hash_plugin(dir)->hash(name + 7,
21359+ len - 7);
21360+ }
21361+ } else
21362+ offset = 0ull;
21363+
21364+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21365+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21366+
21367+ set_key_fulloid(result, objectid);
21368+ set_key_offset(result, offset);
21369+ return;
21370+#endif /* ! REISER4_LARGE_KEY */
21371+}
21372+
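The top byte of the name element is shared by the 7-bit fibration value and the 1-bit H flag, while pack_string(name, 1) leaves that byte zero, so complete_entry_key() can simply OR the three parts together and extract_name_from_key() can mask them back out. A small sketch of that invariant (illustration only; the fibre placement shown is an assumption about how a fibration plugin positions its 7 bits):

#include <assert.h>

typedef unsigned long long u64;

static const u64 longname_mark  = 0x0100000000000000ull; /* H bit */
static const u64 fibration_mask = 0xff00000000000000ull; /* F and H bits */

int main(void)
{
	u64 prefix = 0x0061620000000000ull;  /* pack_string("ab", 1) */
	u64 fibre  = (u64)0x2a << 57;        /* hypothetical 7-bit fibration value */
	u64 ordering = prefix | fibre | longname_mark;

	/* masking the F/H byte back out recovers the packed name prefix */
	assert((ordering & ~fibration_mask) == prefix);
	assert(ordering & longname_mark);    /* key marked as a long-name key */
	return 0;
}
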
21373+/* true, if @key is the key of "." */
21374+int is_dot_key(const reiser4_key * key /* key to check */ )
21375+{
21376+ assert("nikita-1717", key != NULL);
21377+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21378+ return
21379+ (get_key_ordering(key) == 0ull) &&
21380+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21381+}
21382+
21383+/* build key for stat-data.
21384+
21385+ return key of stat-data of this object. This should become an sd plugin
21386+ method in the future. For now, let it be here.
21387+
21388+*/
21389+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21390+ reiser4_key * result /* resulting key of @target
21391+ stat-data */ )
21392+{
21393+ assert("nikita-261", result != NULL);
21394+
21395+ reiser4_key_init(result);
21396+ set_key_locality(result, reiser4_inode_data(target)->locality_id);
21397+ set_key_ordering(result, get_inode_ordering(target));
21398+ set_key_objectid(result, get_inode_oid(target));
21399+ set_key_type(result, KEY_SD_MINOR);
21400+ set_key_offset(result, (__u64) 0);
21401+ return result;
21402+}
21403+
21404+/* encode part of key into &obj_key_id
21405+
21406+ This encodes into @id part of @key sufficient to restore @key later,
21407+ given that the latter is the key of an object (key of stat-data).
21408+
21409+ See &obj_key_id
21410+*/
21411+int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21412+ obj_key_id * id /* id where key is encoded in */ )
21413+{
21414+ assert("nikita-1151", key != NULL);
21415+ assert("nikita-1152", id != NULL);
21416+
21417+ memcpy(id, key, sizeof *id);
21418+ return 0;
21419+}
21420+
21421+/* encode reference to @obj in @id.
21422+
21423+ This is like build_obj_key_id() above, but takes inode as parameter. */
21424+int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21425+ obj_key_id * id /* result */ )
21426+{
21427+ reiser4_key sdkey;
21428+
21429+ assert("nikita-1166", obj != NULL);
21430+ assert("nikita-1167", id != NULL);
21431+
21432+ build_sd_key(obj, &sdkey);
21433+ build_obj_key_id(&sdkey, id);
21434+ return 0;
21435+}
21436+
21437+/* decode @id back into @key
21438+
21439+ Restore key of object stat-data from @id. This is dual to
21440+ build_obj_key_id() above.
21441+*/
21442+int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21443+ * from */ ,
21444+ reiser4_key * key /* result */ )
21445+{
21446+ assert("nikita-1153", id != NULL);
21447+ assert("nikita-1154", key != NULL);
21448+
21449+ reiser4_key_init(key);
21450+ memcpy(key, id, sizeof *id);
21451+ return 0;
21452+}
21453+
21454+/* extract objectid of directory from key of directory entry within said
21455+ directory.
21456+ */
21457+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21458+ * directory
21459+ * entry */ )
21460+{
21461+ assert("nikita-1314", de_key != NULL);
21462+ return get_key_locality(de_key);
21463+}
21464+
21465+/* encode into @id key of directory entry.
21466+
21467+ Encode into @id information sufficient to later distinguish directory
21468+ entries within the same directory. This is not whole key, because all
21469+ directory entries within directory item share locality which is equal
21470+ to objectid of their directory.
21471+
21472+*/
21473+int build_de_id(const struct inode *dir /* inode of directory */ ,
21474+ const struct qstr *name /* name to be given to @obj by
21475+ * directory entry being
21476+ * constructed */ ,
21477+ de_id * id /* short key of directory entry */ )
21478+{
21479+ reiser4_key key;
21480+
21481+ assert("nikita-1290", dir != NULL);
21482+ assert("nikita-1292", id != NULL);
21483+
21484+ /* NOTE-NIKITA this is suboptimal. */
21485+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21486+ return build_de_id_by_key(&key, id);
21487+}
21488+
21489+/* encode into @id key of directory entry.
21490+
21491+ Encode into @id information sufficient to later distinguish directory
21492+ entries within the same directory. This is not whole key, because all
21493+ directory entries within directory item share locality which is equal
21494+ to objectid of their directory.
21495+
21496+*/
21497+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21498+ * entry */ ,
21499+ de_id * id /* short key of directory entry */ )
21500+{
21501+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21502+ return 0;
21503+}
21504+
21505+/* restore from @id key of directory entry.
21506+
21507+ Function dual to build_de_id(): given @id and locality, build full
21508+ key of directory entry within directory item.
21509+
21510+*/
21511+int extract_key_from_de_id(const oid_t locality /* locality of directory
21512+ * entry */ ,
21513+ const de_id * id /* directory entry id */ ,
21514+ reiser4_key * key /* result */ )
21515+{
21516+ /* no need to initialise key here: all fields are overwritten */
21517+ memcpy(((__u64 *) key) + 1, id, sizeof *id);
21518+ set_key_locality(key, locality);
21519+ set_key_type(key, KEY_FILE_NAME_MINOR);
21520+ return 0;
21521+}
21522+
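In other words, a de_id is just the tail of a full key with the first (locality/type) element stripped, which is why a plain memcpy in either direction round-trips. A toy mimic of the pair above (illustration only; endianness wrappers and the large-key element omitted):

#include <assert.h>
#include <string.h>

typedef unsigned long long u64;

struct key  { u64 el[3]; };          /* locality|type, objectid, offset */
struct deid { u64 el[2]; };          /* everything but the first element */

int main(void)
{
	struct key k = { { (42ull << 4) | 0, 1000, 4096 } };
	struct key restored;
	struct deid id;

	/* build_de_id_by_key(): save all elements after the first */
	memcpy(&id, &k.el[1], sizeof id);

	/* extract_key_from_de_id(): restore them, re-derive element 0 */
	memcpy(&restored.el[1], &id, sizeof id);
	restored.el[0] = k.el[0];        /* set_key_locality()/set_key_type() */

	assert(memcmp(&restored, &k, sizeof k) == 0);
	return 0;
}
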
21523+/* compare two &de_id's */
21524+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21525+ const de_id * id2 /* second &de_id to compare */ )
21526+{
21527+ /* NOTE-NIKITA ugly implementation */
21528+ reiser4_key k1;
21529+ reiser4_key k2;
21530+
21531+ extract_key_from_de_id((oid_t) 0, id1, &k1);
21532+ extract_key_from_de_id((oid_t) 0, id2, &k2);
21533+ return keycmp(&k1, &k2);
21534+}
21535+
21536+/* compare &de_id with key. A de_id is a key minus its leading locality element, hence the pointer rebase below */
21537+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21538+ const reiser4_key * key /* key to compare */ )
21539+{
21540+ cmp_t result;
21541+ reiser4_key *k1;
21542+
21543+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21544+ result = KEY_DIFF_EL(k1, key, 1);
21545+ if (result == EQUAL_TO) {
21546+ result = KEY_DIFF_EL(k1, key, 2);
21547+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21548+ result = KEY_DIFF_EL(k1, key, 3);
21549+ }
21550+ }
21551+ return result;
21552+}
21553+
21554+/*
21555+ * return number of bytes necessary to encode @inode identity.
21556+ */
21557+int inode_onwire_size(const struct inode *inode)
21558+{
21559+ int result;
21560+
21561+ result = dscale_bytes(get_inode_oid(inode));
21562+ result += dscale_bytes(get_inode_locality(inode));
21563+
21564+ /*
21565+ * ordering is large (it usually has highest bits set), so it makes
21566+ * little sense to dscale it.
21567+ */
21568+ if (REISER4_LARGE_KEY)
21569+ result += sizeof(get_inode_ordering(inode));
21570+ return result;
21571+}
21572+
21573+/*
21574+ * encode @inode identity at @start
21575+ */
21576+char *build_inode_onwire(const struct inode *inode, char *start)
21577+{
21578+ start += dscale_write(start, get_inode_locality(inode));
21579+ start += dscale_write(start, get_inode_oid(inode));
21580+
21581+ if (REISER4_LARGE_KEY) {
21582+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21583+ start += sizeof(get_inode_ordering(inode));
21584+ }
21585+ return start;
21586+}
21587+
21588+/*
21589+ * extract key that was previously encoded by build_inode_onwire() at @addr
21590+ */
21591+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21592+{
21593+ __u64 val;
21594+
21595+ addr += dscale_read(addr, &val);
21596+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21597+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21598+ addr += dscale_read(addr, &val);
21599+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21600+#if REISER4_LARGE_KEY
21601+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21602+ addr += sizeof key_id->ordering;
21603+#endif
21604+ return addr;
21605+}
21606+
21607+/* Make Linus happy.
21608+ Local variables:
21609+ c-indentation-style: "K&R"
21610+ mode-name: "LC"
21611+ c-basic-offset: 8
21612+ tab-width: 8
21613+ fill-column: 120
21614+ End:
21615+*/
21616diff -urN linux-2.6.22.orig/fs/reiser4/kassign.h linux-2.6.22/fs/reiser4/kassign.h
21617--- linux-2.6.22.orig/fs/reiser4/kassign.h 1970-01-01 03:00:00.000000000 +0300
21618+++ linux-2.6.22/fs/reiser4/kassign.h 2007-07-29 00:25:34.880697512 +0400
21619@@ -0,0 +1,110 @@
21620+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21621+ * reiser4/README */
21622+
21623+/* Key assignment policy interface. See kassign.c for details. */
21624+
21625+#if !defined( __KASSIGN_H__ )
21626+#define __KASSIGN_H__
21627+
21628+#include "forward.h"
21629+#include "key.h"
21630+#include "dformat.h"
21631+
21632+#include <linux/types.h> /* for __u?? */
21633+#include <linux/fs.h> /* for struct super_block, etc */
21634+#include <linux/dcache.h> /* for struct qstr */
21635+
21636+/* key assignment functions */
21637+
21638+/* Information from which key of file stat-data can be uniquely
21639+ restored. This depends on key assignment policy for
21640+ stat-data. Currently it's enough to store object id and locality id
21641+ (60+60==120) bits, because minor packing locality and offset of
21642+ stat-data key are always known constants: KEY_SD_MINOR and 0
21643+ respectively. For simplicity 4 bits are wasted in each id, and just
21644+ two 64 bit integers are stored.
21645+
21646+ This field has to be byte-aligned, because we don't want to waste
21647+ space in directory entries. There is another side of the coin, of
21648+ course: we waste CPU and bus bandwidth instead, by copying data back
21649+ and forth.
21650+
21651+ Next optimization: &obj_key_id is mainly used to address stat data from
21652+ directory entries. Under the assumption that the majority of files have
21653+ only one name (one hard link) from *the* parent directory, it seems
21654+ reasonable to only store the objectid of stat data and take its locality
21655+ from the key of the directory item.
21656+
21657+ This requires some flag to be added to the &obj_key_id to distinguish
21658+ between these two cases. Remaining bits in flag byte are then asking to be
21659+ used to store file type.
21660+
21661+ This optimization requires changes in directory item handling code.
21662+
21663+*/
21664+typedef struct obj_key_id {
21665+ d8 locality[sizeof(__u64)];
21666+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21667+ )
21668+ d8 objectid[sizeof(__u64)];
21669+}
21670+obj_key_id;
21671+
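The d8 byte arrays are what make the structure byte-aligned, as the comment above requires: the compiler inserts no padding, so an obj_key_id costs exactly 16 bytes (24 with large keys) inside a directory entry. A quick user-space check of that claim (illustration only, small-key variant):

#include <assert.h>

typedef unsigned char d8;

struct obj_key_id_small {            /* mimic of obj_key_id, !REISER4_LARGE_KEY */
	d8 locality[8];
	d8 objectid[8];
};

int main(void)
{
	/* byte arrays have alignment 1: no padding, two raw key elements */
	assert(sizeof(struct obj_key_id_small) == 16);
	return 0;
}
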
21672+/* Information sufficient to uniquely identify directory entry within
21673+ compressed directory item.
21674+
21675+ For alignment issues see &obj_key_id above.
21676+*/
21677+typedef struct de_id {
21678+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21679+ d8 objectid[sizeof(__u64)];
21680+ d8 offset[sizeof(__u64)];
21681+}
21682+de_id;
21683+
21684+extern int inode_onwire_size(const struct inode *obj);
21685+extern char *build_inode_onwire(const struct inode *obj, char *area);
21686+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21687+
21688+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21689+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21690+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21691+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21692+extern int build_de_id(const struct inode *dir, const struct qstr *name,
21693+ de_id * id);
21694+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21695+extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21696+ reiser4_key * key);
21697+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21698+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21699+
21700+extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21701+extern void build_entry_key_common(const struct inode *dir,
21702+ const struct qstr *name,
21703+ reiser4_key * result);
21704+extern void build_entry_key_stable_entry(const struct inode *dir,
21705+ const struct qstr *name,
21706+ reiser4_key * result);
21707+extern int is_dot_key(const reiser4_key * key);
21708+extern reiser4_key *build_sd_key(const struct inode *target,
21709+ reiser4_key * result);
21710+
21711+extern int is_longname_key(const reiser4_key * key);
21712+extern int is_longname(const char *name, int len);
21713+extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21714+extern char *reiser4_unpack_string(__u64 value, char *buf);
21715+extern void complete_entry_key(const struct inode *dir, const char *name,
21716+ int len, reiser4_key *result);
21717+
21718+/* __KASSIGN_H__ */
21719+#endif
21720+
21721+/* Make Linus happy.
21722+ Local variables:
21723+ c-indentation-style: "K&R"
21724+ mode-name: "LC"
21725+ c-basic-offset: 8
21726+ tab-width: 8
21727+ fill-column: 120
21728+ End:
21729+*/
21730diff -urN linux-2.6.22.orig/fs/reiser4/Kconfig linux-2.6.22/fs/reiser4/Kconfig
21731--- linux-2.6.22.orig/fs/reiser4/Kconfig 1970-01-01 03:00:00.000000000 +0300
21732+++ linux-2.6.22/fs/reiser4/Kconfig 2007-07-29 00:25:34.880697512 +0400
21733@@ -0,0 +1,32 @@
21734+config REISER4_FS
21735+ tristate "Reiser4 (EXPERIMENTAL)"
21736+ depends on EXPERIMENTAL
21737+ select ZLIB_INFLATE
21738+ select ZLIB_DEFLATE
21739+ select CRYPTO
21740+ help
21741+ Reiser4 is a filesystem that performs all filesystem operations
21742+ as atomic transactions, which means that it either performs a
21743+ write, or it does not, and in the event of a crash it does not
21744+ partially perform it or corrupt it.
21745+
21746+ It stores files in dancing trees, which are like balanced trees but
21747+ faster. It packs small files together so that they share blocks
21748+ without wasting space. This means you can use it to store really
21749+ small files. It also means that it saves you disk space. It avoids
21750+ hassling you with anachronisms like having a maximum number of
21751+ inodes, and wasting space if you use less than that number.
21752+
21753+ Reiser4 is a distinct filesystem type from reiserfs (V3).
21754+ It's therefore not possible to use reiserfs file systems
21755+ with reiser4.
21756+
21757+ To learn more about reiser4, go to http://www.namesys.com
21758+
21759+config REISER4_DEBUG
21760+ bool "Enable reiser4 debug mode"
21761+ depends on REISER4_FS
21762+ help
21763+ Don't use this unless you are debugging reiser4.
21764+
21765+ If unsure, say N.
21766diff -urN linux-2.6.22.orig/fs/reiser4/key.c linux-2.6.22/fs/reiser4/key.c
21767--- linux-2.6.22.orig/fs/reiser4/key.c 1970-01-01 03:00:00.000000000 +0300
21768+++ linux-2.6.22/fs/reiser4/key.c 2007-07-29 00:25:34.880697512 +0400
21769@@ -0,0 +1,137 @@
21770+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21771+
21772+/* Key manipulations. */
21773+
21774+#include "debug.h"
21775+#include "key.h"
21776+#include "super.h"
21777+#include "reiser4.h"
21778+
21779+#include <linux/types.h> /* for __u?? */
21780+
21781+/* Minimal possible key: all components are zero. It is presumed that this is
21782+ independent of key scheme. */
21783+static const reiser4_key MINIMAL_KEY = {
21784+ .el = {
21785+ 0ull,
21786+ ON_LARGE_KEY(0ull,)
21787+ 0ull,
21788+ 0ull
21789+ }
21790+};
21791+
21792+/* Maximal possible key: all components are ~0. It is presumed that this is
21793+ independent of key scheme. */
21794+static const reiser4_key MAXIMAL_KEY = {
21795+ .el = {
21796+ __constant_cpu_to_le64(~0ull),
21797+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21798+ __constant_cpu_to_le64(~0ull),
21799+ __constant_cpu_to_le64(~0ull)
21800+ }
21801+};
21802+
21803+/* Initialize key. */
21804+void reiser4_key_init(reiser4_key * key /* key to init */ )
21805+{
21806+ assert("nikita-1169", key != NULL);
21807+ memset(key, 0, sizeof *key);
21808+}
21809+
21810+/* minimal possible key in the tree. Return pointer to the static storage. */
21811+const reiser4_key *reiser4_min_key(void)
21812+{
21813+ return &MINIMAL_KEY;
21814+}
21815+
21816+/* maximum possible key in the tree. Return pointer to the static storage. */
21817+const reiser4_key *reiser4_max_key(void)
21818+{
21819+ return &MAXIMAL_KEY;
21820+}
21821+
21822+#if REISER4_DEBUG
21823+/* debugging aid: print symbolic name of key type */
21824+static const char *type_name(unsigned int key_type /* key type */ )
21825+{
21826+ switch (key_type) {
21827+ case KEY_FILE_NAME_MINOR:
21828+ return "file name";
21829+ case KEY_SD_MINOR:
21830+ return "stat data";
21831+ case KEY_ATTR_NAME_MINOR:
21832+ return "attr name";
21833+ case KEY_ATTR_BODY_MINOR:
21834+ return "attr body";
21835+ case KEY_BODY_MINOR:
21836+ return "file body";
21837+ default:
21838+ return "unknown";
21839+ }
21840+}
21841+
21842+/* debugging aid: print human readable information about key */
21843+void reiser4_print_key(const char *prefix /* prefix to print */ ,
21844+ const reiser4_key * key /* key to print */ )
21845+{
21846+ /* turn bold on */
21847+ /* printf ("\033[1m"); */
21848+ if (key == NULL)
21849+ printk("%s: null key\n", prefix);
21850+ else {
21851+ if (REISER4_LARGE_KEY)
21852+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21853+ get_key_locality(key),
21854+ get_key_type(key),
21855+ get_key_ordering(key),
21856+ get_key_band(key),
21857+ get_key_objectid(key), get_key_offset(key));
21858+ else
21859+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21860+ get_key_locality(key),
21861+ get_key_type(key),
21862+ get_key_band(key),
21863+ get_key_objectid(key), get_key_offset(key));
21864+ /*
21865+ * if this is a key of directory entry, try to decode part of
21866+ * a name stored in the key, and output it.
21867+ */
21868+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21869+ char buf[DE_NAME_BUF_LEN];
21870+ char *c;
21871+
21872+ c = buf;
21873+ c = reiser4_unpack_string(get_key_ordering(key), c);
21874+ reiser4_unpack_string(get_key_fulloid(key), c);
21875+ printk("[%s", buf);
21876+ if (is_longname_key(key))
21877+ /*
21878+ * only part of the name is stored in the key.
21879+ */
21880+ printk("...]\n");
21881+ else {
21882+ /*
21883+ * whole name is stored in the key.
21884+ */
21885+ reiser4_unpack_string(get_key_offset(key), buf);
21886+ printk("%s]\n", buf);
21887+ }
21888+ } else {
21889+ printk("[%s]\n", type_name(get_key_type(key)));
21890+ }
21891+ }
21892+ /* turn bold off */
21893+ /* printf ("\033[m\017"); */
21894+}
21895+
21896+#endif
21897+
21898+/* Make Linus happy.
21899+ Local variables:
21900+ c-indentation-style: "K&R"
21901+ mode-name: "LC"
21902+ c-basic-offset: 8
21903+ tab-width: 8
21904+ fill-column: 120
21905+ End:
21906+*/
21907diff -urN linux-2.6.22.orig/fs/reiser4/key.h linux-2.6.22/fs/reiser4/key.h
21908--- linux-2.6.22.orig/fs/reiser4/key.h 1970-01-01 03:00:00.000000000 +0300
21909+++ linux-2.6.22/fs/reiser4/key.h 2007-07-29 00:25:34.884698547 +0400
21910@@ -0,0 +1,384 @@
21911+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21912+
21913+/* Declarations of key-related data-structures and operations on keys. */
21914+
21915+#if !defined( __REISER4_KEY_H__ )
21916+#define __REISER4_KEY_H__
21917+
21918+#include "dformat.h"
21919+#include "forward.h"
21920+#include "debug.h"
21921+
21922+#include <linux/types.h> /* for __u?? */
21923+
21924+/* Operations on keys in reiser4 tree */
21925+
21926+/* No access to any of these fields shall be done except via a
21927+ wrapping macro/function, and that wrapping macro/function shall
21928+ convert to little endian order. Compare keys will consider cpu byte order. */
21929+
21930+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
21931+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
21932+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
21933+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
21934+ right one. */
21935+
21936+/* possible values for minor packing locality (4 bits required) */
21937+typedef enum {
21938+ /* file name */
21939+ KEY_FILE_NAME_MINOR = 0,
21940+ /* stat-data */
21941+ KEY_SD_MINOR = 1,
21942+ /* file attribute name */
21943+ KEY_ATTR_NAME_MINOR = 2,
21944+ /* file attribute value */
21945+ KEY_ATTR_BODY_MINOR = 3,
21946+ /* file body (tail or extent) */
21947+ KEY_BODY_MINOR = 4,
21948+} key_minor_locality;
21949+
21950+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
21951+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
21952+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
21953+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
21954+ block_alloc.c to check the node type when deciding where to allocate the node.
21955+
21956+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
21957+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
21958+ current implementation tails have a different minor packing locality from extents, and no files have both extents and
21959+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
21960+*/
21961+
21962+/* Arbitrary major packing localities can be assigned to objects using
21963+ the reiser4(filenameA/..packing<=some_number) system call.
21964+
21965+ In reiser4, the creat() syscall creates a directory
21966+
21967+ whose default flow (that which is referred to if the directory is
21968+ read as a file) is the traditional unix file body.
21969+
21970+ whose directory plugin is the 'filedir'
21971+
21972+ whose major packing locality is that of the parent of the object created.
21973+
21974+ The static_stat item is a particular commonly used directory
21975+ compression (the one for normal unix files).
21976+
21977+ The filedir plugin checks to see if the static_stat item exists.
21978+ There is a unique key for static_stat. If yes, then it uses the
21979+ static_stat item for all of the values that it contains. The
21980+ static_stat item contains a flag for each stat it contains which
21981+ indicates whether one should look outside the static_stat item for its
21982+ contents.
21983+*/
21984+
21985+/* offset of fields in reiser4_key. Value of each element of this enum
21986+ is the index within the key (thought of as an array of __u64's) where this field
21987+ is. */
21988+typedef enum {
21989+ /* major "locale", aka dirid. Sits in 1st element */
21990+ KEY_LOCALITY_INDEX = 0,
21991+ /* minor "locale", aka item type. Sits in 1st element */
21992+ KEY_TYPE_INDEX = 0,
21993+ ON_LARGE_KEY(KEY_ORDERING_INDEX,)
21994+ /* "object band". Sits in 2nd element */
21995+ KEY_BAND_INDEX,
21996+ /* objectid. Sits in 2nd element */
21997+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
21998+ /* full objectid. Sits in 2nd element */
21999+ KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22000+ /* Offset. Sits in 3rd element */
22001+ KEY_OFFSET_INDEX,
22002+ /* Name hash. Sits in 3rd element */
22003+ KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22004+ KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22005+ KEY_LAST_INDEX
22006+} reiser4_key_field_index;
22007+
22008+/* key in reiser4 internal "balanced" tree. It is just array of three
22009+ 64bit integers in disk byte order (little-endian by default). This
22010+ array is actually indexed by reiser4_key_field. Each __u64 within
22011+ this array is called "element". Logical key component encoded within
22012+ elements are called "fields".
22013+
22014+ We declare this as union with second component dummy to suppress
22015+ inconvenient array<->pointer casts implied in C. */
22016+union reiser4_key {
22017+ __le64 el[KEY_LAST_INDEX];
22018+ int pad;
22019+};
22020+
22021+/* bitmasks showing where within reiser4_key particular key is stored. */
22022+/* major locality occupies higher 60 bits of the first element */
22023+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22024+
22025+/* minor locality occupies lower 4 bits of the first element */
22026+#define KEY_TYPE_MASK 0xfull
22027+
22028+/* controversial band occupies higher 4 bits of the 2nd element */
22029+#define KEY_BAND_MASK 0xf000000000000000ull
22030+
22031+/* objectid occupies lower 60 bits of the 2nd element */
22032+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22033+
22034+/* full 64bit objectid*/
22035+#define KEY_FULLOID_MASK 0xffffffffffffffffull
22036+
22037+/* offset is just the 3rd element itself */
22038+#define KEY_OFFSET_MASK 0xffffffffffffffffull
22039+
22040+/* ordering is whole second element */
22041+#define KEY_ORDERING_MASK 0xffffffffffffffffull
22042+
22043+/* how many bits key element should be shifted to left to get particular field */
22044+typedef enum {
22045+ KEY_LOCALITY_SHIFT = 4,
22046+ KEY_TYPE_SHIFT = 0,
22047+ KEY_BAND_SHIFT = 60,
22048+ KEY_OBJECTID_SHIFT = 0,
22049+ KEY_FULLOID_SHIFT = 0,
22050+ KEY_OFFSET_SHIFT = 0,
22051+ KEY_ORDERING_SHIFT = 0,
22052+} reiser4_key_field_shift;
22053+
22054+static inline __u64
22055+get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22056+{
22057+ assert("nikita-753", key != NULL);
22058+ assert("nikita-754", off < KEY_LAST_INDEX);
22059+ return le64_to_cpu(get_unaligned(&key->el[off]));
22060+}
22061+
22062+static inline void
22063+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22064+{
22065+ assert("nikita-755", key != NULL);
22066+ assert("nikita-756", off < KEY_LAST_INDEX);
22067+ put_unaligned(cpu_to_le64(value), &key->el[off]);
22068+}
22069+
22070+/* macro to define getter and setter functions for field F with type T */
22071+#define DEFINE_KEY_FIELD( L, U, T ) \
22072+static inline T get_key_ ## L ( const reiser4_key *key ) \
22073+{ \
22074+ assert( "nikita-750", key != NULL ); \
22075+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22076+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22077+} \
22078+ \
22079+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22080+{ \
22081+ __u64 el; \
22082+ \
22083+ assert( "nikita-752", key != NULL ); \
22084+ \
22085+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22086+ /* clear field bits in the key */ \
22087+ el &= ~KEY_ ## U ## _MASK; \
22088+ /* actually it should be \
22089+ \
22090+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22091+ \
22092+ but we trust user to never pass values that wouldn't fit \
22093+ into field. Clearing extra bits is one operation, but this \
22094+ function is time-critical. \
22095+ But check this in assertion. */ \
22096+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22097+ ~KEY_ ## U ## _MASK ) == 0 ); \
22098+ el |= ( loc << KEY_ ## U ## _SHIFT ); \
22099+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22100+}
22101+
22102+typedef __u64 oid_t;
22103+
22104+/* define get_key_locality(), set_key_locality() */
22105+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22106+/* define get_key_type(), set_key_type() */
22107+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22108+/* define get_key_band(), set_key_band() */
22109+DEFINE_KEY_FIELD(band, BAND, __u64);
22110+/* define get_key_objectid(), set_key_objectid() */
22111+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22112+/* define get_key_fulloid(), set_key_fulloid() */
22113+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22114+/* define get_key_offset(), set_key_offset() */
22115+DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22116+#if (REISER4_LARGE_KEY)
22117+/* define get_key_ordering(), set_key_ordering() */
22118+DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22119+#else
22120+static inline __u64 get_key_ordering(const reiser4_key * key)
22121+{
22122+ return 0;
22123+}
22124+
22125+static inline void set_key_ordering(reiser4_key * key, __u64 val)
22126+{
22127+}
22128+#endif
22129+
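For one field, the macro above expands to roughly the following (a sketch with the le64 wrappers and the debugging assertion stripped out); note how the setter clears only its own mask, so the type bits sharing the same element survive:

#include <assert.h>

typedef unsigned long long u64;

#define KEY_LOCALITY_MASK  0xfffffffffffffff0ull
#define KEY_LOCALITY_SHIFT 4

static u64 get_locality(u64 el0)
{
	return (el0 & KEY_LOCALITY_MASK) >> KEY_LOCALITY_SHIFT;
}

static u64 set_locality(u64 el0, u64 loc)
{
	/* clear the field, then OR the pre-shifted value in */
	return (el0 & ~KEY_LOCALITY_MASK) | (loc << KEY_LOCALITY_SHIFT);
}

int main(void)
{
	u64 el0 = 0;

	el0 |= 0x1;                      /* KEY_SD_MINOR in the type nibble */
	el0 = set_locality(el0, 0x123);
	assert(get_locality(el0) == 0x123);
	assert((el0 & 0xfull) == 0x1);   /* locality setter left type alone */
	return 0;
}
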
22130+/* key comparison result */
22131+typedef enum { LESS_THAN = -1, /* if first key is less than second */
22132+ EQUAL_TO = 0, /* if keys are equal */
22133+ GREATER_THAN = +1 /* if first key is greater than second */
22134+} cmp_t;
22135+
22136+void reiser4_key_init(reiser4_key * key);
22137+
22138+/* minimal possible key in the tree. Return pointer to the static storage. */
22139+extern const reiser4_key *reiser4_min_key(void);
22140+extern const reiser4_key *reiser4_max_key(void);
22141+
22142+/* helper macro for keycmp() */
22143+#define KEY_DIFF(k1, k2, field) \
22144+({ \
22145+ typeof (get_key_ ## field (k1)) f1; \
22146+ typeof (get_key_ ## field (k2)) f2; \
22147+ \
22148+ f1 = get_key_ ## field (k1); \
22149+ f2 = get_key_ ## field (k2); \
22150+ \
22151+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22152+})
22153+
22154+/* helper macro for keycmp() */
22155+#define KEY_DIFF_EL(k1, k2, off) \
22156+({ \
22157+ __u64 e1; \
22158+ __u64 e2; \
22159+ \
22160+ e1 = get_key_el(k1, off); \
22161+ e2 = get_key_el(k2, off); \
22162+ \
22163+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22164+})
22165+
22166+/* compare `k1' and `k2'. This function is a heart of "key allocation
22167+ policy". All you need to implement new policy is to add yet another
22168+ clause here. */
22169+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22170+ const reiser4_key * k2 /* second key to compare */ )
22171+{
22172+ cmp_t result;
22173+
22174+ /*
22175+ * This function is the heart of reiser4 tree-routines. Key comparison
22176+ * is among most heavily used operations in the file system.
22177+ */
22178+
22179+ assert("nikita-439", k1 != NULL);
22180+ assert("nikita-440", k2 != NULL);
22181+
22182+ /* there is no actual branch here: condition is compile time constant
22183+ * and constant folding and propagation ensures that only one branch
22184+ * is actually compiled in. */
22185+
22186+ if (REISER4_PLANA_KEY_ALLOCATION) {
22187+ /* if physical order of fields in a key is identical
22188+ with logical order, we can implement key comparison
22189+ as three 64bit comparisons. */
22190+ /* logical order of fields in plan-a:
22191+ locality->type->objectid->offset. */
22192+ /* compare locality and type at once */
22193+ result = KEY_DIFF_EL(k1, k2, 0);
22194+ if (result == EQUAL_TO) {
22195+ /* compare objectid (and band if it's there) */
22196+ result = KEY_DIFF_EL(k1, k2, 1);
22197+ /* compare offset */
22198+ if (result == EQUAL_TO) {
22199+ result = KEY_DIFF_EL(k1, k2, 2);
22200+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22201+ result = KEY_DIFF_EL(k1, k2, 3);
22202+ }
22203+ }
22204+ }
22205+ } else if (REISER4_3_5_KEY_ALLOCATION) {
22206+ result = KEY_DIFF(k1, k2, locality);
22207+ if (result == EQUAL_TO) {
22208+ result = KEY_DIFF(k1, k2, objectid);
22209+ if (result == EQUAL_TO) {
22210+ result = KEY_DIFF(k1, k2, type);
22211+ if (result == EQUAL_TO)
22212+ result = KEY_DIFF(k1, k2, offset);
22213+ }
22214+ }
22215+ } else
22216+ impossible("nikita-441", "Unknown key allocation scheme!");
22217+ return result;
22218+}
22219+
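A usage sketch under Plan A (assuming this header; illustration only, and the "demo-*" assertion labels are made up): keys order first by locality/type, then objectid, then offset, so two keys of the same file body differing only in offset compare by offset:

static void keycmp_demo(void)
{
	reiser4_key k1, k2;

	reiser4_key_init(&k1);
	reiser4_key_init(&k2);

	set_key_locality(&k1, 42);
	set_key_locality(&k2, 42);
	set_key_type(&k1, KEY_BODY_MINOR);
	set_key_type(&k2, KEY_BODY_MINOR);
	set_key_objectid(&k1, 1000);
	set_key_objectid(&k2, 1000);
	set_key_offset(&k1, 0);
	set_key_offset(&k2, 4096);

	assert("demo-1", keycmp(&k1, &k2) == LESS_THAN);
	assert("demo-2", keylt(&k1, &k2) && !keyeq(&k1, &k2));
}
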
22220+/* true if @k1 equals @k2 */
22221+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22222+ const reiser4_key * k2 /* second key to compare */ )
22223+{
22224+ assert("nikita-1879", k1 != NULL);
22225+ assert("nikita-1880", k2 != NULL);
22226+ return !memcmp(k1, k2, sizeof *k1);
22227+}
22228+
22229+/* true if @k1 is less than @k2 */
22230+static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22231+ const reiser4_key * k2 /* second key to compare */ )
22232+{
22233+ assert("nikita-1952", k1 != NULL);
22234+ assert("nikita-1953", k2 != NULL);
22235+ return keycmp(k1, k2) == LESS_THAN;
22236+}
22237+
22238+/* true if @k1 is less than or equal to @k2 */
22239+static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22240+ const reiser4_key * k2 /* second key to compare */ )
22241+{
22242+ assert("nikita-1954", k1 != NULL);
22243+ assert("nikita-1955", k2 != NULL);
22244+ return keycmp(k1, k2) != GREATER_THAN;
22245+}
22246+
22247+/* true if @k1 is greater than @k2 */
22248+static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22249+ const reiser4_key * k2 /* second key to compare */ )
22250+{
22251+ assert("nikita-1959", k1 != NULL);
22252+ assert("nikita-1960", k2 != NULL);
22253+ return keycmp(k1, k2) == GREATER_THAN;
22254+}
22255+
22256+/* true if @k1 is greater than or equal to @k2 */
22257+static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22258+ const reiser4_key * k2 /* second key to compare */ )
22259+{
22260+ assert("nikita-1956", k1 != NULL);
22261+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22262+ * November 3: Laika */
22263+ return keycmp(k1, k2) != LESS_THAN;
22264+}
22265+
22266+static inline void prefetchkey(reiser4_key * key)
22267+{
22268+ prefetch(key);
22269+ prefetch(&key->el[KEY_CACHELINE_END]);
22270+}
22271+
22272+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22273+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22274+/* size of a buffer suitable to hold human readable key representation */
22275+#define KEY_BUF_LEN (80)
22276+
22277+#if REISER4_DEBUG
22278+extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22279+#else
22280+#define reiser4_print_key(p,k) noop
22281+#endif
22282+
22283+/* __FS_REISERFS_KEY_H__ */
22284+#endif
22285+
22286+/* Make Linus happy.
22287+ Local variables:
22288+ c-indentation-style: "K&R"
22289+ mode-name: "LC"
22290+ c-basic-offset: 8
22291+ tab-width: 8
22292+ fill-column: 120
22293+ End:
22294+*/
22295diff -urN linux-2.6.22.orig/fs/reiser4/ktxnmgrd.c linux-2.6.22/fs/reiser4/ktxnmgrd.c
22296--- linux-2.6.22.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 03:00:00.000000000 +0300
22297+++ linux-2.6.22/fs/reiser4/ktxnmgrd.c 2007-07-29 00:25:34.884698547 +0400
22298@@ -0,0 +1,215 @@
22299+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22300+/* Transaction manager daemon. */
22301+
22302+/*
22303+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22304+ * needed/important for the following reasons:
22305+ *
22306+ * 1. in reiser4 atom is not committed immediately when last transaction
22307+ * handle closes, unless atom is either too old or too large (see
22308+ * atom_should_commit()). This is done to avoid committing too frequently,
22309+ * and because:
22310+ *
22311+ * 2. sometimes we don't want to commit atom when closing last transaction
22312+ * handle even if it is old and fat enough. For example, because we are at
22313+ * this point under directory semaphore, and committing would stall all
22314+ * accesses to this directory.
22315+ *
22316+ * ktxnmgrd bides its time sleeping on a condition variable. When it awakes,
22317+ * either due to a (tunable) timeout or because it was explicitly woken up by
22318+ * a call to ktxnmgrd_kick(), it scans the list of all atoms and commits the
22319+ * eligible ones.
22320+ *
22321+ */
22322+
22323+#include "debug.h"
22324+#include "txnmgr.h"
22325+#include "tree.h"
22326+#include "ktxnmgrd.h"
22327+#include "super.h"
22328+#include "reiser4.h"
22329+
22330+#include <linux/sched.h> /* for struct task_struct */
22331+#include <linux/wait.h>
22332+#include <linux/suspend.h>
22333+#include <linux/kernel.h>
22334+#include <linux/writeback.h>
22335+#include <linux/kthread.h>
22336+#include <linux/freezer.h>
44254afd
MT
22337+
22338+static int scan_mgr(struct super_block *);
22339+
22340+/*
22341+ * change current->comm so that ps, top, and friends will see the changed
22342+ * state. This serves no useful purpose whatsoever, but also costs nothing.
22343+ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22344+ */
22345+#define set_comm( state ) \
22346+ snprintf( current -> comm, sizeof( current -> comm ), \
22347+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22348+
22349+/**
22350+ * ktxnmgrd - kernel txnmgr daemon
22351+ * @arg: pointer to super block
22352+ *
22353+ * The background transaction manager daemon, started as a kernel thread during
22354+ * reiser4 initialization.
22355+ */
22356+static int ktxnmgrd(void *arg)
22357+{
22358+ struct super_block *super;
22359+ ktxnmgrd_context *ctx;
22360+ txn_mgr *mgr;
22361+ int done = 0;
22362+
22363+ super = arg;
22364+ mgr = &get_super_private(super)->tmgr;
22365+
22366+ /*
22367+ * do_fork() just copies task_struct into the new thread. ->fs_context
22368+ * shouldn't be copied of course. This shouldn't be a problem for the
22369+ * rest of the code though.
22370+ */
22371+ current->journal_info = NULL;
22372+ ctx = mgr->daemon;
22373+ while (1) {
22374+ try_to_freeze();
22375+ set_comm("wait");
22376+ {
22377+ DEFINE_WAIT(__wait);
22378+
22379+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22380+ if (kthread_should_stop()) {
22381+ done = 1;
22382+ } else
22383+ schedule_timeout(ctx->timeout);
22384+ finish_wait(&ctx->wait, &__wait);
22385+ }
22386+ if (done)
22387+ break;
22388+ set_comm("run");
22389+ spin_lock(&ctx->guard);
22390+ /*
22391+ * wait timed out or ktxnmgrd was woken up by explicit request
22392+ * to commit something. Scan list of atoms in txnmgr and look
22393+ * for too old atoms.
22394+ */
22395+ do {
22396+ ctx->rescan = 0;
+ /* drop the guard: scan_mgr() may block while committing atoms */
+ spin_unlock(&ctx->guard);
22397+ scan_mgr(super);
22398+ spin_lock(&ctx->guard);
22399+ if (ctx->rescan) {
22400+ /*
22401+ * the list could be modified while ctx
22402+ * spinlock was released, we have to repeat
22403+ * scanning from the beginning
22404+ */
22405+ break;
22406+ }
22407+ } while (ctx->rescan);
22408+ spin_unlock(&ctx->guard);
22409+ }
22410+ return 0;
22411+}
22412+
22413+#undef set_comm
22414+
22415+/**
22416+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22417+ * @super: pointer to super block
22418+ *
22419+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22420+ * manager. Starts kernel txnmgr daemon. This is called on mount.
22421+ */
22422+int reiser4_init_ktxnmgrd(struct super_block *super)
22423+{
22424+ txn_mgr *mgr;
22425+ ktxnmgrd_context *ctx;
22426+
22427+ mgr = &get_super_private(super)->tmgr;
22428+
22429+ assert("zam-1014", mgr->daemon == NULL);
22430+
22431+ ctx = kmalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22432+ if (ctx == NULL)
22433+ return RETERR(-ENOMEM);
22434+
22435+ assert("nikita-2442", ctx != NULL);
22436+
22437+ memset(ctx, 0, sizeof *ctx);
22438+ init_waitqueue_head(&ctx->wait);
22439+
22440+ /*kcond_init(&ctx->startup);*/
22441+ spin_lock_init(&ctx->guard);
22442+ ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22443+ ctx->rescan = 1;
22444+ mgr->daemon = ctx;
22445+
22446+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22447+ if (IS_ERR(ctx->tsk)) {
22448+ int ret = PTR_ERR(ctx->tsk);
22449+ mgr->daemon = NULL;
22450+ kfree(ctx);
22451+ return RETERR(ret);
22452+ }
22453+ return 0;
22454+}
22455+
22456+void ktxnmgrd_kick(txn_mgr *mgr)
22457+{
22458+ assert("nikita-3234", mgr != NULL);
22459+ assert("nikita-3235", mgr->daemon != NULL);
22460+ wake_up(&mgr->daemon->wait);
22461+}
22462+
22463+int is_current_ktxnmgrd(void)
22464+{
22465+ return (get_current_super_private()->tmgr.daemon->tsk == current);
22466+}
22467+
22468+/**
22469+ * scan_mgr - commit atoms which are to be committed
22470+ * @super: super block to commit atoms of
22471+ *
22472+ * Commits old atoms.
22473+ */
22474+static int scan_mgr(struct super_block *super)
22475+{
22476+ int ret;
22477+ reiser4_context ctx;
22478+
22479+ init_stack_context(&ctx, super);
22480+
22481+ ret = commit_some_atoms(&get_super_private(super)->tmgr);
22482+
22483+ reiser4_exit_context(&ctx);
22484+ return ret;
22485+}
22486+
22487+/**
22488+ * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22489+ * @mgr:
22490+ *
22491+ * This is called on umount. Stops ktxnmgrd and frees the ktxnmgrd context.
22492+ */
22493+void reiser4_done_ktxnmgrd(struct super_block *super)
22494+{
22495+ txn_mgr *mgr;
22496+
22497+ mgr = &get_super_private(super)->tmgr;
22498+ assert("zam-1012", mgr->daemon != NULL);
22499+
22500+ kthread_stop(mgr->daemon->tsk);
22501+ kfree(mgr->daemon);
22502+ mgr->daemon = NULL;
22503+}
22504+
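Taken together, a hypothetical caller wires the daemon into the mount, commit, and umount paths roughly like this (sketch only; example_mount is made up, the reiser4_* and ktxnmgrd_kick() calls are as declared above):

static int example_mount(struct super_block *sb)
{
	int ret;

	ret = reiser4_init_ktxnmgrd(sb);         /* start daemon at mount */
	if (ret != 0)
		return ret;

	/* ... later, when an atom should be committed soon: */
	ktxnmgrd_kick(&get_super_private(sb)->tmgr);

	/* ... and at umount: */
	reiser4_done_ktxnmgrd(sb);
	return 0;
}
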
22505+/*
22506+ * Local variables:
22507+ * c-indentation-style: "K&R"
22508+ * mode-name: "LC"
22509+ * c-basic-offset: 8
22510+ * tab-width: 8
22511+ * fill-column: 120
22512+ * End:
22513+ */
22514diff -urN linux-2.6.22.orig/fs/reiser4/ktxnmgrd.h linux-2.6.22/fs/reiser4/ktxnmgrd.h
22515--- linux-2.6.22.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 03:00:00.000000000 +0300
22516+++ linux-2.6.22/fs/reiser4/ktxnmgrd.h 2007-07-29 00:25:34.884698547 +0400
22517@@ -0,0 +1,52 @@
22518+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22519+ * reiser4/README */
22520+
22521+/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22522+
22523+#ifndef __KTXNMGRD_H__
22524+#define __KTXNMGRD_H__
22525+
22526+#include "txnmgr.h"
22527+
22528+#include <linux/fs.h>
22529+#include <linux/wait.h>
22530+#include <linux/completion.h>
22531+#include <linux/spinlock.h>
22532+#include <asm/atomic.h>
22533+#include <linux/sched.h> /* for struct task_struct */
22534+
22535+/* in this structure all data necessary to start up, shut down and communicate
22536+ * with ktxnmgrd are kept. */
22537+struct ktxnmgrd_context {
22538+ /* wait queue head on which ktxnmgrd sleeps */
22539+ wait_queue_head_t wait;
22540+ /* spin lock protecting all fields of this structure */
22541+ spinlock_t guard;
22542+ /* timeout of sleeping on ->wait */
22543+ signed long timeout;
22544+ /* kernel thread running ktxnmgrd */
22545+ struct task_struct *tsk;
22546+ /* list of all file systems served by this ktxnmgrd */
22547+ struct list_head queue;
22548+ /* should ktxnmgrd repeat scanning of atoms? */
22549+ unsigned int rescan:1;
22550+};
22551+
22552+extern int reiser4_init_ktxnmgrd(struct super_block *);
22553+extern void reiser4_done_ktxnmgrd(struct super_block *);
22554+
22555+extern void ktxnmgrd_kick(txn_mgr * mgr);
22556+extern int is_current_ktxnmgrd(void);
22557+
22558+/* __KTXNMGRD_H__ */
22559+#endif
22560+
22561+/* Make Linus happy.
22562+ Local variables:
22563+ c-indentation-style: "K&R"
22564+ mode-name: "LC"
22565+ c-basic-offset: 8
22566+ tab-width: 8
22567+ fill-column: 120
22568+ End:
22569+*/
22570diff -urN linux-2.6.22.orig/fs/reiser4/lock.c linux-2.6.22/fs/reiser4/lock.c
22571--- linux-2.6.22.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22572+++ linux-2.6.22/fs/reiser4/lock.c 2007-07-29 00:25:34.884698547 +0400
22573@@ -0,0 +1,1232 @@
22574+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22575+ * reiser4/README */
22576+
22577+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22578+ order. V4 balances the tree from the bottom up, and searches the tree from
22579+ the top down, and that is really the way we want it, so tradition won't work
22580+ for us.
22581+
22582+ Instead we have two lock orderings, a high priority lock ordering, and a low
22583+ priority lock ordering. Each node in the tree has a lock in its znode.
22584+
22585+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22586+ has a set (maybe empty) of already locked nodes ("process locked set"). Each
22587+ process may have a pending lock request to a node locked by another process.
22588+ Note: we lock and unlock, but do not transfer locks: it is possible
22589+ transferring locks instead would save some bus locking....
22590+
22591+ Deadlock occurs when we have a loop constructed from process locked sets and
22592+ lock request vectors.
22593+
22594+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22595+ memory is extended with "znodes" with which we connect nodes with their left
22596+ and right neighbors using sibling pointers stored in the znodes. When we
22597+ perform balancing operations we often go from left to right and from right to
22598+ left.
22599+
22600+ +-P1-+ +-P3-+
22601+ |+--+| V1 |+--+|
22602+ ||N1|| -------> ||N3||
22603+ |+--+| |+--+|
22604+ +----+ +----+
22605+ ^ |
22606+ |V2 |V3
22607+ | v
22608+ +---------P2---------+
22609+ |+--+ +--+|
22610+ ||N2| -------- |N4||
22611+ |+--+ +--+|
22612+ +--------------------+
22613+
22614+ We solve this by ensuring that only low priority processes lock in top to
22615+ bottom order and from right to left, and high priority processes lock from
22616+ bottom to top and left to right.
22617+
22618+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22619+ kill those damn busy loops.
22620+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22621+ stage) cannot be ordered that way. There are no rules what nodes can belong
22622+ to the atom and what nodes cannot. We cannot define what is right or left
22623+ direction, what is top or bottom. We can take immediate parent or side
22624+ neighbor of one node, but nobody guarantees that, say, left neighbor node is
22625+ not a far right neighbor for other nodes from the same atom. It breaks
22626+ deadlock avoidance rules and hi-low priority locking cannot be applied for
22627+ atom locks.
22628+
22629+ How does it help to avoid deadlocks ?
22630+
22631+ Suppose we have a deadlock with n processes. Processes from one priority
22632+ class never deadlock because they take locks in one consistent
22633+ order.
22634+
22635+ So, any possible deadlock loop must have low priority as well as high
22636+ priority processes. There are no other lock priority levels except low and
22637+ high. We know that any deadlock loop contains at least one node locked by a
22638+ low priority process and requested by a high priority process. If this
22639+ situation is caught and resolved it is sufficient to avoid deadlocks.
22640+
22641+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22642+
22643+ The deadlock prevention algorithm is based on comparing
22644+ priorities of node owners (processes which keep znode locked) and
22645+ requesters (processes which want to acquire a lock on znode). We
22646+ implement a scheme where low-priority owners yield locks to
22647+ high-priority requesters. We created a signal passing system that
22648+ is used to ask low-priority processes to yield one or more locked
22649+ znodes.
22650+
22651+ The condition when a znode needs to change its owners is described by the
22652+ following formula:
22653+
22654+ #############################################
22655+ # #
22656+ # (number of high-priority requesters) > 0 #
22657+ # AND #
22658+   #  (number of high-priority owners) == 0     #
22659+ # #
22660+ #############################################
22661+
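   In terms of the zlock counters declared in lock.h this is a single test. A
   minimal sketch (the helper name here is ours; the real check is
   check_deadlock_condition() further down in this file):

	static inline int znode_needs_new_owners(const zlock *lock)
	{
		/* high-priority requesters are queued, but no high-priority
		   owner is present: low-priority owners must yield */
		return lock->nr_hipri_requests > 0 &&
		       lock->nr_hipri_owners == 0;
	}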
22662+   Note that a low-priority process delays releasing a node if another
22663+   high-priority process owns this node. So, slightly more strictly speaking,
22664+ to have a deadlock capable cycle you must have a loop in which a high
22665+ priority process is waiting on a low priority process to yield a node, which
22666+ is slightly different from saying a high priority process is waiting on a
22667+ node owned by a low priority process.
22668+
22669+ It is enough to avoid deadlocks if we prevent any low-priority process from
22670+ falling asleep if its locked set contains a node which satisfies the
22671+ deadlock condition.
22672+
22673+   That condition is implicitly or explicitly checked in all places where new
22674+   high-priority requests may be added to or removed from a node's request
22675+   queue, or where a high-priority process takes or releases a lock on a node.
22676+   The main goal of these checks is to never miss the moment when a node
22677+   acquires "wrong owners", and to send "must-yield-this-lock" signals to its
22678+   low-priority owners at that time.
22679+
22680+   The information about received signals is stored in the per-process
22681+   structure (lock stack) and analyzed before a low-priority process goes to
22682+   sleep, but after a "fast" attempt to lock a node fails. Any signal wakes
22683+   the sleeping process up and forces it to re-check the lock status and the
22684+   received-signal info. If "must-yield-this-lock" signals were received, the
22685+   locking primitive (longterm_lock_znode()) fails with -E_DEADLOCK.
22686+
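   A hedged caller-side sketch (this loop is not part of the patch; only the
   API names are real): on -E_DEADLOCK the caller releases its long-term locks
   until reiser4_check_deadlock() reports no pending signals, then retries:

	lock_handle lh;
	int ret;

	init_lh(&lh);
   again:
	ret = longterm_lock_znode(&lh, node, ZNODE_WRITE_LOCK,
				  ZNODE_LOCK_LOPRI);
	if (ret == -E_DEADLOCK) {
		/* yield: done_lh() the handles we hold while signals remain;
		   iterating over "our held handles" is hypothetical here */
		while (reiser4_check_deadlock())
			done_lh(a_held_handle);
		goto again;
	}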
22687+ V4 LOCKING DRAWBACKS
22688+
22689+ If we have already balanced on one level, and we are propagating our changes
22690+ upward to a higher level, it could be very messy to surrender all locks on
22691+ the lower level because we put so much computational work into it, and
22692+ reverting them to their state before they were locked might be very complex.
22693+ We also don't want to acquire all locks before performing balancing because
22694+ that would either be almost as much work as the balancing, or it would be
22695+ too conservative and lock too much. We want balancing to be done only at
22696+ high priority. Yet, we might want to go to the left one node and use some
22697+ of its empty space... So we make one attempt at getting the node to the left
22698+ using try_lock, and if it fails we do without it, because we didn't really
22699+ need it, it was only a nice to have.
22700+
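   That single attempt maps onto a non-blocking lock request. A hypothetical
   sketch using the ZNODE_LOCK_NONBLOCK flag defined by this patch (the
   neighbor lookup and error handling are illustrative only):

	lock_handle left_lh;
	int ret;

	init_lh(&left_lh);
	ret = longterm_lock_znode(&left_lh, left_neighbor, ZNODE_WRITE_LOCK,
				  ZNODE_LOCK_HIPRI | ZNODE_LOCK_NONBLOCK);
	if (ret == 0) {
		/* got the left neighbor: use its free space, then drop it */
		done_lh(&left_lh);
	}
	/* on -E_REPEAT (or -EINVAL) we simply do without the neighbor */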
22701+ LOCK STRUCTURES DESCRIPTION
22702+
22703+ The following data structures are used in the reiser4 locking
22704+ implementation:
22705+
22706+ All fields related to long-term locking are stored in znode->lock.
22707+
22708+ The lock stack is a per thread object. It owns all znodes locked by the
22709+ thread. One znode may be locked by several threads in case of read lock or
22710+ one znode may be write locked by one thread several times. The special link
22711+ objects (lock handles) support n<->m relation between znodes and lock
22712+ owners.
22713+
22714+ <Thread 1> <Thread 2>
22715+
22716+ +---------+ +---------+
22717+ | LS1 | | LS2 |
22718+ +---------+ +---------+
22719+ ^ ^
22720+ |---------------+ +----------+
22721+ v v v v
22722+ +---------+ +---------+ +---------+ +---------+
22723+ | LH1 | | LH2 | | LH3 | | LH4 |
22724+ +---------+ +---------+ +---------+ +---------+
22725+ ^ ^ ^ ^
22726+ | +------------+ |
22727+ v v v
22728+ +---------+ +---------+ +---------+
22729+ | Z1 | | Z2 | | Z3 |
22730+ +---------+ +---------+ +---------+
22731+
22732+   Thread 1 locked znodes Z1 and Z2; thread 2 locked znodes Z2 and Z3. The
22733+   picture above shows that lock stack LS1 has a list of two lock handles, LH1
22734+   and LH2, and lock stack LS2 has a list with lock handles LH3 and LH4 on it.
22735+   Znode Z1 is locked by only one thread and has only one lock handle, LH1, on
22736+   its list; the situation is similar for Z3, which is locked by thread 2 only.
22737+   Z2 is locked (for read) twice by different threads, so two lock handles are
22738+   on its list. Each lock handle represents a single locking relation between
22739+   a znode and a thread. Locking a znode means establishing such a relation
22740+   between the lock stack and the znode by adding a new lock handle to both
22741+   lists. The lock stack links all lock handles for all znodes locked by that
22742+   lock stack; the znode list groups all lock handles for all lock stacks
22743+   which locked the znode.
22744+
22745+   Yet another relation may exist between a znode and lock owners. If the
22746+   lock procedure cannot immediately take a lock on an object, it adds the
22747+   lock owner to a special `requestors' list belonging to the znode. That
22748+   list represents a queue of pending lock requests. Because one lock owner
22749+   may request only one lock object at a time, this is a 1->n relation
22750+   between lock objects and lock owners, implemented as described above.
22751+   Full information (priority, pointers to lock and link objects) about each
22752+   lock request is stored in the lock owner structure, in its `request' field.
22753+
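   To make these relations concrete, here is a minimal, hypothetical locking
   sequence built from the primitives declared in lock.h:

	lock_handle lh;		/* the link object, usually on the C stack */
	int ret;

	init_lh(&lh);
	ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
				  ZNODE_LOCK_LOPRI);
	if (ret == 0) {
		/* lh now sits on two lists: our lock stack's list of locks
		   and the znode's list of owners */
		/* ... use the node ... */
		done_lh(&lh);	/* longterm_unlock_znode() + handle cleanup */
	}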
22754+   SHORT-TERM LOCKING
22755+
22756+ This is a list of primitive operations over lock stacks / lock handles /
22757+ znodes and locking descriptions for them.
22758+
22759+   1. locking/unlocking, which is done by two list insertions/deletions: one
22760+      to/from the znode's list of lock handles, the other to/from the lock
22761+      stack's list of lock handles. The first insertion is protected by the
22762+      znode->lock.guard spinlock. The list owned by the lock stack can be
22763+      modified only by the thread that owns the lock stack, and nobody else
22764+      can modify/read it, so there is nothing there to protect with a
22765+      spinlock or anything else.
22766+
22767+ 2. adding/removing a lock request to/from znode requesters list. The rule is
22768+ that znode->lock.guard spinlock should be taken for this.
22769+
22770+   3. we can traverse the list of lock handles and use references to the lock
22771+      stacks that locked a given znode only if znode->lock.guard is taken.
22772+
22773+   4. If a lock stack is associated with a znode as a lock requestor or lock
22774+      owner, its existence is guaranteed by the znode->lock.guard spinlock.
22775+      Some of its (the lock stack's) fields should be protected from parallel
22776+      access by two or more threads. Please look at the lock_stack structure
22777+      definition for info on how those fields are protected. */
22778+
22779+/* Znode lock and capturing intertwining. */
22780+/* In the current implementation we capture formatted nodes before locking
22781+   them. Take a look at longterm_lock_znode(): the reiser4_try_capture() call
22782+   precedes the locking request. The longterm_lock_znode() function
22783+   unconditionally captures the znode before even checking locking conditions.
22784+
22785+   Another variant is to capture the znode after locking it. This was not
22786+   tested, but at least one deadlock condition is expected there. One thread
22787+   has locked a znode (Node-1) and calls reiser4_try_capture() for it.
22788+   reiser4_try_capture() sleeps because the znode's atom is in CAPTURE_WAIT
22789+   state. The second thread is a flushing thread whose current atom is the one
22790+   Node-1 belongs to. It wants to lock Node-1 and sleeps because Node-1 is
22791+   locked by the first thread. The described situation is a deadlock. */
22792+
22793+#include "debug.h"
22794+#include "txnmgr.h"
22795+#include "znode.h"
22796+#include "jnode.h"
22797+#include "tree.h"
22798+#include "plugin/node/node.h"
22799+#include "super.h"
22800+
22801+#include <linux/spinlock.h>
22802+
22803+#if REISER4_DEBUG
22804+static int request_is_deadlock_safe(znode *, znode_lock_mode,
22805+ znode_lock_request);
22806+#endif
22807+
22808+/* Returns a lock owner associated with current thread */
22809+lock_stack *get_current_lock_stack(void)
22810+{
22811+ return &get_current_context()->stack;
22812+}
22813+
22814+/* Wakes up all low priority owners informing them about possible deadlock */
22815+static void wake_up_all_lopri_owners(znode * node)
22816+{
22817+ lock_handle *handle;
22818+
22819+ assert_spin_locked(&(node->lock.guard));
22820+ list_for_each_entry(handle, &node->lock.owners, owners_link) {
22821+ assert("nikita-1832", handle->node == node);
22822+ /* count this signal in owner->nr_signaled */
22823+ if (!handle->signaled) {
22824+ handle->signaled = 1;
22825+ atomic_inc(&handle->owner->nr_signaled);
22826+ /* Wake up a single process */
22827+ reiser4_wake_up(handle->owner);
22828+ }
22829+ }
22830+}
22831+
22832+/* Adds a lock to a lock owner, which means creating a link to the lock and
22833+   putting the link into the two lists all links are on (the doubly linked
22834+   list that forms the lock_stack, and the doubly linked list of links
22835+   attached to a lock).
22836+*/
22837+static inline void
22838+link_object(lock_handle * handle, lock_stack * owner, znode * node)
22839+{
22840+ assert("jmacd-810", handle->owner == NULL);
22841+ assert_spin_locked(&(node->lock.guard));
22842+
22843+ handle->owner = owner;
22844+ handle->node = node;
22845+
22846+ assert("reiser4-4",
22847+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22848+
22849+ /* add lock handle to the end of lock_stack's list of locks */
22850+ list_add_tail(&handle->locks_link, &owner->locks);
22851+ ON_DEBUG(owner->nr_locks++);
22852+	reiser4_ctx_gfp_mask_set();
22853+
22854+ /* add lock handle to the head of znode's list of owners */
22855+ list_add(&handle->owners_link, &node->lock.owners);
22856+ handle->signaled = 0;
22857+}
22858+
22859+/* Breaks a relation between a lock and its owner */
22860+static inline void unlink_object(lock_handle * handle)
22861+{
22862+ assert("zam-354", handle->owner != NULL);
22863+ assert("nikita-1608", handle->node != NULL);
22864+ assert_spin_locked(&(handle->node->lock.guard));
22865+ assert("nikita-1829", handle->owner == get_current_lock_stack());
22866+ assert("reiser4-5", handle->owner->nr_locks > 0);
22867+
22868+ /* remove lock handle from lock_stack's list of locks */
22869+ list_del(&handle->locks_link);
22870+ ON_DEBUG(handle->owner->nr_locks--);
22871+	reiser4_ctx_gfp_mask_set();
22872+ assert("reiser4-6",
22873+ ergo(list_empty_careful(&handle->owner->locks),
22874+ handle->owner->nr_locks == 0));
22875+ /* remove lock handle from znode's list of owners */
22876+ list_del(&handle->owners_link);
22877+ /* indicates that lock handle is free now */
22878+ handle->node = NULL;
22879+#if REISER4_DEBUG
22880+ INIT_LIST_HEAD(&handle->locks_link);
22881+ INIT_LIST_HEAD(&handle->owners_link);
22882+ handle->owner = NULL;
22883+#endif
22884+}
22885+
22886+/* Actually locks an object knowing that we are able to do this */
22887+static void lock_object(lock_stack * owner)
22888+{
22889+	struct lock_request *request;
22890+ znode *node;
22891+
22892+ request = &owner->request;
22893+ node = request->node;
22894+ assert_spin_locked(&(node->lock.guard));
22895+ if (request->mode == ZNODE_READ_LOCK) {
22896+ node->lock.nr_readers++;
22897+ } else {
22898+		/* check that we didn't switch from read to write lock */
22899+ assert("nikita-1840", node->lock.nr_readers <= 0);
22900+ /* We allow recursive locking; a node can be locked several
22901+ times for write by same process */
22902+ node->lock.nr_readers--;
22903+ }
22904+
22905+ link_object(request->handle, owner, node);
22906+
22907+ if (owner->curpri) {
22908+ node->lock.nr_hipri_owners++;
22909+ }
22910+}
22911+
22912+/* Check for recursive write locking */
22913+static int recursive(lock_stack * owner)
22914+{
22915+ int ret;
22916+ znode *node;
22917+ lock_handle *lh;
22918+
22919+ node = owner->request.node;
22920+
22921+ /* Owners list is not empty for a locked node */
22922+ assert("zam-314", !list_empty_careful(&node->lock.owners));
22923+ assert("nikita-1841", owner == get_current_lock_stack());
22924+ assert_spin_locked(&(node->lock.guard));
22925+
22926+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
22927+ ret = (lh->owner == owner);
22928+
22929+	/* Recursive read locking should be done the usual way */
22930+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
22931+ /* mixing of read/write locks is not allowed */
22932+ assert("zam-341", !ret || znode_is_wlocked(node));
22933+
22934+ return ret;
22935+}
22936+
22937+#if REISER4_DEBUG
22938+/* Returns true if the lock is held by the calling thread. */
22939+int znode_is_any_locked(const znode * node)
22940+{
22941+ lock_handle *handle;
22942+ lock_stack *stack;
22943+ int ret;
22944+
22945+ if (!znode_is_locked(node)) {
22946+ return 0;
22947+ }
22948+
22949+ stack = get_current_lock_stack();
22950+
22951+ spin_lock_stack(stack);
22952+
22953+ ret = 0;
22954+
22955+ list_for_each_entry(handle, &stack->locks, locks_link) {
22956+ if (handle->node == node) {
22957+ ret = 1;
22958+ break;
22959+ }
22960+ }
22961+
22962+ spin_unlock_stack(stack);
22963+
22964+ return ret;
22965+}
22966+
22967+#endif
22968+
22969+/* Returns true if a write lock is held by the calling thread. */
22970+int znode_is_write_locked(const znode * node)
22971+{
22972+ lock_stack *stack;
22973+ lock_handle *handle;
22974+
22975+ assert("jmacd-8765", node != NULL);
22976+
22977+ if (!znode_is_wlocked(node)) {
22978+ return 0;
22979+ }
22980+
22981+ stack = get_current_lock_stack();
22982+
22983+ /*
22984+ * When znode is write locked, all owner handles point to the same lock
22985+ * stack. Get pointer to lock stack from the first lock handle from
22986+ * znode's owner list
22987+ */
22988+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
22989+
22990+ return (handle->owner == stack);
22991+}
22992+
22993+/* This "deadlock" condition is the essential part of reiser4 locking
22994+ implementation. This condition is checked explicitly by calling
22995+ check_deadlock_condition() or implicitly in all places where znode lock
22996+ state (set of owners and request queue) is changed. Locking code is
22997+ designed to use this condition to trigger procedure of passing object from
22998+ low priority owner(s) to high priority one(s).
22999+
23000+ The procedure results in passing an event (setting lock_handle->signaled
23001+ flag) and counting this event in nr_signaled field of owner's lock stack
23002+ object and wakeup owner's process.
23003+*/
23004+static inline int check_deadlock_condition(znode * node)
23005+{
23006+ assert_spin_locked(&(node->lock.guard));
23007+ return node->lock.nr_hipri_requests > 0
23008+ && node->lock.nr_hipri_owners == 0;
23009+}
23010+
23011+static int check_livelock_condition(znode * node, znode_lock_mode mode)
23012+{
23013+ zlock * lock = &node->lock;
23014+
23015+ return mode == ZNODE_READ_LOCK &&
23016+ lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23017+}
23018+
23019+/* checks lock/request compatibility */
23020+static int can_lock_object(lock_stack * owner)
23021+{
23022+ znode *node = owner->request.node;
23023+
23024+ assert_spin_locked(&(node->lock.guard));
23025+
23026+ /* See if the node is disconnected. */
23027+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23028+ return RETERR(-EINVAL);
23029+
23030+	/* Do not ever try to take a lock if we are going in the low priority
23031+	   direction and the node has a high priority request without high
23032+	   priority owners. */
23033+ if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23034+ return RETERR(-E_REPEAT);
23035+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23036+ return RETERR(-E_REPEAT);
23037+ if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23038+ return RETERR(-E_REPEAT);
23039+ return 0;
23040+}
23041+
23042+/* Sets a high priority for the process. It clears "signaled" flags because
23043+   a znode locked by a high-priority process can't satisfy our "deadlock
23044+   condition". */
23045+static void set_high_priority(lock_stack * owner)
23046+{
23047+ assert("nikita-1846", owner == get_current_lock_stack());
23048+ /* Do nothing if current priority is already high */
23049+ if (!owner->curpri) {
23050+ /* We don't need locking for owner->locks list, because, this
23051+ * function is only called with the lock stack of the current
23052+ * thread, and no other thread can play with owner->locks list
23053+ * and/or change ->node pointers of lock handles in this list.
23054+ *
23055+ * (Interrupts also are not involved.)
23056+ */
23057+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23058+ while (&owner->locks != &item->locks_link) {
23059+ znode *node = item->node;
23060+
23061+ spin_lock_zlock(&node->lock);
23062+
23063+ node->lock.nr_hipri_owners++;
23064+
23065+			/* we can safely set signaled to zero, because the
23066+			   previous statement (nr_hipri_owners++) guarantees
23067+			   that signaled will never be set again. */
23068+ item->signaled = 0;
23069+ spin_unlock_zlock(&node->lock);
23070+
23071+ item = list_entry(item->locks_link.next, lock_handle, locks_link);
23072+ }
23073+ owner->curpri = 1;
23074+ atomic_set(&owner->nr_signaled, 0);
23075+ }
23076+}
23077+
23078+/* Sets a low priority for the process. */
23079+static void set_low_priority(lock_stack * owner)
23080+{
23081+ assert("nikita-3075", owner == get_current_lock_stack());
23082+ /* Do nothing if current priority is already low */
23083+ if (owner->curpri) {
23084+ /* scan all locks (lock handles) held by @owner, which is
23085+ actually current thread, and check whether we are reaching
23086+ deadlock possibility anywhere.
23087+ */
23088+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23089+ while (&owner->locks != &handle->locks_link) {
23090+ znode *node = handle->node;
23091+ spin_lock_zlock(&node->lock);
23092+ /* this thread just was hipri owner of @node, so
23093+ nr_hipri_owners has to be greater than zero. */
23094+ assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23095+ node->lock.nr_hipri_owners--;
23096+			/* If we have a deadlock condition, adjust the
23097+			   nr_signaled field. It is enough to set the "signaled"
23098+			   flag only for the current process; other low-pri
23099+			   owners will be signaled and woken up after the
23100+			   current process unlocks this object and any
23101+			   high-priority requestor takes control. */
23102+ if (check_deadlock_condition(node)
23103+ && !handle->signaled) {
23104+ handle->signaled = 1;
23105+ atomic_inc(&owner->nr_signaled);
23106+ }
23107+ spin_unlock_zlock(&node->lock);
23108+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23109+ }
23110+ owner->curpri = 0;
23111+ }
23112+}
23113+
23114+static void remove_lock_request(lock_stack * requestor)
23115+{
23116+ zlock * lock = &requestor->request.node->lock;
23117+
23118+ if (requestor->curpri) {
23119+ assert("nikita-1838", lock->nr_hipri_requests > 0);
23120+ lock->nr_hipri_requests--;
23121+ if (requestor->request.mode == ZNODE_WRITE_LOCK)
23122+ lock->nr_hipri_write_requests --;
23123+ }
23124+ list_del(&requestor->requestors_link);
23125+}
23126+
23127+static void invalidate_all_lock_requests(znode * node)
23128+{
23129+ lock_stack *requestor, *tmp;
23130+
23131+ assert_spin_locked(&(node->lock.guard));
23132+
23133+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23134+ remove_lock_request(requestor);
23135+ requestor->request.ret_code = -EINVAL;
23136+ reiser4_wake_up(requestor);
23137+ requestor->request.mode = ZNODE_NO_LOCK;
23138+ }
23139+}
23140+
23141+static void dispatch_lock_requests(znode * node)
23142+{
23143+ lock_stack *requestor, *tmp;
23144+
23145+ assert_spin_locked(&(node->lock.guard));
23146+
23147+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23148+ if (znode_is_write_locked(node))
23149+ break;
23150+ if (!can_lock_object(requestor)) {
23151+ lock_object(requestor);
23152+ remove_lock_request(requestor);
23153+ requestor->request.ret_code = 0;
23154+ reiser4_wake_up(requestor);
23155+ requestor->request.mode = ZNODE_NO_LOCK;
23156+ }
23157+ }
23158+}
23159+
23160+/* release long-term lock, acquired by longterm_lock_znode() */
23161+void longterm_unlock_znode(lock_handle * handle)
23162+{
23163+ znode *node = handle->node;
23164+ lock_stack *oldowner = handle->owner;
23165+ int hipri;
23166+ int readers;
23167+ int rdelta;
23168+ int youdie;
23169+
23170+ /*
23171+ * this is time-critical and highly optimized code. Modify carefully.
23172+ */
23173+
23174+ assert("jmacd-1021", handle != NULL);
23175+ assert("jmacd-1022", handle->owner != NULL);
23176+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23177+
23178+ assert("zam-130", oldowner == get_current_lock_stack());
23179+
23180+ LOCK_CNT_DEC(long_term_locked_znode);
23181+
23182+ /*
23183+ * to minimize amount of operations performed under lock, pre-compute
23184+ * all variables used within critical section. This makes code
23185+ * obscure.
23186+ */
23187+
23188+ /* was this lock of hi or lo priority */
23189+	hipri = oldowner->curpri ? 1 : 0;
23190+ /* number of readers */
23191+ readers = node->lock.nr_readers;
23192+ /* +1 if write lock, -1 if read lock */
23193+ rdelta = (readers > 0) ? -1 : +1;
23194+ /* true if node is to die and write lock is released */
23195+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23196+
23197+ spin_lock_zlock(&node->lock);
23198+
23199+ assert("zam-101", znode_is_locked(node));
23200+
23201+ /* Adjust a number of high priority owners of this lock */
23202+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23203+ node->lock.nr_hipri_owners -= hipri;
23204+
23205+ /* Handle znode deallocation on last write-lock release. */
23206+ if (znode_is_wlocked_once(node)) {
23207+ if (youdie) {
23208+ forget_znode(handle);
23209+ assert("nikita-2191", znode_invariant(node));
23210+ zput(node);
23211+ return;
23212+ }
23213+ }
23214+
23215+ if (handle->signaled)
23216+ atomic_dec(&oldowner->nr_signaled);
23217+
23218+ /* Unlocking means owner<->object link deletion */
23219+ unlink_object(handle);
23220+
23221+ /* This is enough to be sure whether an object is completely
23222+ unlocked. */
23223+ node->lock.nr_readers += rdelta;
23224+
23225+ /* If the node is locked it must have an owners list. Likewise, if
23226+ the node is unlocked it must have an empty owners list. */
23227+ assert("zam-319", equi(znode_is_locked(node),
23228+ !list_empty_careful(&node->lock.owners)));
23229+
23230+#if REISER4_DEBUG
23231+ if (!znode_is_locked(node))
23232+ ++node->times_locked;
23233+#endif
23234+
23235+ /* If there are pending lock requests we wake up a requestor */
23236+ if (!znode_is_wlocked(node))
23237+ dispatch_lock_requests(node);
23238+ if (check_deadlock_condition(node))
23239+ wake_up_all_lopri_owners(node);
23240+ spin_unlock_zlock(&node->lock);
23241+
23242+ /* minus one reference from handle->node */
23243+ assert("nikita-2190", znode_invariant(node));
23244+ ON_DEBUG(check_lock_data());
23245+ ON_DEBUG(check_lock_node_data(node));
23246+ zput(node);
23247+}
23248+
23249+/* final portion of longterm-lock */
23250+static int
23251+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23252+{
23253+ znode *node = owner->request.node;
23254+
23255+ assert_spin_locked(&(node->lock.guard));
23256+
23257+ /* If we broke with (ok == 0) it means we can_lock, now do it. */
23258+ if (ok == 0) {
23259+ lock_object(owner);
23260+ owner->request.mode = 0;
23261+ /* count a reference from lockhandle->node
23262+
23263+ znode was already referenced at the entry to this function,
23264+ hence taking spin-lock here is not necessary (see comment
23265+ in the zref()).
23266+ */
23267+ zref(node);
23268+
23269+ LOCK_CNT_INC(long_term_locked_znode);
23270+ }
23271+ spin_unlock_zlock(&node->lock);
23272+ ON_DEBUG(check_lock_data());
23273+ ON_DEBUG(check_lock_node_data(node));
23274+ return ok;
23275+}
23276+
23277+/*
23278+ * version of longterm_lock_znode() optimized for the most common case: read
23279+ * lock without any special flags. This is the kind of lock that any tree
23280+ * traversal takes on the root node of the tree, which is very frequent.
23281+ */
23282+static int longterm_lock_tryfast(lock_stack * owner)
23283+{
23284+ int result;
23285+ znode *node;
23286+ zlock *lock;
23287+
23288+ node = owner->request.node;
23289+ lock = &node->lock;
23290+
23291+	assert("nikita-3340", reiser4_schedulable());
23292+ assert("nikita-3341", request_is_deadlock_safe(node,
23293+ ZNODE_READ_LOCK,
23294+ ZNODE_LOCK_LOPRI));
23295+ spin_lock_zlock(lock);
23296+ result = can_lock_object(owner);
23297+ spin_unlock_zlock(lock);
23298+
23299+ if (likely(result != -EINVAL)) {
23300+ spin_lock_znode(node);
23301+		result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23302+ spin_unlock_znode(node);
23303+ spin_lock_zlock(lock);
23304+ if (unlikely(result != 0)) {
23305+ owner->request.mode = 0;
23306+ } else {
23307+ result = can_lock_object(owner);
23308+ if (unlikely(result == -E_REPEAT)) {
23309+ /* fall back to longterm_lock_znode() */
23310+ spin_unlock_zlock(lock);
23311+ return 1;
23312+ }
23313+ }
23314+ return lock_tail(owner, result, ZNODE_READ_LOCK);
23315+ } else
23316+ return 1;
23317+}
23318+
23319+/* locks given lock object */
23320+int longterm_lock_znode(
23321+ /* local link object (allocated by lock owner thread, usually on its own
23322+ * stack) */
23323+ lock_handle * handle,
23324+ /* znode we want to lock. */
23325+ znode * node,
23326+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23327+ znode_lock_mode mode,
23328+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23329+ znode_lock_request request) {
23330+ int ret;
23331+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23332+ int non_blocking = 0;
23333+ int has_atom;
23334+ txn_capture cap_flags;
23335+ zlock *lock;
23336+ txn_handle *txnh;
23337+ tree_level level;
23338+
23339+ /* Get current process context */
23340+ lock_stack *owner = get_current_lock_stack();
23341+
23342+ /* Check that the lock handle is initialized and isn't already being
23343+ * used. */
23344+ assert("jmacd-808", handle->owner == NULL);
23345+	assert("nikita-3026", reiser4_schedulable());
23346+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23347+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23348+ /* long term locks are not allowed in the VM contexts (->writepage(),
23349+ * prune_{d,i}cache()).
23350+ *
23351+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23352+ * bug caused by d_splice_alias() only working for directories.
23353+ */
23354+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23355+ assert ("zam-1055", mode != ZNODE_NO_LOCK);
23356+
23357+ cap_flags = 0;
23358+ if (request & ZNODE_LOCK_NONBLOCK) {
23359+ cap_flags |= TXN_CAPTURE_NONBLOCKING;
23360+ non_blocking = 1;
23361+ }
23362+
23363+ if (request & ZNODE_LOCK_DONT_FUSE)
23364+ cap_flags |= TXN_CAPTURE_DONT_FUSE;
23365+
23366+ /* If we are changing our process priority we must adjust a number
23367+ of high priority owners for each znode that we already lock */
23368+ if (hipri) {
23369+ set_high_priority(owner);
23370+ } else {
23371+ set_low_priority(owner);
23372+ }
23373+
23374+ level = znode_get_level(node);
23375+
23376+ /* Fill request structure with our values. */
23377+ owner->request.mode = mode;
23378+ owner->request.handle = handle;
23379+ owner->request.node = node;
23380+
23381+ txnh = get_current_context()->trans;
23382+ lock = &node->lock;
23383+
23384+ if (mode == ZNODE_READ_LOCK && request == 0) {
23385+ ret = longterm_lock_tryfast(owner);
23386+ if (ret <= 0)
23387+ return ret;
23388+ }
23389+
23390+ has_atom = (txnh->atom != NULL);
23391+
23392+ /* Synchronize on node's zlock guard lock. */
23393+ spin_lock_zlock(lock);
23394+
23395+ if (znode_is_locked(node) &&
23396+ mode == ZNODE_WRITE_LOCK && recursive(owner))
23397+ return lock_tail(owner, 0, mode);
23398+
23399+ for (;;) {
23400+		/* Check the lock's availability: if it is unavailable we get
23401+ E_REPEAT, 0 indicates "can_lock", otherwise the node is
23402+ invalid. */
23403+ ret = can_lock_object(owner);
23404+
23405+ if (unlikely(ret == -EINVAL)) {
23406+ /* @node is dying. Leave it alone. */
23407+ break;
23408+ }
23409+
23410+ if (unlikely(ret == -E_REPEAT && non_blocking)) {
23411+ /* either locking of @node by the current thread will
23412+ * lead to the deadlock, or lock modes are
23413+ * incompatible. */
23414+ break;
23415+ }
23416+
23417+ assert("nikita-1844", (ret == 0)
23418+ || ((ret == -E_REPEAT) && !non_blocking));
23419+ /* If we can get the lock... Try to capture first before
23420+ taking the lock. */
23421+
23422+ /* first handle commonest case where node and txnh are already
23423+ * in the same atom. */
23424+ /* safe to do without taking locks, because:
23425+ *
23426+ * 1. read of aligned word is atomic with respect to writes to
23427+ * this word
23428+ *
23429+		 * 2. false negatives are handled in reiser4_try_capture().
23430+ *
23431+ * 3. false positives are impossible.
23432+ *
23433+ * PROOF: left as an exercise to the curious reader.
23434+ *
23435+ * Just kidding. Here is one:
23436+ *
23437+ * At the time T0 txnh->atom is stored in txnh_atom.
23438+ *
23439+ * At the time T1 node->atom is stored in node_atom.
23440+ *
23441+ * At the time T2 we observe that
23442+ *
23443+ * txnh_atom != NULL && node_atom == txnh_atom.
23444+ *
23445+ * Imagine that at this moment we acquire node and txnh spin
23446+ * lock in this order. Suppose that under spin lock we have
23447+ *
23448+ * node->atom != txnh->atom, (S1)
23449+ *
23450+ * at the time T3.
23451+ *
23452+ * txnh->atom != NULL still, because txnh is open by the
23453+ * current thread.
23454+ *
23455+ * Suppose node->atom == NULL, that is, node was un-captured
23456+ * between T1, and T3. But un-capturing of formatted node is
23457+ * always preceded by the call to reiser4_invalidate_lock(),
23458+ * which marks znode as JNODE_IS_DYING under zlock spin
23459+ * lock. Contradiction, because can_lock_object() above checks
23460+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23461+ *
23462+ * Suppose that node->atom != node_atom, that is, atom, node
23463+ * belongs to was fused into another atom: node_atom was fused
23464+ * into node->atom. Atom of txnh was equal to node_atom at T2,
23465+ * which means that under spin lock, txnh->atom == node->atom,
23466+ * because txnh->atom can only follow fusion
23467+ * chain. Contradicts S1.
23468+ *
23469+ * The same for hypothesis txnh->atom != txnh_atom. Hence,
23470+ * node->atom == node_atom == txnh_atom == txnh->atom. Again
23471+ * contradicts S1. Hence S1 is false. QED.
23472+ *
23473+ */
23474+
23475+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23476+ ;
23477+ } else {
23478+ /*
23479+ * unlock zlock spin lock here. It is possible for
23480+ * longterm_unlock_znode() to sneak in here, but there
23481+ * is no harm: reiser4_invalidate_lock() will mark znode
23482+ * as JNODE_IS_DYING and this will be noted by
23483+ * can_lock_object() below.
23484+ */
23485+ spin_unlock_zlock(lock);
23486+ spin_lock_znode(node);
23487+			ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23488+ spin_unlock_znode(node);
23489+ spin_lock_zlock(lock);
23490+ if (unlikely(ret != 0)) {
23491+ /* In the failure case, the txnmgr releases
23492+ the znode's lock (or in some cases, it was
23493+ released a while ago). There's no need to
23494+				   reacquire it, so we should return here
23495+				   and avoid releasing the lock. */
23496+ owner->request.mode = 0;
23497+ break;
23498+ }
23499+
23500+ /* Check the lock's availability again -- this is
23501+ because under some circumstances the capture code
23502+ has to release and reacquire the znode spinlock. */
23503+ ret = can_lock_object(owner);
23504+ }
23505+
23506+ /* This time, a return of (ret == 0) means we can lock, so we
23507+ should break out of the loop. */
23508+		if (likely(ret != -E_REPEAT || non_blocking))
23509+			break;
23510+
23511+ /* Lock is unavailable, we have to wait. */
23512+ ret = reiser4_prepare_to_sleep(owner);
23513+ if (unlikely(ret != 0))
23514+			break;
23515+
23516+ assert_spin_locked(&(node->lock.guard));
23517+ if (hipri) {
23518+ /* If we are going in high priority direction then
23519+ increase high priority requests counter for the
23520+ node */
23521+ lock->nr_hipri_requests++;
23522+ if (mode == ZNODE_WRITE_LOCK)
23523+ lock->nr_hipri_write_requests ++;
23524+ /* If there are no high priority owners for a node,
23525+ then immediately wake up low priority owners, so
23526+ they can detect possible deadlock */
23527+ if (lock->nr_hipri_owners == 0)
23528+ wake_up_all_lopri_owners(node);
23529+ }
23530+ list_add_tail(&owner->requestors_link, &lock->requestors);
23531+
23532+ /* Ok, here we have prepared a lock request, so unlock
23533+ a znode ... */
23534+ spin_unlock_zlock(lock);
23535+ /* ... and sleep */
23536+		reiser4_go_to_sleep(owner);
23537+ if (owner->request.mode == ZNODE_NO_LOCK)
23538+ goto request_is_done;
23539+ spin_lock_zlock(lock);
23540+ if (owner->request.mode == ZNODE_NO_LOCK) {
23541+ spin_unlock_zlock(lock);
23542+ request_is_done:
23543+ if (owner->request.ret_code == 0) {
23544+ LOCK_CNT_INC(long_term_locked_znode);
23545+ zref(node);
23546+ }
23547+ return owner->request.ret_code;
23548+ }
23549+ remove_lock_request(owner);
23550+ }
23551+
23552+ return lock_tail(owner, ret, mode);
23553+}
23554+
23555+/* Lock object invalidation means changing the lock object state to `INVALID'
23556+   and waiting for all other processes to cancel their lock requests. */
23557+void reiser4_invalidate_lock(lock_handle * handle /* path to lock
23558+ * owner and lock
23559+ * object is being
23560+ * invalidated. */ )
23561+{
23562+ znode *node = handle->node;
23563+ lock_stack *owner = handle->owner;
23564+
23565+ assert("zam-325", owner == get_current_lock_stack());
23566+ assert("zam-103", znode_is_write_locked(node));
23567+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23568+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23569+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23570+ assert("nikita-3097", znode_is_wlocked_once(node));
23571+ assert_spin_locked(&(node->lock.guard));
23572+
23573+ if (handle->signaled)
23574+ atomic_dec(&owner->nr_signaled);
23575+
23576+ ZF_SET(node, JNODE_IS_DYING);
23577+ unlink_object(handle);
23578+ node->lock.nr_readers = 0;
23579+
23580+ invalidate_all_lock_requests(node);
23581+ spin_unlock_zlock(&node->lock);
23582+}
23583+
23584+/* Initializes lock_stack. */
23585+void init_lock_stack(lock_stack * owner /* pointer to
23586+ * allocated
23587+ * structure. */ )
23588+{
23589+ INIT_LIST_HEAD(&owner->locks);
23590+ INIT_LIST_HEAD(&owner->requestors_link);
23591+ spin_lock_init(&owner->sguard);
23592+ owner->curpri = 1;
23593+	init_waitqueue_head(&owner->wait);
23594+}
23595+
23596+/* Initializes lock object. */
23597+void reiser4_init_lock(zlock * lock /* pointer on allocated
23598+ * uninitialized lock object
23599+ * structure. */ )
23600+{
23601+ memset(lock, 0, sizeof(zlock));
23602+ spin_lock_init(&lock->guard);
23603+ INIT_LIST_HEAD(&lock->requestors);
23604+ INIT_LIST_HEAD(&lock->owners);
23605+}
23606+
23607+/* Transfer a lock handle (presumably so that variables can be moved between stack and
23608+ heap locations). */
23609+static void
23610+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23611+{
23612+ znode *node = old->node;
23613+ lock_stack *owner = old->owner;
23614+ int signaled;
23615+
23616+ /* locks_list, modified by link_object() is not protected by
23617+ anything. This is valid because only current thread ever modifies
23618+ locks_list of its lock_stack.
23619+ */
23620+ assert("nikita-1827", owner == get_current_lock_stack());
23621+ assert("nikita-1831", new->owner == NULL);
23622+
23623+ spin_lock_zlock(&node->lock);
23624+
23625+ signaled = old->signaled;
23626+ if (unlink_old) {
23627+ unlink_object(old);
23628+ } else {
23629+ if (node->lock.nr_readers > 0) {
23630+ node->lock.nr_readers += 1;
23631+ } else {
23632+ node->lock.nr_readers -= 1;
23633+ }
23634+ if (signaled) {
23635+ atomic_inc(&owner->nr_signaled);
23636+ }
23637+ if (owner->curpri) {
23638+ node->lock.nr_hipri_owners += 1;
23639+ }
23640+ LOCK_CNT_INC(long_term_locked_znode);
23641+
23642+ zref(node);
23643+ }
23644+ link_object(new, owner, node);
23645+ new->signaled = signaled;
23646+
23647+ spin_unlock_zlock(&node->lock);
23648+}
23649+
23650+void move_lh(lock_handle * new, lock_handle * old)
23651+{
23652+ move_lh_internal(new, old, /*unlink_old */ 1);
23653+}
23654+
23655+void copy_lh(lock_handle * new, lock_handle * old)
23656+{
23657+ move_lh_internal(new, old, /*unlink_old */ 0);
23658+}
23659+
23660+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23661+int reiser4_check_deadlock(void)
23662+{
23663+ lock_stack *owner = get_current_lock_stack();
23664+ return atomic_read(&owner->nr_signaled) != 0;
23665+}
23666+
23667+/* Before going to sleep we re-check "release lock" requests which might have
23668+   come from threads with high lock priorities. */
23669+int reiser4_prepare_to_sleep(lock_stack * owner)
23670+{
23671+ assert("nikita-1847", owner == get_current_lock_stack());
23672+
23673+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23674+ * counted in nr_signaled */
23675+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23676+ assert("zam-959", !owner->curpri);
23677+ return RETERR(-E_DEADLOCK);
23678+ }
23679+ return 0;
23680+}
23681+
23682+/* Wakes up a single thread */
23683+void __reiser4_wake_up(lock_stack * owner)
23684+{
23685+ atomic_set(&owner->wakeup, 1);
23686+ wake_up(&owner->wait);
23687+}
23688+
23689+/* Puts a thread to sleep */
23690+void reiser4_go_to_sleep(lock_stack * owner)
23691+{
23692+	/* Well, we might sleep here, so holding any spinlocks is a no-no */
23693+ assert("nikita-3027", reiser4_schedulable());
23694+
23695+ wait_event(owner->wait, atomic_read(&owner->wakeup));
23696+ atomic_set(&owner->wakeup, 0);
23697+}
23698+
23699+int lock_stack_isclean(lock_stack * owner)
23700+{
23701+ if (list_empty_careful(&owner->locks)) {
23702+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23703+ return 1;
23704+ }
23705+
23706+ return 0;
23707+}
23708+
23709+#if REISER4_DEBUG
23710+
23711+/*
23712+ * debugging functions
23713+ */
23714+
23715+static void list_check(struct list_head *head)
23716+{
23717+ struct list_head *pos;
23718+
23719+ list_for_each(pos, head)
23720+ assert("", (pos->prev != NULL && pos->next != NULL &&
23721+ pos->prev->next == pos && pos->next->prev == pos));
23722+}
23723+
23724+/* check consistency of locking data-structures hanging of the @stack */
23725+static void check_lock_stack(lock_stack * stack)
23726+{
23727+ spin_lock_stack(stack);
23728+ /* check that stack->locks is not corrupted */
23729+ list_check(&stack->locks);
23730+ spin_unlock_stack(stack);
23731+}
23732+
23733+/* check consistency of locking data structures */
23734+void check_lock_data(void)
23735+{
23736+ check_lock_stack(&get_current_context()->stack);
23737+}
23738+
23739+/* check consistency of locking data structures for @node */
23740+void check_lock_node_data(znode * node)
23741+{
23742+ spin_lock_zlock(&node->lock);
23743+ list_check(&node->lock.owners);
23744+ list_check(&node->lock.requestors);
23745+ spin_unlock_zlock(&node->lock);
23746+}
23747+
23748+/* check that given lock request is dead lock safe. This check is, of course,
23749+ * not exhaustive. */
23750+static int
23751+request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23752+ znode_lock_request request)
23753+{
23754+ lock_stack *owner;
23755+
23756+ owner = get_current_lock_stack();
23757+ /*
23758+ * check that hipri lock request is not issued when there are locked
23759+ * nodes at the higher levels.
23760+ */
23761+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23762+ znode_get_level(node) != 0) {
23763+ lock_handle *item;
23764+
23765+ list_for_each_entry(item, &owner->locks, locks_link) {
23766+ znode *other;
23767+
23768+ other = item->node;
23769+
23770+ if (znode_get_level(other) == 0)
23771+ continue;
23772+ if (znode_get_level(other) > znode_get_level(node))
23773+ return 0;
23774+ }
23775+ }
23776+ return 1;
23777+}
23778+
23779+#endif
23780+
23781+/* return pointer to static storage with name of lock_mode. For
23782+ debugging */
23783+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23784+{
23785+ if (lock == ZNODE_READ_LOCK)
23786+ return "read";
23787+ else if (lock == ZNODE_WRITE_LOCK)
23788+ return "write";
23789+ else {
23790+ static char buf[30];
23791+
23792+ sprintf(buf, "unknown: %i", lock);
23793+ return buf;
23794+ }
23795+}
23796+
23797+/* Make Linus happy.
23798+ Local variables:
23799+ c-indentation-style: "K&R"
23800+ mode-name: "LC"
23801+ c-basic-offset: 8
23802+ tab-width: 8
23803+ fill-column: 79
23804+ End:
23805+*/
23806diff -urN linux-2.6.22.orig/fs/reiser4/lock.h linux-2.6.22/fs/reiser4/lock.h
23807--- linux-2.6.22.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23808+++ linux-2.6.22/fs/reiser4/lock.h 2007-07-29 00:25:34.888699583 +0400
23809@@ -0,0 +1,249 @@
23810+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23811+
23812+/* Long term locking data structures. See lock.c for details. */
23813+
23814+#ifndef __LOCK_H__
23815+#define __LOCK_H__
23816+
23817+#include "forward.h"
23818+#include "debug.h"
23819+#include "dformat.h"
23820+#include "key.h"
23821+#include "coord.h"
23822+#include "plugin/node/node.h"
23823+#include "txnmgr.h"
23824+#include "readahead.h"
23825+
23826+#include <linux/types.h>
23827+#include <linux/spinlock.h>
23828+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
23829+#include <asm/atomic.h>
23830+#include <linux/wait.h>
23831+
23832+/* Per-znode lock object */
23833+struct zlock {
23834+ spinlock_t guard;
23835+ /* The number of readers if positive; the number of recursively taken
23836+ write locks if negative. Protected by zlock spin lock. */
23837+ int nr_readers;
23838+ /* A number of processes (lock_stacks) that have this object
23839+ locked with high priority */
23840+ unsigned nr_hipri_owners;
23841+	/* A number of attempts to lock znode in high priority direction */
23842+	unsigned nr_hipri_requests;
23843+	/* A number of pending high priority write lock requests */
23844+	unsigned nr_hipri_write_requests;
23845+	/* A list of lock_handles of all lock_stacks which locked this object */
23846+	struct list_head owners;
23847+	/* A linked list of lock_stacks that wait for this lock */
23848+	struct list_head requestors;
23849+};
23850+
23851+static inline void spin_lock_zlock(zlock *lock)
23852+{
23853+ /* check that zlock is not locked */
23854+ assert("", LOCK_CNT_NIL(spin_locked_zlock));
23855+ /* check that spinlocks of lower priorities are not held */
23856+ assert("", LOCK_CNT_NIL(spin_locked_stack));
23857+
23858+ spin_lock(&lock->guard);
23859+
23860+ LOCK_CNT_INC(spin_locked_zlock);
23861+ LOCK_CNT_INC(spin_locked);
23862+}
23863+
23864+static inline void spin_unlock_zlock(zlock *lock)
23865+{
23866+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23867+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23868+
23869+ LOCK_CNT_DEC(spin_locked_zlock);
23870+ LOCK_CNT_DEC(spin_locked);
23871+
23872+ spin_unlock(&lock->guard);
23873+}
23874+
23875+#define lock_is_locked(lock) ((lock)->nr_readers != 0)
23876+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
23877+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
23878+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
23879+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0)
23880+#define lock_mode_compatible(lock, mode) \
23881+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23882+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
23883+
23884+/* Since we have R/W znode locks we need additional bidirectional `link'
23885+ objects to implement n<->m relationship between lock owners and lock
23886+ objects. We call them `lock handles'.
23887+
23888+ Locking: see lock.c/"SHORT-TERM LOCKING"
23889+*/
23890+struct lock_handle {
23891+ /* This flag indicates that a signal to yield a lock was passed to
23892+	   lock owner and counted in owner->nr_signaled
23893+
23894+ Locking: this is accessed under spin lock on ->node.
23895+ */
23896+ int signaled;
23897+ /* A link to owner of a lock */
23898+ lock_stack *owner;
23899+ /* A link to znode locked */
23900+ znode *node;
23901+ /* A list of all locks for a process */
23902+ struct list_head locks_link;
23903+ /* A list of all owners for a znode */
23904+ struct list_head owners_link;
23905+};
23906+
23907+struct lock_request {
23908+ /* A pointer to uninitialized link object */
23909+ lock_handle *handle;
23910+ /* A pointer to the object we want to lock */
23911+ znode *node;
23912+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23913+ znode_lock_mode mode;
23914+	/* where dispatch_lock_requests() stores the lock request result code */
23915+ int ret_code;
23916+};
23917+
23918+/* A lock stack structure for accumulating locks owned by a process */
23919+struct lock_stack {
23920+ /* A guard lock protecting a lock stack */
23921+ spinlock_t sguard;
23922+ /* number of znodes which were requested by high priority processes */
23923+ atomic_t nr_signaled;
23924+ /* Current priority of a process
23925+
23926+ This is only accessed by the current thread and thus requires no
23927+ locking.
23928+ */
23929+ int curpri;
23930+ /* A list of all locks owned by this process. Elements can be added to
23931+ * this list only by the current thread. ->node pointers in this list
23932+ * can be only changed by the current thread. */
23933+ struct list_head locks;
23934+ /* When lock_stack waits for the lock, it puts itself on double-linked
23935+ requestors list of that lock */
23936+ struct list_head requestors_link;
23937+ /* Current lock request info.
23938+
23939+ This is only accessed by the current thread and thus requires no
23940+ locking.
23941+ */
23942+ struct lock_request request;
23943+ /* the following two fields are the lock stack's
23944+ * synchronization object to use with the standard linux/wait.h
23945+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
23946+ * usage details. */
23947+ wait_queue_head_t wait;
23948+ atomic_t wakeup;
23949+#if REISER4_DEBUG
23950+ int nr_locks; /* number of lock handles in the above list */
23951+#endif
23952+};
23953+
23954+/*
23955+ User-visible znode locking functions
23956+*/
23957+
23958+extern int longterm_lock_znode(lock_handle * handle,
23959+ znode * node,
23960+ znode_lock_mode mode,
23961+ znode_lock_request request);
23962+
23963+extern void longterm_unlock_znode(lock_handle * handle);
23964+
23965+extern int reiser4_check_deadlock(void);
23966+
23967+extern lock_stack *get_current_lock_stack(void);
23968+
23969+extern void init_lock_stack(lock_stack * owner);
23970+extern void reiser4_init_lock(zlock * lock);
23971+
23972+static inline void init_lh(lock_handle *lh)
23973+{
23974+#if REISER4_DEBUG
23975+ memset(lh, 0, sizeof *lh);
23976+ INIT_LIST_HEAD(&lh->locks_link);
23977+ INIT_LIST_HEAD(&lh->owners_link);
23978+#else
23979+ lh->node = NULL;
23980+#endif
23981+}
23982+
23983+static inline void done_lh(lock_handle *lh)
23984+{
23985+ assert("zam-342", lh != NULL);
23986+ if (lh->node != NULL)
23987+ longterm_unlock_znode(lh);
23988+}
23989+
23990+extern void move_lh(lock_handle * new, lock_handle * old);
23991+extern void copy_lh(lock_handle * new, lock_handle * old);
23992+
23993+extern int reiser4_prepare_to_sleep(lock_stack * owner);
23994+extern void reiser4_go_to_sleep(lock_stack * owner);
23995+extern void __reiser4_wake_up(lock_stack * owner);
23996+
23997+extern int lock_stack_isclean(lock_stack * owner);
23998+
23999+/* zlock object state check macros: only used in assertions. Both forms imply that the
24000+ lock is held by the current thread. */
24001+extern int znode_is_write_locked(const znode *);
24002+extern void reiser4_invalidate_lock(lock_handle *);
24003+
24004+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24005+#define spin_ordering_pred_stack(stack) \
24006+ (LOCK_CNT_NIL(spin_locked_stack) && \
24007+ LOCK_CNT_NIL(spin_locked_txnmgr) && \
24008+ LOCK_CNT_NIL(spin_locked_inode) && \
24009+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24010+ LOCK_CNT_NIL(spin_locked_super_eflush) )
24011+
24012+static inline void spin_lock_stack(lock_stack *stack)
24013+{
24014+ assert("", spin_ordering_pred_stack(stack));
24015+ spin_lock(&(stack->sguard));
24016+ LOCK_CNT_INC(spin_locked_stack);
24017+ LOCK_CNT_INC(spin_locked);
24018+}
24019+
24020+static inline void spin_unlock_stack(lock_stack *stack)
24021+{
24022+ assert_spin_locked(&(stack->sguard));
24023+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24024+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24025+ LOCK_CNT_DEC(spin_locked_stack);
24026+ LOCK_CNT_DEC(spin_locked);
24027+ spin_unlock(&(stack->sguard));
24028+}
24029+
24030+static inline void reiser4_wake_up(lock_stack * owner)
24031+{
24032+ spin_lock_stack(owner);
24033+ __reiser4_wake_up(owner);
24034+ spin_unlock_stack(owner);
24035+}
24036+
24037+const char *lock_mode_name(znode_lock_mode lock);
24038+
24039+#if REISER4_DEBUG
24040+extern void check_lock_data(void);
24041+extern void check_lock_node_data(znode * node);
24042+#else
24043+#define check_lock_data() noop
24044+#define check_lock_node_data(node) noop
24045+#endif
24046+
24047+/* __LOCK_H__ */
24048+#endif
24049+
24050+/* Make Linus happy.
24051+ Local variables:
24052+ c-indentation-style: "K&R"
24053+ mode-name: "LC"
24054+ c-basic-offset: 8
24055+ tab-width: 8
24056+ fill-column: 120
24057+ End:
24058+*/
24059diff -urN linux-2.6.22.orig/fs/reiser4/Makefile linux-2.6.22/fs/reiser4/Makefile
24060--- linux-2.6.22.orig/fs/reiser4/Makefile 1970-01-01 03:00:00.000000000 +0300
24061+++ linux-2.6.22/fs/reiser4/Makefile 2007-07-29 00:25:34.888699583 +0400
24062@@ -0,0 +1,99 @@
24063+#
24064+# reiser4/Makefile
24065+#
24066+
24067+obj-$(CONFIG_REISER4_FS) += reiser4.o
24068+
24069+reiser4-y := \
24070+ debug.o \
24071+ jnode.o \
24072+ znode.o \
24073+ key.o \
24074+ pool.o \
24075+ tree_mod.o \
24076+ estimate.o \
24077+ carry.o \
24078+ carry_ops.o \
24079+ lock.o \
24080+ tree.o \
24081+ context.o \
24082+ tap.o \
24083+ coord.o \
24084+ block_alloc.o \
24085+ txnmgr.o \
24086+ kassign.o \
24087+ flush.o \
24088+ wander.o \
24089+ eottl.o \
24090+ search.o \
24091+ page_cache.o \
24092+ seal.o \
24093+ dscale.o \
24094+ flush_queue.o \
24095+ ktxnmgrd.o \
24096+ blocknrset.o \
24097+ super.o \
24098+ super_ops.o \
24099+ fsdata.o \
24100+ export_ops.o \
24101+ oid.o \
24102+ tree_walk.o \
24103+ inode.o \
24104+ vfs_ops.o \
24105+ as_ops.o \
24106+ entd.o\
24107+ readahead.o \
24108+ status_flags.o \
24109+ init_super.o \
24110+ safe_link.o \
24111+ \
24112+ plugin/plugin.o \
24113+ plugin/plugin_set.o \
24114+ plugin/node/node.o \
24115+ plugin/object.o \
24116+ plugin/cluster.o \
24117+ plugin/inode_ops.o \
24118+ plugin/inode_ops_rename.o \
24119+ plugin/file_ops.o \
24120+ plugin/file_ops_readdir.o \
24121+ plugin/file_plugin_common.o \
24122+ plugin/file/file.o \
24123+ plugin/file/tail_conversion.o \
24124+ plugin/file/file_conversion.o \
24125+ plugin/file/symlink.o \
24126+ plugin/file/cryptcompress.o \
24127+ plugin/dir_plugin_common.o \
24128+ plugin/dir/hashed_dir.o \
24129+ plugin/dir/seekable_dir.o \
24130+ plugin/node/node40.o \
24131+ \
24132+ plugin/crypto/cipher.o \
24133+ plugin/crypto/digest.o \
24134+ \
24135+ plugin/compress/minilzo.o \
24136+ plugin/compress/compress.o \
24137+ plugin/compress/compress_mode.o \
24138+ \
24139+ plugin/item/static_stat.o \
24140+ plugin/item/sde.o \
24141+ plugin/item/cde.o \
24142+ plugin/item/blackbox.o \
24143+ plugin/item/internal.o \
24144+ plugin/item/tail.o \
24145+ plugin/item/ctail.o \
24146+ plugin/item/extent.o \
24147+ plugin/item/extent_item_ops.o \
24148+ plugin/item/extent_file_ops.o \
24149+ plugin/item/extent_flush_ops.o \
24150+ \
24151+ plugin/hash.o \
24152+ plugin/fibration.o \
24153+ plugin/tail_policy.o \
24154+ plugin/item/item.o \
24155+ \
24156+ plugin/security/perm.o \
24157+ plugin/space/bitmap.o \
24158+ \
24159+ plugin/disk_format/disk_format40.o \
24160+ plugin/disk_format/disk_format.o
24161+
24162diff -urN linux-2.6.22.orig/fs/reiser4/oid.c linux-2.6.22/fs/reiser4/oid.c
24163--- linux-2.6.22.orig/fs/reiser4/oid.c 1970-01-01 03:00:00.000000000 +0300
24164+++ linux-2.6.22/fs/reiser4/oid.c 2007-07-29 00:25:34.888699583 +0400
24165@@ -0,0 +1,141 @@
24166+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24167+
24168+#include "debug.h"
24169+#include "super.h"
24170+#include "txnmgr.h"
24171+
24172+/* We used to have an oid allocation plugin. It was removed because it
24173+   was recognized as providing an unneeded level of abstraction. If one
24174+   ever finds it useful - look at yet_unneeded_abstractions/oid
24175+*/
24176+
24177+/*
24178+ * initialize in-memory data for oid allocator at @super. @nr_files and @next
24179+ * are provided by disk format plugin that reads them from the disk during
24180+ * mount.
24181+ */
24182+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24183+{
24184+ reiser4_super_info_data *sbinfo;
24185+
24186+ sbinfo = get_super_private(super);
24187+
24188+ sbinfo->next_to_use = next;
24189+ sbinfo->oids_in_use = nr_files;
24190+ return 0;
24191+}
24192+
24193+/*
24194+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24195+ * runs out of oids.
24196+ */
24197+oid_t oid_allocate(struct super_block * super)
24198+{
24199+ reiser4_super_info_data *sbinfo;
24200+ oid_t oid;
24201+
24202+ sbinfo = get_super_private(super);
24203+
24204+ spin_lock_reiser4_super(sbinfo);
24205+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24206+ oid = sbinfo->next_to_use++;
24207+ sbinfo->oids_in_use++;
24208+ } else
24209+ oid = ABSOLUTE_MAX_OID;
24210+ spin_unlock_reiser4_super(sbinfo);
24211+ return oid;
24212+}
24213+
24214+/*
24215+ * Tell oid allocator that @oid is now free.
24216+ */
24217+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24218+{
24219+ reiser4_super_info_data *sbinfo;
24220+
24221+ sbinfo = get_super_private(super);
24222+
24223+ spin_lock_reiser4_super(sbinfo);
24224+ sbinfo->oids_in_use--;
24225+ spin_unlock_reiser4_super(sbinfo);
24226+ return 0;
24227+}
24228+
24229+/*
24230+ * return next @oid that would be allocated (i.e., returned by oid_allocate())
24231+ * without actually allocating it. This is used by disk format plugin to save
24232+ * oid allocator state on the disk.
24233+ */
24234+oid_t oid_next(const struct super_block * super)
24235+{
24236+ reiser4_super_info_data *sbinfo;
24237+ oid_t oid;
24238+
24239+ sbinfo = get_super_private(super);
24240+
24241+ spin_lock_reiser4_super(sbinfo);
24242+ oid = sbinfo->next_to_use;
24243+ spin_unlock_reiser4_super(sbinfo);
24244+ return oid;
24245+}
24246+
24247+/*
24248+ * returns number of currently used oids. This is used by statfs(2) to report
24249+ * number of "inodes" and by disk format plugin to save oid allocator state on
24250+ * the disk.
24251+ */
24252+long oids_used(const struct super_block *super)
24253+{
24254+ reiser4_super_info_data *sbinfo;
24255+ oid_t used;
24256+
24257+ sbinfo = get_super_private(super);
24258+
24259+ spin_lock_reiser4_super(sbinfo);
24260+ used = sbinfo->oids_in_use;
24261+ spin_unlock_reiser4_super(sbinfo);
24262+ if (used < (__u64) ((long)~0) >> 1)
24263+ return (long)used;
24264+ else
24265+ return (long)-1;
24266+}
24267+
24268+/*
24269+ * Count oid as allocated in atom. This is done after call to oid_allocate()
24270+ * at the point when we are irrevocably committed to creation of the new file
24271+ * (i.e., when oid allocation cannot be any longer rolled back due to some
24272+ * error).
24273+ */
24274+void oid_count_allocated(void)
24275+{
24276+ txn_atom *atom;
24277+
24278+ atom = get_current_atom_locked();
24279+ atom->nr_objects_created++;
24280+ spin_unlock_atom(atom);
24281+}
24282+
24283+/*
24284+ * Count oid as free in atom. This is done after call to oid_release() at the
24285+ * point when we are irrevocably committed to the deletion of the file (i.e.,
24286+ * when oid release cannot be any longer rolled back due to some error).
24287+ */
24288+void oid_count_released(void)
24289+{
24290+ txn_atom *atom;
24291+
24292+ atom = get_current_atom_locked();
24293+ atom->nr_objects_deleted++;
24294+ spin_unlock_atom(atom);
24295+}
24296+
24297+/*
24298+ Local variables:
24299+ c-indentation-style: "K&R"
24300+ mode-name: "LC"
24301+ c-basic-offset: 8
24302+ tab-width: 8
24303+ fill-column: 120
24304+ scroll-step: 1
24305+ End:
24306+*/
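
As an illustration (not part of the patch): the allocator above is just a
spinlock-protected pair of 64-bit counters. A minimal userspace sketch of the
same protocol, with pthreads standing in for the reiser4 super-block spinlock
and a made-up struct standing in for reiser4_super_info_data:

#include <pthread.h>
#include <stdint.h>

#define ABSOLUTE_MAX_OID UINT64_MAX

struct oid_allocator {
	pthread_mutex_t lock;	/* plays the role of spin_lock_reiser4_super() */
	uint64_t next_to_use;	/* next oid to hand out */
	uint64_t oids_in_use;	/* number of live objects */
};

static uint64_t oid_allocate_sketch(struct oid_allocator *a)
{
	uint64_t oid = ABSOLUTE_MAX_OID;

	pthread_mutex_lock(&a->lock);
	if (a->next_to_use != ABSOLUTE_MAX_OID) {
		oid = a->next_to_use++;
		a->oids_in_use++;
	}
	pthread_mutex_unlock(&a->lock);
	return oid;	/* ABSOLUTE_MAX_OID signals exhaustion */
}
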
24307diff -urN linux-2.6.22.orig/fs/reiser4/page_cache.c linux-2.6.22/fs/reiser4/page_cache.c
24308--- linux-2.6.22.orig/fs/reiser4/page_cache.c 1970-01-01 03:00:00.000000000 +0300
24309+++ linux-2.6.22/fs/reiser4/page_cache.c 2007-07-29 00:25:34.888699583 +0400
24310@@ -0,0 +1,730 @@
24311+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24312+ * reiser4/README */
24313+
24314+/* Memory pressure hooks. Fake inodes handling. */
24315+
24316+/* GLOSSARY
24317+
24318+ . Formatted and unformatted nodes.
24319+ Elements of reiser4 balanced tree to store data and metadata.
24320+ Unformatted nodes are pointed to by extent pointers. Such nodes
24321+ are used to store data of large objects. Unlike unformatted nodes,
24322+ formatted ones have an associated format described by the node4X plugin.
24323+
24324+ . Jnode (or journal node)
24325+ The in-memory header which is used to track formatted and unformatted
24326+ nodes, bitmap nodes, etc. In particular, jnodes are used to track
24327+ transactional information associated with each block (see reiser4/jnode.c
24328+ for details).
24329+
24330+ . Znode
24331+ The in-memory header which is used to track formatted nodes. Contains
24332+ embedded jnode (see reiser4/znode.c for details).
24333+*/
24334+
24335+/* We store all file system metadata (and data, of course) in the page cache.
24336+
24337+ What does this mean? Instead of using bread/brelse we create a special
24338+ "fake" inode (one per super block) and store content of formatted nodes
24339+ into pages bound to this inode in the page cache. In newer kernels bread()
24340+ already uses inode attached to block device (bd_inode). Advantage of having
24341+ our own fake inode is that we can install appropriate methods in its
24342+ address_space operations. Such methods are called by VM on memory pressure
24343+ (or during background page flushing) and we can use them to react
24344+ appropriately.
24345+
24346+ In the initial version we only support one block per page. Support for multiple
24347+ blocks per page is complicated by relocation.
24348+
24349+ To each page used by reiser4 a jnode is attached. A jnode is analogous to a
24350+ buffer head. The difference is that a jnode is bound to the page permanently:
24351+ jnode cannot be removed from memory until its backing page is.
24352+
24353+ The jnode contains a pointer to the page (->pg field) and the page
24354+ contains a pointer to the jnode in its ->private field. The former is
24355+ protected by the jnode's spinlock and the latter by the page lock
24356+ (PG_locked bit). Lock ordering is: first take the page lock, then the
24357+ jnode spin lock. To go in the reverse direction use the jnode_lock_page()
24358+ function, which uses the standard try-lock-and-release device.
24359+
24360+ Properties:
24361+
24362+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24363+ reference counter is increased.
24364+
24365+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
24366+ reference counter is decreased.
24367+
24368+ 3. on jload() reference counter on jnode page is increased, page is
24369+ kmapped and `referenced'.
24370+
24371+ 4. on jrelse() inverse operations are performed.
24372+
24373+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24374+
24375+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24376+ historically.]
24377+
24378+ [In the following discussion, `lock' invariably means long term lock on
24379+ znode.] (What about page locks?)
24380+
24381+ There is some special class of deadlock possibilities related to memory
24382+ pressure. Locks acquired by other reiser4 threads are accounted for in the
24383+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24384+ invoked an additional hidden arc is added to the locking graph: the thread
24385+ that tries to allocate memory waits for ->vm_writeback() to finish. If this
24386+ thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24387+ prevention is useless.
24388+
24389+ Another related problem is the possibility of ->vm_writeback() running out
24390+ of memory itself. This is not a problem for ext2 and friends, because their
24391+ ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
24392+ definitely able to allocate huge amounts of memory.
24393+
24394+ It seems that there is no reliable way to cope with the problems above.
24395+ Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24396+ context) wouldn't perform any flushing itself, but rather should just wake
24397+ up some auxiliary thread dedicated for this purpose (or, the same thread
24398+ that does periodic commit of old atoms (ktxnmgrd.c)).
24399+
24400+ Details:
24401+
24402+ 1. A page is called `reclaimable' against a particular reiser4 mount F if
24403+ this page can be ultimately released by try_to_free_pages() under the
24404+ presumptions that:
24405+
24406+ a. ->vm_writeback() for F is no-op, and
24407+
24408+ b. none of the threads accessing F are making any progress, and
24409+
24410+ c. other reiser4 mounts obey the same memory reservation protocol as F
24411+ (described below).
24412+
24413+ For example, a clean un-pinned page, or a page occupied by ext2 data, is
24414+ reclaimable against any reiser4 mount.
24415+
24416+ When there is more than one reiser4 mount in a system, condition (c) makes
24417+ reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24418+
24419+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24420+
24421+ The fake inode is used to bind formatted nodes, and each node is indexed
24422+ within the fake inode by its block number. If the block size is smaller
24423+ than the page size, it may so happen that a block mapped to a page with a
24424+ formatted node is occupied by an unformatted node or is unallocated. This
24425+ leads to some complications, because flushing the whole page could
24426+ incorrectly overwrite an unformatted node which, moreover, can be cached
24427+ in some other place as part of the file body. To avoid this, buffers for
24428+ unformatted nodes are never marked dirty. Also, pages in the fake inode
24429+ are never marked dirty. This rules out usage of ->writepage() as a memory
24430+ pressure hook. Instead ->releasepage() is used.
24431+
24432+ Josh is concerned that page->buffer is going to die. This should not pose
24433+ a significant problem though, because we need to add some data structures
24434+ to the page anyway (jnode) and all necessary bookkeeping can be put there.
24435+
24436+*/
24437+
24438+/* Life cycle of pages/nodes.
24439+
24440+ jnode contains reference to page and page contains reference back to
24441+ jnode. This reference is counted in page ->count. Thus, page bound to jnode
24442+ cannot be released back into free pool.
24443+
24444+ 1. Formatted nodes.
24445+
24446+ 1. formatted node is represented by znode. When new znode is created its
24447+ ->pg pointer is NULL initially.
24448+
24449+ 2. when node content is loaded into znode (by call to zload()) for the
24450+ first time following happens (in call to ->read_node() or
24451+ ->allocate_node()):
24452+
24453+ 1. new page is added to the page cache.
24454+
24455+ 2. this page is attached to znode and its ->count is increased.
24456+
24457+ 3. page is kmapped.
24458+
24459+ 3. if more calls to zload() follow (without corresponding zrelses), page
24460+ counter is left intact and instead ->d_count is increased in the znode.
24461+
24462+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24463+ ->release_node() is called and page is kunmapped as result.
24464+
24465+ 5. at some moment node can be captured by a transaction. Its ->x_count
24466+ is then increased by transaction manager.
24467+
24468+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24469+ bit set) following will happen (also see comment at the top of znode.c):
24470+
24471+ 1. when last lock is released, node will be uncaptured from
24472+ transaction. This releases the reference that the transaction manager
24473+ acquired at step 5.
24474+
24475+ 2. when last reference is released, zput() detects that node is
24476+ actually deleted and calls ->delete_node()
24477+ operation. page_cache_delete_node() implementation detaches jnode from
24478+ page and releases page.
24479+
24480+ 7. otherwise (node wasn't removed from the tree), last reference to
24481+ znode will be released after transaction manager committed transaction
24482+ node was in. This implies squallocing of this node (see
24483+ flush.c). Nothing special happens at this point. Znode is still in the
24484+ hash table and page is still attached to it.
24485+
24486+ 8. znode is actually removed from the memory because of the memory
24487+ pressure, or during umount (znodes_tree_done()). Anyway, znode is
24488+ removed by the call to zdrop(). At this moment, page is detached from
24489+ znode and removed from the inode address space.
24490+
24491+*/
24492+
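
To make steps 2-4 above concrete, a typical reader of a formatted node
brackets its accesses with zload()/zrelse(). A hedged sketch using the patch's
own znode API (zdata() is its accessor for the kmapped node contents; the
function name here is illustrative):

static int peek_at_node_sketch(znode *node)
{
	int ret;

	ret = zload(node);	/* page in cache, kmapped, ->d_count++ */
	if (ret != 0)
		return ret;
	/* ... inspect zdata(node); ->d_count keeps the mapping pinned ... */
	zrelse(node);		/* ->d_count--; kunmapped when it drops to 0 */
	return 0;
}
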
24493+#include "debug.h"
24494+#include "dformat.h"
24495+#include "key.h"
24496+#include "txnmgr.h"
24497+#include "jnode.h"
24498+#include "znode.h"
24499+#include "block_alloc.h"
24500+#include "tree.h"
24501+#include "vfs_ops.h"
24502+#include "inode.h"
24503+#include "super.h"
24504+#include "entd.h"
24505+#include "page_cache.h"
24506+#include "ktxnmgrd.h"
24507+
24508+#include <linux/types.h>
24509+#include <linux/fs.h>
24510+#include <linux/mm.h> /* for struct page */
24511+#include <linux/swap.h> /* for struct page */
24512+#include <linux/pagemap.h>
24513+#include <linux/bio.h>
24514+#include <linux/writeback.h>
24515+#include <linux/blkdev.h>
24516+
24517+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24518+
24519+static struct address_space_operations formatted_fake_as_ops;
24520+
24521+static const oid_t fake_ino = 0x1;
24522+static const oid_t bitmap_ino = 0x2;
24523+static const oid_t cc_ino = 0x3;
24524+
24525+static void
24526+init_fake_inode(struct super_block *super, struct inode *fake,
24527+ struct inode **pfake)
24528+{
24529+ assert("nikita-2168", fake->i_state & I_NEW);
24530+ fake->i_mapping->a_ops = &formatted_fake_as_ops;
24531+ *pfake = fake;
24532+ /* NOTE-NIKITA something else? */
24533+ unlock_new_inode(fake);
24534+}
24535+
24536+/**
24537+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24538+ * @super: super block to init fake inode for
24539+ *
24540+ * Initializes fake inode to which formatted nodes are bound in the page cache
24541+ * and inode for bitmaps.
24542+ */
24543+int reiser4_init_formatted_fake(struct super_block *super)
24544+{
24545+ struct inode *fake;
24546+ struct inode *bitmap;
24547+ struct inode *cc;
24548+ reiser4_super_info_data *sinfo;
24549+
24550+ assert("nikita-1703", super != NULL);
24551+
24552+ sinfo = get_super_private_nocheck(super);
24553+ fake = iget_locked(super, oid_to_ino(fake_ino));
24554+
24555+ if (fake != NULL) {
24556+ init_fake_inode(super, fake, &sinfo->fake);
24557+
24558+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24559+ if (bitmap != NULL) {
24560+ init_fake_inode(super, bitmap, &sinfo->bitmap);
24561+
24562+ cc = iget_locked(super, oid_to_ino(cc_ino));
24563+ if (cc != NULL) {
24564+ init_fake_inode(super, cc, &sinfo->cc);
24565+ return 0;
24566+ } else {
24567+ iput(sinfo->fake);
24568+ iput(sinfo->bitmap);
24569+ sinfo->fake = NULL;
24570+ sinfo->bitmap = NULL;
24571+ }
24572+ } else {
24573+ iput(sinfo->fake);
24574+ sinfo->fake = NULL;
24575+ }
24576+ }
24577+ return RETERR(-ENOMEM);
24578+}
24579+
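
The nested-if unwinding above could equally be written in the kernel's more
usual goto style. The following is an illustrative sketch of the same
setup/teardown logic, not part of the patch:

static int init_formatted_fake_sketch(struct super_block *super)
{
	reiser4_super_info_data *sinfo = get_super_private_nocheck(super);
	struct inode *fake, *bitmap, *cc;

	fake = iget_locked(super, oid_to_ino(fake_ino));
	if (fake == NULL)
		goto fail;
	init_fake_inode(super, fake, &sinfo->fake);

	bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
	if (bitmap == NULL)
		goto put_fake;
	init_fake_inode(super, bitmap, &sinfo->bitmap);

	cc = iget_locked(super, oid_to_ino(cc_ino));
	if (cc == NULL)
		goto put_bitmap;
	init_fake_inode(super, cc, &sinfo->cc);
	return 0;

put_bitmap:
	iput(sinfo->bitmap);
	sinfo->bitmap = NULL;
put_fake:
	iput(sinfo->fake);
	sinfo->fake = NULL;
fail:
	return RETERR(-ENOMEM);
}
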
24580+/**
24581+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24582+ * @super: super block to release fake inodes for
24583+ *
24584+ * Releases inodes which were used as address spaces of bitmap and formatted
24585+ * nodes.
24586+ */
24587+void reiser4_done_formatted_fake(struct super_block *super)
24588+{
24589+ reiser4_super_info_data *sinfo;
24590+
24591+ sinfo = get_super_private_nocheck(super);
24592+
24593+ if (sinfo->fake != NULL) {
24594+ iput(sinfo->fake);
24595+ sinfo->fake = NULL;
24596+ }
24597+
24598+ if (sinfo->bitmap != NULL) {
24599+ iput(sinfo->bitmap);
24600+ sinfo->bitmap = NULL;
24601+ }
24602+
24603+ if (sinfo->cc != NULL) {
24604+ iput(sinfo->cc);
24605+ sinfo->cc = NULL;
24606+ }
24607+ return;
24608+}
24609+
24610+void reiser4_wait_page_writeback(struct page *page)
24611+{
24612+ assert("zam-783", PageLocked(page));
24613+
24614+ do {
24615+ unlock_page(page);
24616+ wait_on_page_writeback(page);
24617+ lock_page(page);
24618+ } while (PageWriteback(page));
24619+}
24620+
24621+/* return tree @page is in */
24622+reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24623+{
24624+ assert("nikita-2461", page != NULL);
24625+ return &get_super_private(page->mapping->host->i_sb)->tree;
24626+}
24627+
24628+/* completion handler for single page bio-based read.
24629+
24630+ mpage_end_io_read() would also do. But it's static.
24631+
24632+*/
24633+static int
24634+end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24635+ int err UNUSED_ARG)
24636+{
24637+ struct page *page;
24638+
24639+ if (bio->bi_size != 0) {
24640+ warning("nikita-3332", "Truncated single page read: %i",
24641+ bio->bi_size);
24642+ return 1;
24643+ }
24644+
24645+ page = bio->bi_io_vec[0].bv_page;
24646+
24647+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24648+ SetPageUptodate(page);
24649+ } else {
24650+ ClearPageUptodate(page);
24651+ SetPageError(page);
24652+ }
24653+ unlock_page(page);
24654+ bio_put(bio);
24655+ return 0;
24656+}
24657+
24658+/* completion handler for single page bio-based write.
24659+
24660+ mpage_end_io_write() would also do. But it's static.
24661+
24662+*/
24663+static int
24664+end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
24665+ int err UNUSED_ARG)
24666+{
24667+ struct page *page;
24668+
24669+ if (bio->bi_size != 0) {
24670+ warning("nikita-3333", "Truncated single page write: %i",
24671+ bio->bi_size);
24672+ return 1;
24673+ }
24674+
24675+ page = bio->bi_io_vec[0].bv_page;
24676+
24677+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24678+ SetPageError(page);
24679+ end_page_writeback(page);
24680+ bio_put(bio);
24681+ return 0;
24682+}
24683+
24684+/* ->readpage() method for formatted nodes */
24685+static int formatted_readpage(struct file *f UNUSED_ARG,
24686+ struct page *page /* page to read */ )
24687+{
24688+ assert("nikita-2412", PagePrivate(page) && jprivate(page));
24689+ return reiser4_page_io(page, jprivate(page), READ,
24690+ reiser4_ctx_gfp_mask_get());
24691+}
24692+
24693+/**
24694+ * reiser4_page_io - submit single-page bio request
24695+ * @page: page to perform io for
24696+ * @node: jnode of page
24697+ * @rw: read or write
24698+ * @gfp: gfp mask for bio allocation
24699+ *
24700+ * Submits single page read or write.
24701+ */
24702+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24703+{
24704+ struct bio *bio;
24705+ int result;
24706+
24707+ assert("nikita-2094", page != NULL);
24708+ assert("nikita-2226", PageLocked(page));
24709+ assert("nikita-2634", node != NULL);
24710+ assert("nikita-2893", rw == READ || rw == WRITE);
24711+
24712+ if (rw) {
24713+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24714+ unlock_page(page);
24715+ return 0;
24716+ }
24717+ }
24718+
24719+ bio = page_bio(page, node, rw, gfp);
24720+ if (!IS_ERR(bio)) {
24721+ if (rw == WRITE) {
24722+ set_page_writeback(page);
24723+ unlock_page(page);
24724+ }
24725+ reiser4_submit_bio(rw, bio);
24726+ result = 0;
24727+ } else {
24728+ unlock_page(page);
24729+ result = PTR_ERR(bio);
24730+ }
24731+
24732+ return result;
24733+}
24734+
24735+/* helper function to construct bio for page */
24736+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24737+{
24738+ struct bio *bio;
24739+ assert("nikita-2092", page != NULL);
24740+ assert("nikita-2633", node != NULL);
24741+
24742+ /* Simple implementation in the assumption that blocksize == pagesize.
24743+
24744+ We only have to submit one block, but submit_bh() will allocate bio
24745+ anyway, so let's use all the bells-and-whistles of the bio code.
24746+ */
24747+
24748+ bio = bio_alloc(gfp, 1);
24749+ if (bio != NULL) {
24750+ int blksz;
24751+ struct super_block *super;
24752+ reiser4_block_nr blocknr;
24753+
24754+ super = page->mapping->host->i_sb;
24755+ assert("nikita-2029", super != NULL);
24756+ blksz = super->s_blocksize;
24757+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24758+
24759+ spin_lock_jnode(node);
24760+ blocknr = *jnode_get_io_block(node);
24761+ spin_unlock_jnode(node);
24762+
24763+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24764+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24765+
24766+ bio->bi_bdev = super->s_bdev;
24767+ /* fill bio->bi_sector before calling bio_add_page(), because
24768+ * q->merge_bvec_fn may want to inspect it (see
24769+ * drivers/md/linear.c:linear_mergeable_bvec() for example). */
24770+ bio->bi_sector = blocknr * (blksz >> 9);
24771+
24772+ if (!bio_add_page(bio, page, blksz, 0)) {
24773+ warning("nikita-3452",
24774+ "Single page bio cannot be constructed");
24775+ return ERR_PTR(RETERR(-EINVAL));
24776+ }
24777+
24778+ /* bio -> bi_idx is filled by bio_init() */
24779+ bio->bi_end_io = (rw == READ) ?
24780+ end_bio_single_page_read : end_bio_single_page_write;
24781+
24782+ return bio;
24783+ } else
24784+ return ERR_PTR(RETERR(-ENOMEM));
24785+}
24786+
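
The only non-obvious line in page_bio() above is the sector computation:
bio->bi_sector counts 512-byte units, so with blocksize == pagesize == 4096
each block spans blksz >> 9 == 8 sectors. A standalone check of that
arithmetic (illustrative only, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocknr = 1000;	/* disk block number */
	int blksz = 4096;		/* == PAGE_CACHE_SIZE here */

	/* 512-byte sectors per block: 4096 >> 9 == 8 */
	uint64_t sector = blocknr * (blksz >> 9);
	assert(sector == 8000);
	return 0;
}
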
24787+/* this function is internally called by jnode_make_dirty() */
24788+int reiser4_set_page_dirty_internal(struct page *page)
24789+{
24790+ struct address_space *mapping;
24791+
24792+ mapping = page->mapping;
24793+ BUG_ON(mapping == NULL);
24794+
24795+ if (!TestSetPageDirty(page)) {
24796+ if (mapping_cap_account_dirty(mapping))
24797+ inc_zone_page_state(page, NR_FILE_DIRTY);
24798+
24799+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24800+ }
24801+
24802+ /* znode must be dirty ? */
24803+ if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
24804+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
24805+ return 0;
24806+}
24807+
24808+#if 0
24809+static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24810+{
24811+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24812+ return 1;
24813+ if (ctx->super != s)
24814+ return 1;
24815+ if (get_super_private(s)->entd.tsk == current)
24816+ return 0;
24817+ if (!lock_stack_isclean(&ctx->stack))
24818+ return 0;
24819+ if (ctx->trans->atom != NULL)
24820+ return 0;
24821+ return 1;
24822+}
24823+#endif
24824+
24825+/**
24826+ * reiser4_writepage - writepage of struct address_space_operations
24827+ * @page: page to write
24828+ * @wbc: writeback control passed by the VM
24829+ *
24830+ * Common memory pressure notification: hands the page over to the entd
24831+ * thread for writeout.
24832+ */
24833+int reiser4_writepage(struct page *page,
24834+ struct writeback_control *wbc)
24835+{
24836+ struct super_block *s;
24837+ reiser4_context *ctx;
24838+
24839+ assert("vs-828", PageLocked(page));
24840+
24841+ s = page->mapping->host->i_sb;
24842+ ctx = get_current_context_check();
24843+
24844+ //assert("", can_hit_entd(ctx, s));
24845+ return write_page_by_ent(page, wbc);
24846+}
24847+
24848+/* ->set_page_dirty() method of formatted address_space */
24849+static int formatted_set_page_dirty(struct page *page)
24850+{
24851+ assert("nikita-2173", page != NULL);
24852+ BUG();
24853+ return __set_page_dirty_nobuffers(page);
24854+}
24855+
24856+/* The writepages method of address space operations in reiser4 is used to
24857+ pull pages dirtied via mmap into transactions. Only regular files can
24858+ have such pages. The fake inode is used to access formatted nodes via the
24859+ page cache. As formatted nodes can never be mmaped, the fake inode's
24860+ writepages has nothing to do */
24861+static int
24862+writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24863+{
24864+ return 0;
24865+}
24866+
24867+/* address space operations for the fake inode */
24868+static struct address_space_operations formatted_fake_as_ops = {
24869+ /* Perform a writeback of a single page as a memory-freeing
24870+ * operation. */
24871+ .writepage = reiser4_writepage,
24872+ /* this is called to read formatted node */
24873+ .readpage = formatted_readpage,
24874+ /* ->sync_page() method of fake inode address space operations. Called
24875+ from wait_on_page() and lock_page().
24876+
24877+ This is a most annoyingly misnamed method. Actually it is called
24878+ from wait_on_page_bit() and lock_page() and its purpose is to
24879+ actually start io by jabbing device drivers.
24880+ */
24881+ .sync_page = block_sync_page,
24882+ /* Write back some dirty pages from this mapping. Called from sync.
24883+ called during sync (pdflush) */
24884+ .writepages = writepages_fake,
24885+ /* Set a page dirty */
24886+ .set_page_dirty = formatted_set_page_dirty,
24887+ /* used for read-ahead. Not applicable */
24888+ .readpages = NULL,
24889+ .prepare_write = NULL,
24890+ .commit_write = NULL,
24891+ .bmap = NULL,
24892+ /* called just before page is being detached from inode mapping and
24893+ removed from memory. Called on truncate, cut/squeeze, and
24894+ umount. */
24895+ .invalidatepage = reiser4_invalidatepage,
24896+ /* this is called by shrink_cache() so that file system can try to
24897+ release objects (jnodes, buffers, journal heads) attached to page
24898+ and, maybe, make the page itself freeable.
24899+ */
24900+ .releasepage = reiser4_releasepage,
24901+ .direct_IO = NULL
24902+};
24903+
24904+/* called just before page is released (no longer used by reiser4). Callers:
24905+ jdelete() and extent2tail(). */
24906+void reiser4_drop_page(struct page *page)
24907+{
24908+ assert("nikita-2181", PageLocked(page));
24909+ clear_page_dirty_for_io(page);
24910+ ClearPageUptodate(page);
24911+#if defined(PG_skipped)
24912+ ClearPageSkipped(page);
24913+#endif
24914+ unlock_page(page);
24915+}
24916+
24917+#define JNODE_GANG_SIZE (16)
24918+
24919+/* find all jnodes from the specified range and invalidate them */
24920+static int
24921+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24922+{
24923+ reiser4_inode *info;
24924+ int truncated_jnodes;
24925+ reiser4_tree *tree;
24926+ unsigned long index;
24927+ unsigned long end;
24928+
24929+ if (inode_file_plugin(inode) ==
24930+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
24931+ /*
24932+ * No need to get rid of jnodes here: if the single jnode of
24933+ * page cluster did not have page, then it was found and killed
24934+ * before in
24935+ * truncate_complete_page_cluster()->jput()->jput_final(),
24936+ * otherwise it will be dropped by reiser4_invalidatepage()
24937+ */
24938+ return 0;
24939+ truncated_jnodes = 0;
24940+
24941+ info = reiser4_inode_data(inode);
24942+ tree = reiser4_tree_by_inode(inode);
24943+
24944+ index = from;
24945+ end = from + count;
24946+
24947+ while (1) {
24948+ jnode *gang[JNODE_GANG_SIZE];
24949+ int taken;
24950+ int i;
24951+ jnode *node;
24952+
24953+ assert("nikita-3466", index <= end);
24954+
24955+ read_lock_tree(tree);
24956+ taken =
24957+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
24958+ (void **)gang, index,
24959+ JNODE_GANG_SIZE);
24960+ for (i = 0; i < taken; ++i) {
24961+ node = gang[i];
24962+ if (index_jnode(node) < end)
24963+ jref(node);
24964+ else
24965+ gang[i] = NULL;
24966+ }
24967+ read_unlock_tree(tree);
24968+
24969+ for (i = 0; i < taken; ++i) {
24970+ node = gang[i];
24971+ if (node != NULL) {
24972+ index = max(index, index_jnode(node));
24973+ spin_lock_jnode(node);
24974+ assert("edward-1457", node->pg == NULL);
24975+ /* this is always called after
24976+ truncate_inode_pages_range(). Therefore, here the
24977+ jnode cannot have a page. New pages cannot be
24978+ created, because truncate_jnodes_range() runs
24979+ under exclusive access to the file,
24980+ whereas new page creation requires
24981+ non-exclusive access */
24982+ JF_SET(node, JNODE_HEARD_BANSHEE);
24983+ reiser4_uncapture_jnode(node);
24984+ unhash_unformatted_jnode(node);
24985+ truncated_jnodes++;
24986+ jput(node);
24987+ } else
24988+ break;
24989+ }
24990+ if (i != taken || taken == 0)
24991+ break;
24992+ }
24993+ return truncated_jnodes;
24994+}
24995+
24996+/* Truncating files in reiser4: problems and solutions.
24997+
24998+ VFS calls fs's truncate after it has called truncate_inode_pages()
24999+ to get rid of pages corresponding to part of file being truncated.
25000+ In reiser4 this may leave unallocated extents which do
25001+ not have jnodes. Flush code does not expect that. The solution to this
25002+ problem is straightforward. As VFS truncate is implemented via the
25003+ setattr operation, it seems reasonable to have a ->setattr() that
25004+ will cut the file body. However, flush code also does not expect dirty
25005+ pages without parent items, so it is impossible to cut all items and
25006+ then truncate all pages in two separate steps. We resolve this problem
25007+ by cutting items one by one. Each such fine-grained step, performed
25008+ under a longterm znode lock, calls at the end the ->kill_hook() method
25009+ of the killed item to remove its bound pages and jnodes.
25010+
25011+ The following function is a common part of the mentioned kill hooks.
25012+ Also, it is called before tail-to-extent conversion (to avoid managing
25013+ multiple copies of the data).
25014+*/
25015+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25016+ unsigned long count, int even_cows)
44254afd
MT
25017+{
25018+ loff_t from_bytes, count_bytes;
25019+
25020+ if (count == 0)
25021+ return;
25022+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25023+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25024+
25025+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25026+ truncate_inode_pages_range(mapping, from_bytes,
25027+ from_bytes + count_bytes - 1);
25028+ truncate_jnodes_range(mapping->host, from, count);
25029+}
25030+
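
The shifts above convert the page range [from, from + count) into the
inclusive byte range that truncate_inode_pages_range() expects. A standalone
check for 4K pages (illustrative only): truncating pages [2, 5) maps to bytes
[8192, 20479].

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT_4K 12

int main(void)
{
	uint64_t from = 2, count = 3;	/* pages [2, 5) */
	uint64_t from_bytes  = from  << PAGE_SHIFT_4K;
	uint64_t count_bytes = count << PAGE_SHIFT_4K;

	assert(from_bytes == 8192);
	assert(from_bytes + count_bytes - 1 == 20479);	/* inclusive end */
	return 0;
}
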
25031+/*
25032+ * Local variables:
25033+ * c-indentation-style: "K&R"
25034+ * mode-name: "LC"
25035+ * c-basic-offset: 8
25036+ * tab-width: 8
25037+ * fill-column: 120
25038+ * scroll-step: 1
25039+ * End:
25040+ */
25041diff -urN linux-2.6.22.orig/fs/reiser4/page_cache.h linux-2.6.22/fs/reiser4/page_cache.h
25042--- linux-2.6.22.orig/fs/reiser4/page_cache.h 1970-01-01 03:00:00.000000000 +0300
25043+++ linux-2.6.22/fs/reiser4/page_cache.h 2007-07-29 00:25:34.888699583 +0400
25044@@ -0,0 +1,68 @@
25045+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25046+ * reiser4/README */
25047+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25048+
25049+#if !defined( __REISER4_PAGE_CACHE_H__ )
25050+#define __REISER4_PAGE_CACHE_H__
25051+
25052+#include "forward.h"
25053+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25054+
25055+#include <linux/fs.h> /* for struct super_block, address_space */
25056+#include <linux/mm.h> /* for struct page */
25057+#include <linux/pagemap.h> /* for lock_page() */
25058+#include <linux/vmalloc.h> /* for __vmalloc() */
25059+
25060+extern int reiser4_init_formatted_fake(struct super_block *);
25061+extern void reiser4_done_formatted_fake(struct super_block *);
25062+
25063+extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25064+
25065+extern int reiser4_set_page_dirty_internal(struct page *);
25066+
25067+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25068+
25069+extern void reiser4_wait_page_writeback(struct page *);
25070+static inline void lock_and_wait_page_writeback(struct page *page)
25071+{
25072+ lock_page(page);
25073+ if (unlikely(PageWriteback(page)))
25074+ reiser4_wait_page_writeback(page);
25075+}
25076+
25077+#define jprivate(page) ((jnode *)page_private(page))
25078+
25079+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25080+extern void reiser4_drop_page(struct page *);
25081+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25082+ unsigned long count, int even_cows);
25083+extern void capture_reiser4_inodes(struct super_block *,
25084+ struct writeback_control *);
25085+static inline void * reiser4_vmalloc (unsigned long size)
25086+{
25087+ return __vmalloc(size,
25088+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25089+ PAGE_KERNEL);
25090+}
25091+
25092+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25093+
25094+#if REISER4_DEBUG
25095+extern void print_page(const char *prefix, struct page *page);
25096+#else
25097+#define print_page(prf, p) noop
25098+#endif
25099+
25100+/* __REISER4_PAGE_CACHE_H__ */
25101+#endif
25102+
25103+/* Make Linus happy.
25104+ Local variables:
25105+ c-indentation-style: "K&R"
25106+ mode-name: "LC"
25107+ c-basic-offset: 8
25108+ tab-width: 8
25109+ fill-column: 120
25110+ scroll-step: 1
25111+ End:
25112+*/
25113diff -urN linux-2.6.22.orig/fs/reiser4/plugin/cluster.c linux-2.6.22/fs/reiser4/plugin/cluster.c
25114--- linux-2.6.22.orig/fs/reiser4/plugin/cluster.c 1970-01-01 03:00:00.000000000 +0300
25115+++ linux-2.6.22/fs/reiser4/plugin/cluster.c 2007-07-29 00:25:34.892700618 +0400
25116@@ -0,0 +1,71 @@
25117+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25118+
25119+/* Contains reiser4 cluster plugins (see
25120+ http://www.namesys.com/cryptcompress_design.html
25121+ "Concepts of clustering" for details). */
25122+
25123+#include "plugin_header.h"
25124+#include "plugin.h"
25125+#include "../inode.h"
25126+
25127+static int change_cluster(struct inode *inode,
25128+ reiser4_plugin * plugin,
25129+ pset_member memb)
25130+{
25131+ assert("edward-1324", inode != NULL);
25132+ assert("edward-1325", plugin != NULL);
25133+ assert("edward-1326", is_reiser4_inode(inode));
25134+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25135+
25136+ /* Can't change the cluster plugin of an already existing regular file. */
25137+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25138+ return RETERR(-EINVAL);
25139+
25140+ /* If matches, nothing to change. */
25141+ if (inode_cluster_plugin(inode) != NULL &&
25142+ inode_cluster_plugin(inode)->h.id == plugin->h.id)
25143+ return 0;
25144+
25145+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25146+ PSET_CLUSTER, plugin);
25147+}
25148+
25149+static reiser4_plugin_ops cluster_plugin_ops = {
25150+ .init = NULL,
25151+ .load = NULL,
25152+ .save_len = NULL,
25153+ .save = NULL,
25154+ .change = &change_cluster
25155+};
25156+
25157+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25158+ [CLUSTER_ ## ID ## _ID] = { \
25159+ .h = { \
25160+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25161+ .id = CLUSTER_ ## ID ## _ID, \
25162+ .pops = &cluster_plugin_ops, \
25163+ .label = LABEL, \
25164+ .desc = DESC, \
25165+ .linkage = {NULL, NULL} \
25166+ }, \
25167+ .shift = SHIFT \
25168+ }
25169+
25170+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25171+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25172+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25173+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25174+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25175+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25176+};
25177+
25178+/*
25179+ Local variables:
25180+ c-indentation-style: "K&R"
25181+ mode-name: "LC"
25182+ c-basic-offset: 8
25183+ tab-width: 8
25184+ fill-column: 120
25185+ scroll-step: 1
25186+ End:
25187+*/
25188diff -urN linux-2.6.22.orig/fs/reiser4/plugin/cluster.h linux-2.6.22/fs/reiser4/plugin/cluster.h
25189--- linux-2.6.22.orig/fs/reiser4/plugin/cluster.h 1970-01-01 03:00:00.000000000 +0300
25190+++ linux-2.6.22/fs/reiser4/plugin/cluster.h 2007-07-29 00:25:34.892700618 +0400
25191@@ -0,0 +1,399 @@
25192+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25193+
25194+/* This file contains size/offset translators, modulators
25195+ and other helper functions. */
25196+
25197+#if !defined( __FS_REISER4_CLUSTER_H__ )
25198+#define __FS_REISER4_CLUSTER_H__
25199+
25200+#include "../inode.h"
25201+
25202+static inline int inode_cluster_shift(struct inode *inode)
25203+{
25204+ assert("edward-92", inode != NULL);
25205+ assert("edward-93", reiser4_inode_data(inode) != NULL);
25206+
25207+ return inode_cluster_plugin(inode)->shift;
25208+}
25209+
25210+static inline unsigned cluster_nrpages_shift(struct inode *inode)
25211+{
25212+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25213+}
25214+
25215+/* cluster size in page units */
25216+static inline unsigned cluster_nrpages(struct inode *inode)
25217+{
25218+ return 1U << cluster_nrpages_shift(inode);
25219+}
25220+
25221+static inline size_t inode_cluster_size(struct inode *inode)
25222+{
25223+ assert("edward-96", inode != NULL);
25224+
25225+ return 1U << inode_cluster_shift(inode);
25226+}
25227+
25228+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25229+{
25230+ return idx >> cluster_nrpages_shift(inode);
25231+}
25232+
25233+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25234+{
25235+ return idx << cluster_nrpages_shift(inode);
25236+}
25237+
25238+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25239+{
25240+ return clust_to_pg(pg_to_clust(idx, inode), inode);
25241+}
25242+
25243+static inline pgoff_t off_to_pg(loff_t off)
25244+{
25245+ return (off >> PAGE_CACHE_SHIFT);
25246+}
25247+
25248+static inline loff_t pg_to_off(pgoff_t idx)
25249+{
25250+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25251+}
25252+
25253+static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25254+{
25255+ return off >> inode_cluster_shift(inode);
25256+}
25257+
25258+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25259+{
25260+ return (loff_t) idx << inode_cluster_shift(inode);
25261+}
25262+
25263+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25264+{
25265+ return clust_to_off(off_to_clust(off, inode), inode);
25266+}
25267+
25268+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25269+{
25270+ return clust_to_pg(off_to_clust(off, inode), inode);
25271+}
25272+
25273+static inline unsigned off_to_pgoff(loff_t off)
25274+{
25275+ return off & (PAGE_CACHE_SIZE - 1);
25276+}
25277+
25278+static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25279+{
25280+ return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25281+}
25282+
25283+static inline pgoff_t offset_in_clust(struct page * page)
25284+{
25285+ assert("edward-1488", page != NULL);
25286+ assert("edward-1489", page->mapping != NULL);
25287+
25288+ return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25289+}
25290+
25291+static inline int first_page_in_cluster(struct page * page)
25292+{
25293+ return offset_in_clust(page) == 0;
25294+}
25295+
25296+static inline int last_page_in_cluster(struct page * page)
25297+{
25298+ return offset_in_clust(page) ==
25299+ cluster_nrpages(page->mapping->host) - 1;
25300+}
25301+
25302+static inline unsigned
25303+pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25304+{
25305+ return off_to_cloff(pg_to_off(idx), inode);
25306+}
25307+
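
All translators above reduce to shifts and masks derived from two constants.
A worked example (illustrative, not part of the patch): with a 64K logical
cluster (shift 16) and 4K pages (shift 12), a cluster holds 16 pages, page 37
belongs to cluster 2, and byte offset 70000 falls 4464 bytes into cluster 1:

#include <assert.h>

int main(void)
{
	int cluster_shift = 16;			/* inode_cluster_shift() */
	int page_shift = 12;			/* PAGE_CACHE_SHIFT */
	int nrpages_shift = cluster_shift - page_shift;	/* 4 */

	assert((1 << nrpages_shift) == 16);	/* cluster_nrpages() */
	assert((37 >> nrpages_shift) == 2);	/* pg_to_clust(37) */
	assert((2 << nrpages_shift) == 32);	/* clust_to_pg(2) */
	assert((70000 >> cluster_shift) == 1);	/* off_to_clust(70000) */
	assert((70000 & ((1 << cluster_shift) - 1)) == 4464); /* off_to_cloff */
	return 0;
}
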
25308+/*********************** Size translators **************************/
25309+
25310+/* Translate linear size.
25311+ * New units are (1 << @blkbits) times larger than old ones.
25312+ * In other words, calculate the number of logical blocks occupied
25313+ * by @count elements.
25314+ */
25315+static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25316+{
25317+ return (count + (1UL << blkbits) - 1) >> blkbits;
25318+}
25319+
25320+/* size in pages */
25321+static inline pgoff_t size_in_pages(loff_t size)
25322+{
25323+ return size_in_blocks(size, PAGE_CACHE_SHIFT);
25324+}
25325+
25326+/* size in logical clusters */
25327+static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25328+{
25329+ return size_in_blocks(size, inode_cluster_shift(inode));
25330+}
25331+
25332+/* size in pages to the size in page clusters */
25333+static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25334+{
25335+ return size_in_blocks(size, cluster_nrpages_shift(inode));
25336+}
25337+
25338+/*********************** Size modulators ***************************/
25339+
25340+/*
25341+ Modulate linear size by nominated block size and offset.
25342+
25343+ The "finite" function (which is zero almost everywhere).
25344+ What is the height of the figure at position @pos,
25345+ when trying to construct a rectangle of height (1 << @blkbits)
25346+ and area @size.
25347+
25348+ ******
25349+ *******
25350+ *******
25351+ *******
25352+ ----------> pos
25353+*/
25354+static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25355+{
25356+ unsigned end = size >> blkbits;
25357+ if (pos < end)
25358+ return 1U << blkbits;
25359+ if (unlikely(pos > end))
25360+ return 0;
25361+ return size & ~(~0ull << blkbits);
25362+}
25363+
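
In other words, __mbb() answers "how many bytes of a @size-byte object land
in block @pos": a full block below the boundary, the remainder at the
boundary block, zero above it. A standalone restatement (illustrative only)
with 4K blocks and a 10000-byte file, where blocks 0-1 are full and block 2
holds the 1808-byte tail:

#include <assert.h>

static unsigned mbb(long long size, unsigned long pos, int blkbits)
{
	unsigned long end = size >> blkbits;

	if (pos < end)
		return 1U << blkbits;		/* fully covered block */
	if (pos > end)
		return 0;			/* past end of object */
	return size & ~(~0ULL << blkbits);	/* partial boundary block */
}

int main(void)
{
	assert(mbb(10000, 0, 12) == 4096);
	assert(mbb(10000, 1, 12) == 4096);
	assert(mbb(10000, 2, 12) == 1808);	/* 10000 - 2*4096 */
	assert(mbb(10000, 3, 12) == 0);
	return 0;
}
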
25364+/* the same as above, but block size is page size */
25365+static inline unsigned __mbp(loff_t size, pgoff_t pos)
25366+{
25367+ return __mbb(size, pos, PAGE_CACHE_SHIFT);
25368+}
25369+
25370+/* number of file's bytes in the nominated logical cluster */
25371+static inline unsigned lbytes(cloff_t index, struct inode * inode)
25372+{
25373+ return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25374+}
25375+
25376+/* number of file's bytes in the nominated page */
25377+static inline unsigned pbytes(pgoff_t index, struct inode * inode)
25378+{
25379+ return __mbp(i_size_read(inode), index);
25380+}
25381+
25382+/* return true, if logical cluster is not occupied by the file */
25383+static inline int new_logical_cluster(struct cluster_handle * clust,
25384+ struct inode *inode)
25385+{
25386+ return clust_to_off(clust->index, inode) >= i_size_read(inode);
25387+}
25388+
25389+/* return true, if pages @p1 and @p2 are of the same page cluster */
25390+static inline int same_page_cluster(struct page * p1, struct page * p2)
25391+{
25392+ assert("edward-1490", p1 != NULL);
25393+ assert("edward-1491", p2 != NULL);
25394+ assert("edward-1492", p1->mapping != NULL);
25395+ assert("edward-1493", p2->mapping != NULL);
25396+
25397+ return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25398+ pg_to_clust(page_index(p2), p2->mapping->host));
25399+}
25400+
25401+static inline int cluster_is_complete(struct cluster_handle * clust,
25402+ struct inode * inode)
25403+{
25404+ return clust->tc.lsize == inode_cluster_size(inode);
25405+}
25406+
25407+static inline void reiser4_slide_init(struct reiser4_slide * win)
25408+{
25409+ assert("edward-1084", win != NULL);
25410+ memset(win, 0, sizeof *win);
25411+}
25412+
25413+static inline tfm_action
25414+cluster_get_tfm_act(struct tfm_cluster * tc)
25415+{
25416+ assert("edward-1356", tc != NULL);
25417+ return tc->act;
25418+}
25419+
25420+static inline void
25421+cluster_set_tfm_act(struct tfm_cluster * tc, tfm_action act)
25422+{
25423+ assert("edward-1356", tc != NULL);
25424+ tc->act = act;
25425+}
25426+
25427+static inline void cluster_init_act(struct cluster_handle * clust,
25428+ tfm_action act,
25429+ struct reiser4_slide * window)
25430+{
25431+ assert("edward-84", clust != NULL);
25432+ memset(clust, 0, sizeof *clust);
25433+ cluster_set_tfm_act(&clust->tc, act);
25434+ clust->dstat = INVAL_DISK_CLUSTER;
25435+ clust->win = window;
25436+}
25437+
25438+static inline void cluster_init_read(struct cluster_handle * clust,
25439+ struct reiser4_slide * window)
25440+{
25441+ cluster_init_act (clust, TFMA_READ, window);
25442+}
25443+
25444+static inline void cluster_init_write(struct cluster_handle * clust,
25445+ struct reiser4_slide * window)
25446+{
25447+ cluster_init_act (clust, TFMA_WRITE, window);
25448+}
25449+
25450+/* true if @p1 and @p2 are items of the same disk cluster */
25451+static inline int same_disk_cluster(const coord_t * p1, const coord_t * p2)
25452+{
25453+ /* drop this if you have other items to aggregate */
25454+ assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25455+
25456+ return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25457+}
25458+
25459+static inline int dclust_get_extension_dsize(hint_t * hint)
25460+{
25461+ return hint->ext_coord.extension.ctail.dsize;
25462+}
25463+
25464+static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25465+{
25466+ hint->ext_coord.extension.ctail.dsize = dsize;
25467+}
25468+
25469+static inline int dclust_get_extension_shift(hint_t * hint)
25470+{
25471+ return hint->ext_coord.extension.ctail.shift;
25472+}
25473+
25474+static inline int dclust_get_extension_ncount(hint_t * hint)
25475+{
25476+ return hint->ext_coord.extension.ctail.ncount;
25477+}
25478+
25479+static inline void dclust_inc_extension_ncount(hint_t * hint)
25480+{
25481+ hint->ext_coord.extension.ctail.ncount ++;
25482+}
25483+
25484+static inline void dclust_init_extension(hint_t * hint)
25485+{
25486+ memset(&hint->ext_coord.extension.ctail, 0,
25487+ sizeof(hint->ext_coord.extension.ctail));
25488+}
25489+
25490+static inline int hint_is_unprepped_dclust(hint_t * hint)
25491+{
25492+ assert("edward-1451", hint_is_valid(hint));
25493+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25494+}
25495+
25496+static inline void coord_set_between_clusters(coord_t * coord)
25497+{
25498+#if REISER4_DEBUG
25499+ int result;
25500+ result = zload(coord->node);
25501+ assert("edward-1296", !result);
25502+#endif
25503+ if (!coord_is_between_items(coord)) {
25504+ coord->between = AFTER_ITEM;
25505+ coord->unit_pos = 0;
25506+ }
25507+#if REISER4_DEBUG
25508+ zrelse(coord->node);
25509+#endif
25510+}
25511+
25512+int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25513+int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25514+ znode_lock_mode mode);
25515+int checkout_logical_cluster(struct cluster_handle *, jnode *, struct inode *);
25516+int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25517+void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25518+ int even_cows);
25519+void invalidate_hint_cluster(struct cluster_handle * clust);
25520+void put_hint_cluster(struct cluster_handle * clust, struct inode *inode,
25521+ znode_lock_mode mode);
25522+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode * inode,
25523+ znode_lock_mode lock_mode);
25524+void reset_cluster_params(struct cluster_handle * clust);
25525+int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
25526+ int count);
25527+int prepare_page_cluster(struct inode *inode, struct cluster_handle * clust,
25528+ rw_op rw);
25529+void __put_page_cluster(int from, int to, struct page ** pages,
25530+ struct inode * inode);
25531+void put_page_cluster(struct cluster_handle * clust,
25532+ struct inode * inode, rw_op rw);
25533+void put_cluster_handle(struct cluster_handle * clust);
25534+int grab_tfm_stream(struct inode *inode, struct tfm_cluster * tc, tfm_stream_id id);
25535+int tfm_cluster_is_uptodate(struct tfm_cluster * tc);
25536+void tfm_cluster_set_uptodate(struct tfm_cluster * tc);
25537+void tfm_cluster_clr_uptodate(struct tfm_cluster * tc);
25538+
25539+/* move cluster handle to the target position
25540+ specified by the page of index @pgidx */
25541+static inline void move_cluster_forward(struct cluster_handle * clust,
25542+ struct inode *inode,
25543+ pgoff_t pgidx)
25544+{
25545+ assert("edward-1297", clust != NULL);
25546+ assert("edward-1298", inode != NULL);
25547+
25548+ reset_cluster_params(clust);
25549+ if (clust->index_valid &&
25550+ /* Hole in the indices. Hint became invalid and cannot be
25551+ used by find_cluster_item() even if seal/node versions
25552+ will coincide */
25553+ pg_to_clust(pgidx, inode) != clust->index + 1) {
25554+ reiser4_unset_hint(clust->hint);
25555+ invalidate_hint_cluster(clust);
25556+ }
25557+ clust->index = pg_to_clust(pgidx, inode);
25558+ clust->index_valid = 1;
25559+}
25560+
25561+static inline int alloc_clust_pages(struct cluster_handle * clust,
25562+ struct inode *inode)
25563+{
25564+ assert("edward-791", clust != NULL);
25565+ assert("edward-792", inode != NULL);
25566+ clust->pages =
25567+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25568+ reiser4_ctx_gfp_mask_get());
25569+ if (!clust->pages)
25570+ return -ENOMEM;
25571+ return 0;
25572+}
25573+
25574+static inline void free_clust_pages(struct cluster_handle * clust)
25575+{
25576+ kfree(clust->pages);
25577+}
25578+
25579+#endif /* __FS_REISER4_CLUSTER_H__ */
25580+
25581+/* Make Linus happy.
25582+ Local variables:
25583+ c-indentation-style: "K&R"
25584+ mode-name: "LC"
25585+ c-basic-offset: 8
25586+ tab-width: 8
25587+ fill-column: 120
25588+ scroll-step: 1
25589+ End:
25590+*/
25591diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.22/fs/reiser4/plugin/compress/compress.c
25592--- linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 03:00:00.000000000 +0300
25593+++ linux-2.6.22/fs/reiser4/plugin/compress/compress.c 2007-07-29 00:25:34.892700618 +0400
25594@@ -0,0 +1,381 @@
25595+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25596+/* reiser4 compression transform plugins */
25597+
25598+#include "../../debug.h"
25599+#include "../../inode.h"
25600+#include "../plugin.h"
25601+#include "minilzo.h"
25602+
25603+#include <linux/zlib.h>
25604+#include <linux/types.h>
25605+#include <linux/hardirq.h>
25606+
25607+static int change_compression(struct inode *inode,
25608+ reiser4_plugin * plugin,
25609+ pset_member memb)
25610+{
25611+ assert("edward-1316", inode != NULL);
25612+ assert("edward-1317", plugin != NULL);
25613+ assert("edward-1318", is_reiser4_inode(inode));
25614+ assert("edward-1319",
25615+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25616+
25617+ /* cannot change compression plugin of already existing regular object */
25618+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25619+ return RETERR(-EINVAL);
25620+
25621+ /* If matches, nothing to change. */
25622+ if (inode_compression_plugin(inode) != NULL &&
25623+ inode_compression_plugin(inode)->h.id == plugin->h.id)
25624+ return 0;
25625+
25626+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25627+ PSET_COMPRESSION, plugin);
25628+}
25629+
25630+static reiser4_plugin_ops compression_plugin_ops = {
25631+ .init = NULL,
25632+ .load = NULL,
25633+ .save_len = NULL,
25634+ .save = NULL,
25635+ .change = &change_compression
25636+};
25637+
25638+/******************************************************************************/
25639+/* gzip1 compression */
25640+/******************************************************************************/
25641+
25642+#define GZIP1_DEF_LEVEL Z_BEST_SPEED
25643+#define GZIP1_DEF_WINBITS 15
25644+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
25645+
25646+static int gzip1_init(void)
25647+{
25648+ int ret = -EINVAL;
25649+#if REISER4_ZLIB
25650+ ret = 0;
25651+#endif
25652+ if (ret == -EINVAL)
25653+ warning("edward-1337", "Zlib not compiled into kernel");
25654+ return ret;
25655+}
25656+
25657+static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25658+{
25659+ return 0;
25660+}
25661+
25662+static coa_t gzip1_alloc(tfm_action act)
25663+{
25664+ coa_t coa = NULL;
25665+#if REISER4_ZLIB
25666+ int ret = 0;
25667+ switch (act) {
25668+ case TFMA_WRITE: /* compress */
25669+ coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25670+ if (!coa) {
25671+ ret = -ENOMEM;
25672+ break;
25673+ }
25674+ memset(coa, 0, zlib_deflate_workspacesize());
25675+ break;
25676+ case TFMA_READ: /* decompress */
25677+ coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25678+ if (!coa) {
25679+ ret = -ENOMEM;
25680+ break;
25681+ }
25682+ memset(coa, 0, zlib_inflate_workspacesize());
25683+ break;
25684+ default:
25685+ impossible("edward-767",
25686+ "trying to alloc workspace for unknown tfm action");
25687+ }
25688+ if (ret) {
25689+ warning("edward-768",
25690+ "alloc workspace for gzip1 (tfm action = %d) failed\n",
25691+ act);
25692+ return ERR_PTR(ret);
25693+ }
25694+#endif
25695+ return coa;
25696+}
25697+
25698+static void gzip1_free(coa_t coa, tfm_action act)
25699+{
25700+ assert("edward-769", coa != NULL);
25701+
25702+ switch (act) {
25703+ case TFMA_WRITE: /* compress */
25704+ vfree(coa);
25705+ break;
25706+ case TFMA_READ: /* decompress */
25707+ vfree(coa);
25708+ break;
25709+ default:
25710+ impossible("edward-770", "unknown tfm action");
25711+ }
25712+ return;
25713+}
25714+
25715+static int gzip1_min_size_deflate(void)
25716+{
25717+ return 64;
25718+}
25719+
25720+static void
25721+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25722+ __u8 * dst_first, unsigned *dst_len)
25723+{
25724+#if REISER4_ZLIB
25725+ int ret = 0;
25726+ struct z_stream_s stream;
25727+
25728+ memset(&stream, 0, sizeof(stream));
25729+
25730+ assert("edward-842", coa != NULL);
25731+ assert("edward-875", src_len != 0);
25732+
25733+ stream.workspace = coa;
25734+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25735+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25736+ Z_DEFAULT_STRATEGY);
25737+ if (ret != Z_OK) {
25738+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25739+ goto rollback;
25740+ }
25741+ ret = zlib_deflateReset(&stream);
25742+ if (ret != Z_OK) {
25743+ warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25744+ goto rollback;
25745+ }
25746+ stream.next_in = src_first;
25747+ stream.avail_in = src_len;
25748+ stream.next_out = dst_first;
25749+ stream.avail_out = *dst_len;
25750+
25751+ ret = zlib_deflate(&stream, Z_FINISH);
25752+ if (ret != Z_STREAM_END) {
25753+ if (ret != Z_OK)
25754+ warning("edward-773",
25755+ "zlib_deflate returned %d\n", ret);
25756+ goto rollback;
25757+ }
25758+ *dst_len = stream.total_out;
25759+ return;
25760+ rollback:
25761+ *dst_len = src_len;
25762+#endif
25763+ return;
25764+}
25765+
25766+static void
25767+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25768+ __u8 * dst_first, unsigned *dst_len)
25769+{
25770+#if REISER4_ZLIB
25771+ int ret = 0;
25772+ struct z_stream_s stream;
25773+
25774+ memset(&stream, 0, sizeof(stream));
25775+
25776+ assert("edward-843", coa != NULL);
25777+ assert("edward-876", src_len != 0);
25778+
25779+ stream.workspace = coa;
25780+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25781+ if (ret != Z_OK) {
25782+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25783+ return;
25784+ }
25785+ ret = zlib_inflateReset(&stream);
25786+ if (ret != Z_OK) {
25787+ warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25788+ return;
25789+ }
25790+
25791+ stream.next_in = src_first;
25792+ stream.avail_in = src_len;
25793+ stream.next_out = dst_first;
25794+ stream.avail_out = *dst_len;
25795+
25796+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25797+ /*
25798+ * Work around a bug in zlib, which sometimes wants to taste an extra
25799+ * byte when being used in the (undocumented) raw deflate mode.
25800+ * (From USAGI).
25801+ */
25802+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25803+ u8 zerostuff = 0;
25804+ stream.next_in = &zerostuff;
25805+ stream.avail_in = 1;
25806+ ret = zlib_inflate(&stream, Z_FINISH);
25807+ }
25808+ if (ret != Z_STREAM_END) {
25809+ warning("edward-776", "zlib_inflate returned %d\n", ret);
25810+ return;
25811+ }
25812+ *dst_len = stream.total_out;
25813+#endif
25814+ return;
25815+}
25816+
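
The negative windowBits passed to zlib_deflateInit2()/zlib_inflateInit2()
above selects the (undocumented) raw deflate mode without the zlib header,
which is also why the extra-byte workaround is needed. Userspace zlib follows
the same convention; a minimal round-trip sketch (illustrative only, not part
of the patch; link with -lz):

#include <assert.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	const unsigned char src[] = "reiser4 reiser4 reiser4 reiser4";
	unsigned char packed[128], out[128];
	z_stream d = {0}, i = {0};	/* zeroed: default allocators */

	/* raw deflate: windowBits == -15, as in gzip1_compress() */
	assert(deflateInit2(&d, Z_BEST_SPEED, Z_DEFLATED, -15, 8,
			    Z_DEFAULT_STRATEGY) == Z_OK);
	d.next_in = (unsigned char *)src; d.avail_in = sizeof(src);
	d.next_out = packed;              d.avail_out = sizeof(packed);
	assert(deflate(&d, Z_FINISH) == Z_STREAM_END);

	/* raw inflate: same negative windowBits, no header expected */
	assert(inflateInit2(&i, -15) == Z_OK);
	i.next_in = packed;               i.avail_in = d.total_out;
	i.next_out = out;                 i.avail_out = sizeof(out);
	assert(inflate(&i, Z_FINISH) == Z_STREAM_END);

	assert(memcmp(src, out, sizeof(src)) == 0);
	deflateEnd(&d);
	inflateEnd(&i);
	return 0;
}
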
25817+/******************************************************************************/
25818+/* lzo1 compression */
25819+/******************************************************************************/
25820+
25821+static int lzo1_init(void)
25822+{
25823+ int ret;
25824+ ret = lzo_init();
25825+ if (ret != LZO_E_OK)
25826+ warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
25827+ return ret;
25828+}
25829+
25830+static int lzo1_overrun(unsigned in_len)
25831+{
25832+ return in_len / 64 + 16 + 3;
25833+}
25834+
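
lzo1_overrun() reserves the documented LZO worst-case expansion, so the
destination buffer cannot overflow even on wholly incompressible input; e.g.
a 64K cluster needs 1043 bytes of slack. A standalone check of the bound
(illustrative only):

#include <assert.h>

int main(void)
{
	unsigned in_len = 64 * 1024;
	unsigned overrun = in_len / 64 + 16 + 3;	/* same formula as above */

	assert(overrun == 1043);
	return 0;
}
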
25835+#define LZO_HEAP_SIZE(size) \
25836+ sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
25837+
25838+static coa_t lzo1_alloc(tfm_action act)
25839+{
25840+ int ret = 0;
25841+ coa_t coa = NULL;
25842+
25843+ switch (act) {
25844+ case TFMA_WRITE: /* compress */
25845+ coa = reiser4_vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25846+ if (!coa) {
25847+ ret = -ENOMEM;
25848+ break;
25849+ }
25850+ memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
25851+ case TFMA_READ: /* decompress */
25852+ break;
25853+ default:
25854+ impossible("edward-877",
25855+ "trying to alloc workspace for unknown tfm action");
25856+ }
25857+ if (ret) {
25858+ warning("edward-878",
25859+ "alloc workspace for lzo1 (tfm action = %d) failed\n",
25860+ act);
25861+ return ERR_PTR(ret);
25862+ }
25863+ return coa;
25864+}
25865+
25866+static void lzo1_free(coa_t coa, tfm_action act)
25867+{
25868+ assert("edward-879", coa != NULL);
25869+
25870+ switch (act) {
25871+	case TFMA_WRITE: /* compress */
25872+ vfree(coa);
25873+ break;
25874+	case TFMA_READ: /* decompress */
25875+ impossible("edward-1304",
25876+ "trying to free non-allocated workspace");
25877+ default:
25878+ impossible("edward-880", "unknown tfm action");
25879+ }
25880+ return;
25881+}
25882+
25883+static int lzo1_min_size_deflate(void)
25884+{
25885+ return 256;
25886+}
25887+
25888+static void
25889+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25890+ __u8 * dst_first, unsigned *dst_len)
25891+{
25892+ int result;
25893+
25894+ assert("edward-846", coa != NULL);
25895+ assert("edward-847", src_len != 0);
25896+
25897+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
25898+ if (result != LZO_E_OK) {
25899+ warning("edward-849", "lzo1x_1_compress failed\n");
25900+ goto out;
25901+ }
25902+ if (*dst_len >= src_len) {
25903+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
25904+ goto out;
25905+ }
25906+ return;
25907+ out:
25908+ *dst_len = src_len;
25909+ return;
25910+}
25911+
25912+static void
25913+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25914+ __u8 * dst_first, unsigned *dst_len)
25915+{
25916+ int result;
25917+
25918+ assert("edward-851", coa == NULL);
25919+ assert("edward-852", src_len != 0);
25920+
25921+ result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
25922+ if (result != LZO_E_OK)
25923+ warning("edward-853", "lzo1x_1_decompress failed\n");
25924+ return;
25925+}
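The compress/decompress pair above follows a store-raw fallback: if the compressor fails or the output is not smaller than the input, *dst_len is left at src_len and the cluster is written uncompressed. A userspace sketch of the same convention against liblzo2 (assumed; build with -llzo2, whose API matches the miniLZO calls used here):

#include <stdio.h>
#include <string.h>
#include <lzo/lzo1x.h>

int main(void)
{
	static unsigned char in[4096];	/* zero-filled: highly compressible */
	unsigned char out[4096 + 4096 / 64 + 16 + 3];	/* src + overrun */
	unsigned char back[4096];
	lzo_uint out_len = sizeof(out), back_len = sizeof(back);
	/* aligned workspace, sized as LZO_HEAP_SIZE() does above */
	static lzo_align_t wrk[(LZO1X_1_MEM_COMPRESS + sizeof(lzo_align_t) - 1)
			       / sizeof(lzo_align_t)];

	if (lzo_init() != LZO_E_OK)
		return 1;
	if (lzo1x_1_compress(in, sizeof(in), out, &out_len, wrk) != LZO_E_OK)
		return 1;
	if (out_len >= sizeof(in)) {
		puts("incompressible: store the cluster raw");
		return 0;
	}
	if (lzo1x_decompress(out, out_len, back, &back_len, NULL) != LZO_E_OK ||
	    back_len != sizeof(in) || memcmp(in, back, back_len) != 0)
		return 1;
	printf("%u -> %lu bytes, roundtrip ok\n",
	       (unsigned)sizeof(in), (unsigned long)out_len);
	return 0;
}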
25926+
25927+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25928+ [LZO1_COMPRESSION_ID] = {
25929+ .h = {
25930+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25931+ .id = LZO1_COMPRESSION_ID,
25932+ .pops = &compression_plugin_ops,
25933+ .label = "lzo1",
25934+ .desc = "lzo1 compression transform",
25935+ .linkage = {NULL, NULL}
25936+ },
25937+ .init = lzo1_init,
25938+ .overrun = lzo1_overrun,
25939+ .alloc = lzo1_alloc,
25940+ .free = lzo1_free,
25941+ .min_size_deflate = lzo1_min_size_deflate,
25942+ .checksum = reiser4_adler32,
25943+ .compress = lzo1_compress,
25944+ .decompress = lzo1_decompress
25945+ },
25946+ [GZIP1_COMPRESSION_ID] = {
25947+ .h = {
25948+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25949+ .id = GZIP1_COMPRESSION_ID,
25950+ .pops = &compression_plugin_ops,
25951+ .label = "gzip1",
25952+ .desc = "gzip1 compression transform",
25953+ .linkage = {NULL, NULL}
25954+ },
25955+ .init = gzip1_init,
25956+ .overrun = gzip1_overrun,
25957+ .alloc = gzip1_alloc,
25958+ .free = gzip1_free,
25959+ .min_size_deflate = gzip1_min_size_deflate,
25960+		.checksum = reiser4_adler32,
25961+ .compress = gzip1_compress,
25962+ .decompress = gzip1_decompress
25963+ }
25964+};
25965+
25966+/*
25967+ Local variables:
25968+ c-indentation-style: "K&R"
25969+ mode-name: "LC"
25970+ c-basic-offset: 8
25971+ tab-width: 8
25972+ fill-column: 120
25973+ scroll-step: 1
25974+ End:
25975+*/
25976diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.22/fs/reiser4/plugin/compress/compress.h
25977--- linux-2.6.22.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 03:00:00.000000000 +0300
25978+++ linux-2.6.22/fs/reiser4/plugin/compress/compress.h 2007-07-29 00:25:34.892700618 +0400
25979@@ -0,0 +1,43 @@
25980+#if !defined( __FS_REISER4_COMPRESS_H__ )
25981+#define __FS_REISER4_COMPRESS_H__
25982+
25983+#include <linux/types.h>
25984+#include <linux/string.h>
25985+
25986+/* transform direction */
25987+typedef enum {
25988+ TFMA_READ, /* decrypt, decompress */
25989+ TFMA_WRITE, /* encrypt, compress */
25990+ TFMA_LAST
25991+} tfm_action;
25992+
25993+/* supported compression algorithms */
25994+typedef enum {
25995+ LZO1_COMPRESSION_ID,
25996+ GZIP1_COMPRESSION_ID,
25997+ LAST_COMPRESSION_ID,
25998+} reiser4_compression_id;
25999+
26000+/* the same as pgoff, but units are page clusters */
26001+typedef unsigned long cloff_t;
26002+
26003+/* working data of a (de)compression algorithm */
26004+typedef void *coa_t;
26005+
26006+/* table for all supported (de)compression algorithms */
26007+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
26008+
26009+__u32 reiser4_adler32(char *data, __u32 len);
26010+
26011+#endif /* __FS_REISER4_COMPRESS_H__ */
26012+
26013+/* Make Linus happy.
26014+ Local variables:
26015+ c-indentation-style: "K&R"
26016+ mode-name: "LC"
26017+ c-basic-offset: 8
26018+ tab-width: 8
26019+ fill-column: 120
26020+ scroll-step: 1
26021+ End:
26022+*/
26023diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.22/fs/reiser4/plugin/compress/compress_mode.c
26024--- linux-2.6.22.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 03:00:00.000000000 +0300
26025+++ linux-2.6.22/fs/reiser4/plugin/compress/compress_mode.c 2007-07-29 00:25:34.892700618 +0400
26026@@ -0,0 +1,162 @@
26027+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26028+/* This file contains Reiser4 compression mode plugins.
26029+
26030+ A compression mode plugin is a set of handlers called by the
26031+ compressor at flush time. They implement heuristics, including
26032+ ones that avoid compressing incompressible data; see
26033+ http://www.namesys.com/cryptcompress_design.html for more details.
26034+*/
26035+#include "../../inode.h"
26036+#include "../plugin.h"
26037+
26038+static int should_deflate_none(struct inode * inode, cloff_t index)
26039+{
26040+ return 0;
26041+}
26042+
26043+static int should_deflate_common(struct inode * inode, cloff_t index)
26044+{
26045+ return compression_is_on(cryptcompress_inode_data(inode));
26046+}
26047+
26048+static int discard_hook_ultim(struct inode *inode, cloff_t index)
26049+{
26050+	turn_off_compression(cryptcompress_inode_data(inode));
26051+ return 0;
26052+}
26053+
26054+static int discard_hook_lattd(struct inode *inode, cloff_t index)
26055+{
26056+ struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26057+
26058+ assert("edward-1462",
26059+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26060+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26061+
26062+ turn_off_compression(info);
26063+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26064+ set_lattice_factor(info, get_lattice_factor(info) << 1);
26065+ return 0;
26066+}
26067+
26068+static int accept_hook_lattd(struct inode *inode, cloff_t index)
26069+{
26070+ turn_on_compression(cryptcompress_inode_data(inode));
26071+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26072+ return 0;
26073+}
26074+
26075+/* Check on dynamic lattice, the adaptive compression mode, which
26076+   defines the following behavior:
26077+
26078+   Compression is on: try to compress everything and turn
26079+   it off whenever a cluster is incompressible.
26080+
26081+   Compression is off: try to compress clusters with indexes
26082+   k * FACTOR (k = 0, 1, 2, ...) and turn it on if any of
26083+   them is compressible. If incompressible, increase FACTOR. */
26084+
26085+/* check if @index belongs to the one-dimensional lattice
26086+   of sparse factor @factor */
26087+static int is_on_lattice(cloff_t index, int factor)
26088+{
26089+ return (factor ? index % factor == 0: index == 0);
26090+}
26091+
26092+static int should_deflate_lattd(struct inode * inode, cloff_t index)
26093+{
26094+ return should_deflate_common(inode, index) ||
26095+ is_on_lattice(index,
26096+ get_lattice_factor
26097+ (cryptcompress_inode_data(inode)));
26098+}
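For example, with a lattice factor of 2 the mode trial-compresses clusters 0, 2, 4, ...; each failed trial doubles the factor, thinning out the lattice. A standalone sketch (factor bounds are illustrative, not the kernel's MIN/MAX_LATTICE_FACTOR):

#include <stdio.h>

/* same test as is_on_lattice() above */
static int on_lattice(unsigned long index, int factor)
{
	return factor ? index % factor == 0 : index == 0;
}

int main(void)
{
	int factor;
	for (factor = 2; factor <= 16; factor <<= 1) {
		unsigned long idx;
		printf("factor %2d:", factor);
		for (idx = 0; idx < 32; idx++)
			if (on_lattice(idx, factor))
				printf(" %lu", idx);
		putchar('\n');
	}
	return 0;
}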
26099+
26100+/* compression mode_plugins */
26101+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26102+ [NONE_COMPRESSION_MODE_ID] = {
26103+ .h = {
26104+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26105+ .id = NONE_COMPRESSION_MODE_ID,
26106+ .pops = NULL,
26107+ .label = "none",
26108+		.desc = "Compress nothing",
26109+ .linkage = {NULL, NULL}
26110+ },
26111+ .should_deflate = should_deflate_none,
26112+ .accept_hook = NULL,
26113+ .discard_hook = NULL
26114+ },
26115+ /* Check-on-dynamic-lattice adaptive compression mode */
26116+ [LATTD_COMPRESSION_MODE_ID] = {
26117+ .h = {
26118+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26119+ .id = LATTD_COMPRESSION_MODE_ID,
26120+ .pops = NULL,
26121+ .label = "lattd",
26122+ .desc = "Check on dynamic lattice",
26123+ .linkage = {NULL, NULL}
26124+ },
26125+ .should_deflate = should_deflate_lattd,
26126+ .accept_hook = accept_hook_lattd,
26127+ .discard_hook = discard_hook_lattd
26128+ },
26129+ /* Check-ultimately compression mode:
26130+ Turn off compression forever as soon as we meet
26131+ incompressible data */
26132+ [ULTIM_COMPRESSION_MODE_ID] = {
26133+ .h = {
26134+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26135+		.id = ULTIM_COMPRESSION_MODE_ID,
26136+		.pops = NULL,
26137+ .label = "ultim",
26138+ .desc = "Check ultimately",
26139+ .linkage = {NULL, NULL}
26140+ },
26141+ .should_deflate = should_deflate_common,
26142+ .accept_hook = NULL,
26143+	.discard_hook = discard_hook_ultim
26144+	},
26145+	/* Force-to-compress-everything compression mode */
26146+ [FORCE_COMPRESSION_MODE_ID] = {
26147+ .h = {
26148+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26149+ .id = FORCE_COMPRESSION_MODE_ID,
26150+ .pops = NULL,
26151+ .label = "force",
26152+		.desc = "Force to compress everything",
26153+ .linkage = {NULL, NULL}
26154+ },
26155+ .should_deflate = NULL,
26156+ .accept_hook = NULL,
26157+ .discard_hook = NULL
26158+ },
26159+ /* Convert-to-extent compression mode.
26160+ In this mode items will be converted to extents and management
26161+	   will be passed to the (classic) unix file plugin as soon as ->write()
26162+ detects that the first complete logical cluster (of index #0) is
26163+ incompressible. */
26164+ [CONVX_COMPRESSION_MODE_ID] = {
26165+ .h = {
26166+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26167+		.id = CONVX_COMPRESSION_MODE_ID,
26168+		.pops = NULL,
26169+ .label = "conv",
26170+ .desc = "Convert to extent",
26171+ .linkage = {NULL, NULL}
26172+ },
26173+	.should_deflate = should_deflate_common,
26174+ .accept_hook = NULL,
26175+ .discard_hook = NULL
26176+ }
26177+};
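All hook slots in these tables follow one convention: a NULL handler means "use the default" (e.g. the force mode's NULL ->should_deflate compresses every cluster). A minimal sketch of that table-driven dispatch, with illustrative names rather than kernel symbols:

#include <stdio.h>

struct mode {
	const char *label;
	/* NULL hook means "no heuristic": compress unconditionally */
	int (*should_deflate)(unsigned long index);
};

static int deflate_never(unsigned long index)
{
	(void)index;
	return 0;
}

static const struct mode modes[] = {
	{ "none",  deflate_never },
	{ "force", NULL },
};

static int should_compress(const struct mode *m, unsigned long index)
{
	return m->should_deflate ? m->should_deflate(index) : 1;
}

int main(void)
{
	unsigned i;
	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++)
		printf("%-5s: cluster 0 -> %d\n", modes[i].label,
		       should_compress(&modes[i], 0));
	return 0;
}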
26178+
26179+/*
26180+ Local variables:
26181+ c-indentation-style: "K&R"
26182+ mode-name: "LC"
26183+ c-basic-offset: 8
26184+ tab-width: 8
26185+ fill-column: 120
26186+ scroll-step: 1
26187+ End:
26188+*/
26189diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/lzoconf.h linux-2.6.22/fs/reiser4/plugin/compress/lzoconf.h
26190--- linux-2.6.22.orig/fs/reiser4/plugin/compress/lzoconf.h 1970-01-01 03:00:00.000000000 +0300
26191+++ linux-2.6.22/fs/reiser4/plugin/compress/lzoconf.h 2007-07-29 00:25:34.896701653 +0400
26192@@ -0,0 +1,216 @@
26193+/* lzoconf.h -- configuration for the LZO real-time data compression library
26194+   adapted for the reiser4 compression transform plugin.
26195+
26196+ This file is part of the LZO real-time data compression library
26197+ and not included in any proprietary licenses of reiser4.
26198+
26199+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26200+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26201+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26202+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26203+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26204+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26205+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26206+ All Rights Reserved.
26207+
26208+ The LZO library is free software; you can redistribute it and/or
26209+ modify it under the terms of the GNU General Public License as
26210+ published by the Free Software Foundation; either version 2 of
26211+ the License, or (at your option) any later version.
26212+
26213+ The LZO library is distributed in the hope that it will be useful,
26214+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26215+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26216+ GNU General Public License for more details.
26217+
26218+ You should have received a copy of the GNU General Public License
26219+ along with the LZO library; see the file COPYING.
26220+ If not, write to the Free Software Foundation, Inc.,
26221+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26222+
26223+ Markus F.X.J. Oberhumer
26224+ <markus@oberhumer.com>
26225+ http://www.oberhumer.com/opensource/lzo/
26226+ */
26227+
26228+#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26229+
26230+#ifndef __LZOCONF_H
26231+#define __LZOCONF_H
26232+
26233+#define LZO_VERSION 0x1080
26234+#define LZO_VERSION_STRING "1.08"
26235+#define LZO_VERSION_DATE "Jul 12 2002"
26236+
26237+/* internal Autoconf configuration file - only used when building LZO */
26238+
26239+/***********************************************************************
26240+// LZO requires a conforming <limits.h>
26241+************************************************************************/
26242+
26243+#define CHAR_BIT 8
26244+#define USHRT_MAX 0xffff
26245+
26246+/* workaround a cpp bug under hpux 10.20 */
26247+#define LZO_0xffffffffL 4294967295ul
26248+
26249+/***********************************************************************
26250+// architecture defines
26251+************************************************************************/
26252+
26253+#if !defined(__LZO_i386)
26254+#  if defined(__i386__) || defined(__386__) || defined(_M_IX86)
26255+#    define __LZO_i386
26256+# endif
26257+#endif
26258+
26259+/* memory checkers */
26260+#if !defined(__LZO_CHECKER)
26261+# if defined(__BOUNDS_CHECKING_ON)
26262+# define __LZO_CHECKER
26263+# elif defined(__CHECKER__)
26264+# define __LZO_CHECKER
26265+# elif defined(__INSURE__)
26266+# define __LZO_CHECKER
26267+# elif defined(__PURIFY__)
26268+# define __LZO_CHECKER
26269+# endif
26270+#endif
26271+
26272+/***********************************************************************
26273+// integral and pointer types
26274+************************************************************************/
26275+
26276+/* Integral types with 32 bits or more */
26277+#if !defined(LZO_UINT32_MAX)
26278+# if (UINT_MAX >= LZO_0xffffffffL)
26279+ typedef unsigned int lzo_uint32;
26280+ typedef int lzo_int32;
26281+# define LZO_UINT32_MAX UINT_MAX
26282+# define LZO_INT32_MAX INT_MAX
26283+# define LZO_INT32_MIN INT_MIN
26284+# elif (ULONG_MAX >= LZO_0xffffffffL)
26285+ typedef unsigned long lzo_uint32;
26286+ typedef long lzo_int32;
26287+# define LZO_UINT32_MAX ULONG_MAX
26288+# define LZO_INT32_MAX LONG_MAX
26289+# define LZO_INT32_MIN LONG_MIN
26290+# else
26291+# error "lzo_uint32"
26292+# endif
26293+#endif
26294+
26295+/* lzo_uint is used like size_t */
26296+#if !defined(LZO_UINT_MAX)
26297+# if (UINT_MAX >= LZO_0xffffffffL)
26298+ typedef unsigned int lzo_uint;
26299+ typedef int lzo_int;
26300+# define LZO_UINT_MAX UINT_MAX
26301+# define LZO_INT_MAX INT_MAX
26302+# define LZO_INT_MIN INT_MIN
26303+# elif (ULONG_MAX >= LZO_0xffffffffL)
26304+ typedef unsigned long lzo_uint;
26305+ typedef long lzo_int;
26306+# define LZO_UINT_MAX ULONG_MAX
26307+# define LZO_INT_MAX LONG_MAX
26308+# define LZO_INT_MIN LONG_MIN
26309+# else
26310+# error "lzo_uint"
26311+# endif
26312+#endif
26313+
26314+ typedef int lzo_bool;
26315+
26316+/***********************************************************************
26317+// memory models
26318+************************************************************************/
26319+
26320+/* Memory model that allows accessing memory at offsets of lzo_uint. */
26321+#if !defined(__LZO_MMODEL)
26322+# if (LZO_UINT_MAX <= UINT_MAX)
26323+# define __LZO_MMODEL
26324+# else
26325+# error "__LZO_MMODEL"
26326+# endif
26327+#endif
26328+
26329+/* no typedef here because of const-pointer issues */
26330+#define lzo_byte unsigned char __LZO_MMODEL
26331+#define lzo_bytep unsigned char __LZO_MMODEL *
26332+#define lzo_charp char __LZO_MMODEL *
26333+#define lzo_voidp void __LZO_MMODEL *
26334+#define lzo_shortp short __LZO_MMODEL *
26335+#define lzo_ushortp unsigned short __LZO_MMODEL *
26336+#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26337+#define lzo_int32p lzo_int32 __LZO_MMODEL *
26338+#define lzo_uintp lzo_uint __LZO_MMODEL *
26339+#define lzo_intp lzo_int __LZO_MMODEL *
26340+#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26341+#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26342+
26343+#ifndef lzo_sizeof_dict_t
26344+# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26345+#endif
26346+
26347+typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26348+ lzo_byte * dst, lzo_uintp dst_len,
26349+ lzo_voidp wrkmem);
44254afd 26350+
26351+
26352+/***********************************************************************
26353+// error codes and prototypes
26354+************************************************************************/
26355+
26356+/* Error codes for the compression/decompression functions. Negative
26357+ * values are errors, positive values will be used for special but
26358+ * normal events.
26359+ */
26360+#define LZO_E_OK 0
26361+#define LZO_E_ERROR (-1)
26362+#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26363+#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26364+#define LZO_E_INPUT_OVERRUN (-4)
26365+#define LZO_E_OUTPUT_OVERRUN (-5)
26366+#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26367+#define LZO_E_EOF_NOT_FOUND (-7)
26368+#define LZO_E_INPUT_NOT_CONSUMED (-8)
26369+
26370+/* lzo_init() should be the first function you call.
26371+ * Check the return code !
26372+ *
26373+ * lzo_init() is a macro to allow checking that the library and the
26374+ * compiler's view of various types are consistent.
26375+ */
26376+#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26377+ (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26378+ (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26379+ (int)sizeof(lzo_compress_t))
26380+ extern int __lzo_init2(unsigned, int, int, int, int, int, int,
26381+ int, int, int);
26382+
26383+/* checksum functions */
26384+extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf,
26385+ lzo_uint _len);
26386+/* misc. */
26387+ typedef union {
26388+ lzo_bytep p;
26389+ lzo_uint u;
26390+ } __lzo_pu_u;
26391+ typedef union {
26392+ lzo_bytep p;
26393+ lzo_uint32 u32;
26394+ } __lzo_pu32_u;
26395+ typedef union {
26396+ void *vp;
26397+ lzo_bytep bp;
26398+ lzo_uint32 u32;
26399+ long l;
26400+ } lzo_align_t;
26401+
26402+#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26403+ ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26404+
26405+/* deprecated - only for backward compatibility */
26406+#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26407+
26408+#endif /* already included */
26409diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.22/fs/reiser4/plugin/compress/Makefile
26410--- linux-2.6.22.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 03:00:00.000000000 +0300
26411+++ linux-2.6.22/fs/reiser4/plugin/compress/Makefile 2007-07-29 00:25:34.896701653 +0400
26412@@ -0,0 +1,6 @@
26413+obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26414+
26415+compress_plugins-objs := \
26416+ compress.o \
26417+ minilzo.o \
26418+ compress_mode.o
26419diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.c linux-2.6.22/fs/reiser4/plugin/compress/minilzo.c
26420--- linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.c 1970-01-01 03:00:00.000000000 +0300
26421+++ linux-2.6.22/fs/reiser4/plugin/compress/minilzo.c 2007-07-29 00:25:34.900702689 +0400
26422@@ -0,0 +1,1967 @@
26423+/* minilzo.c -- mini subset of the LZO real-time data compression library
26424+   adapted for the reiser4 compression transform plugin.
26425+
26426+ This file is part of the LZO real-time data compression library
26427+ and not included in any proprietary licenses of reiser4.
26428+
26429+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26430+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26431+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26432+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26433+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26434+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26435+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26436+ All Rights Reserved.
26437+
26438+ The LZO library is free software; you can redistribute it and/or
26439+ modify it under the terms of the GNU General Public License as
26440+ published by the Free Software Foundation; either version 2 of
26441+ the License, or (at your option) any later version.
26442+
26443+ The LZO library is distributed in the hope that it will be useful,
26444+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26445+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26446+ GNU General Public License for more details.
26447+
26448+ You should have received a copy of the GNU General Public License
26449+ along with the LZO library; see the file COPYING.
26450+ If not, write to the Free Software Foundation, Inc.,
26451+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26452+
26453+ Markus F.X.J. Oberhumer
26454+ <markus@oberhumer.com>
26455+ http://www.oberhumer.com/opensource/lzo/
26456+ */
26457+
26458+/*
26459+ * NOTE:
26460+ * the full LZO package can be found at
26461+ * http://www.oberhumer.com/opensource/lzo/
26462+ */
26463+
26464+#include "../../debug.h" /* for reiser4 assert macro -edward */
26465+
26466+#define __LZO_IN_MINILZO
26467+#define LZO_BUILD
26468+
26469+#include "minilzo.h"
26470+
26471+#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26472+# error "version mismatch in miniLZO source files"
26473+#endif
26474+
26475+#ifndef __LZO_CONF_H
26476+#define __LZO_CONF_H
26477+
26478+# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26479+# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26480+
26481+# define HAVE_MEMCMP
26482+# define HAVE_MEMCPY
26483+# define HAVE_MEMMOVE
26484+# define HAVE_MEMSET
26485+
26486+#undef NDEBUG
26487+#if !defined(LZO_DEBUG)
26488+# define NDEBUG
26489+#endif
26490+#if defined(LZO_DEBUG) || !defined(NDEBUG)
26491+# if !defined(NO_STDIO_H)
26492+# include <stdio.h>
26493+# endif
26494+#endif
44254afd
MT
26495+
26496+#if !defined(LZO_COMPILE_TIME_ASSERT)
26497+# define LZO_COMPILE_TIME_ASSERT(expr) \
26498+ { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26499+#endif
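LZO_COMPILE_TIME_ASSERT() is the classic negative-array-size trick: the typedef is legal only when the asserted expression is non-zero, so a false assertion fails at compile time rather than at run time. Reduced to a standalone sketch:

#include <stdio.h>

#define CT_ASSERT(expr) \
	{ typedef int ct_assert_fail[1 - 2 * !(expr)]; }

int main(void)
{
	CT_ASSERT(sizeof(char) == 1);	/* array size 1: compiles */
	/* CT_ASSERT(sizeof(char) == 2): array size -1, a compile error */
	puts("compile-time checks passed");
	return 0;
}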
26500+
26501+#if !defined(LZO_UNUSED)
26502+# if 1
26503+# define LZO_UNUSED(var) ((void)&var)
26504+# elif 0
26505+# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26506+# else
26507+# define LZO_UNUSED(parm) (parm = parm)
26508+# endif
26509+#endif
26510+
44254afd
MT
26511+#if defined(NO_MEMCMP)
26512+# undef HAVE_MEMCMP
26513+#endif
26514+
26515+#if !defined(HAVE_MEMSET)
26516+# undef memset
26517+# define memset lzo_memset
26518+#endif
26519+
26520+# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26521+
26522+#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26523+#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26524+#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26525+#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26526+
26527+#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26528+
26529+#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26530+
26531+#define LZO_SIZE(bits) (1u << (bits))
26532+#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26533+
26534+#define LZO_LSIZE(bits) (1ul << (bits))
26535+#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26536+
26537+#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26538+#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26539+
26540+#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26541+#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26542+
26543+#if !defined(SIZEOF_UNSIGNED)
26544+# if (UINT_MAX == 0xffff)
26545+# define SIZEOF_UNSIGNED 2
26546+# elif (UINT_MAX == LZO_0xffffffffL)
26547+# define SIZEOF_UNSIGNED 4
26548+# elif (UINT_MAX >= LZO_0xffffffffL)
26549+# define SIZEOF_UNSIGNED 8
26550+# else
26551+# error "SIZEOF_UNSIGNED"
26552+# endif
26553+#endif
26554+
26555+#if !defined(SIZEOF_UNSIGNED_LONG)
26556+# if (ULONG_MAX == LZO_0xffffffffL)
26557+# define SIZEOF_UNSIGNED_LONG 4
26558+# elif (ULONG_MAX >= LZO_0xffffffffL)
26559+# define SIZEOF_UNSIGNED_LONG 8
26560+# else
26561+# error "SIZEOF_UNSIGNED_LONG"
26562+# endif
26563+#endif
26564+
26565+#if !defined(SIZEOF_SIZE_T)
26566+# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26567+#endif
26568+#if !defined(SIZE_T_MAX)
26569+# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26570+#endif
26571+
26572+#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26573+# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26574+# define LZO_UNALIGNED_OK_2
26575+# endif
26576+# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26577+# define LZO_UNALIGNED_OK_4
26578+# endif
26579+#endif
26580+
26581+#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26582+# if !defined(LZO_UNALIGNED_OK)
26583+# define LZO_UNALIGNED_OK
26584+# endif
26585+#endif
26586+
26587+#if defined(__LZO_NO_UNALIGNED)
26588+# undef LZO_UNALIGNED_OK
26589+# undef LZO_UNALIGNED_OK_2
26590+# undef LZO_UNALIGNED_OK_4
26591+#endif
26592+
26593+#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26594+# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26595+#endif
26596+#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26597+# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26598+#endif
26599+
26600+#if defined(__LZO_NO_ALIGNED)
26601+# undef LZO_ALIGNED_OK_4
26602+#endif
26603+
26604+#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26605+# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26606+#endif
26607+
26608+#define LZO_LITTLE_ENDIAN 1234
26609+#define LZO_BIG_ENDIAN 4321
26610+#define LZO_PDP_ENDIAN 3412
26611+
26612+#if !defined(LZO_BYTE_ORDER)
26613+# if defined(MFX_BYTE_ORDER)
26614+# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26615+# elif defined(__LZO_i386)
26616+# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26617+# elif defined(BYTE_ORDER)
26618+# define LZO_BYTE_ORDER BYTE_ORDER
26619+# elif defined(__BYTE_ORDER)
26620+# define LZO_BYTE_ORDER __BYTE_ORDER
26621+# endif
26622+#endif
26623+
26624+#if defined(LZO_BYTE_ORDER)
26625+# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26626+ (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26627+# error "invalid LZO_BYTE_ORDER"
26628+# endif
26629+#endif
26630+
26631+#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
26632+# error "LZO_BYTE_ORDER is not defined"
26633+#endif
26634+
26635+#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
26636+
26637+#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
26638+# if defined(__GNUC__) && defined(__i386__)
26639+# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
26640+# define LZO_OPTIMIZE_GNUC_i386
26641+# endif
26642+# endif
26643+#endif
26644+
26645+extern const lzo_uint32 _lzo_crc32_table[256];
26646+
26647+#define _LZO_STRINGIZE(x) #x
26648+#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
26649+
26650+#define _LZO_CONCAT2(a,b) a ## b
26651+#define _LZO_CONCAT3(a,b,c) a ## b ## c
26652+#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
26653+#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
26654+
26655+#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
26656+#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
26657+#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
26658+#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
26659+
26660+#ifndef __LZO_PTR_H
26661+#define __LZO_PTR_H
26662+
26663+#if !defined(lzo_ptrdiff_t)
26664+# if (UINT_MAX >= LZO_0xffffffffL)
26665+typedef ptrdiff_t lzo_ptrdiff_t;
26666+# else
26667+typedef long lzo_ptrdiff_t;
26668+# endif
26669+#endif
26670+
26671+#if !defined(__LZO_HAVE_PTR_T)
26672+# if defined(lzo_ptr_t)
26673+# define __LZO_HAVE_PTR_T
26674+# endif
26675+#endif
26676+#if !defined(__LZO_HAVE_PTR_T)
26677+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
26678+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
26679+typedef unsigned long lzo_ptr_t;
26680+typedef long lzo_sptr_t;
26681+# define __LZO_HAVE_PTR_T
26682+# endif
26683+# endif
26684+#endif
26685+#if !defined(__LZO_HAVE_PTR_T)
26686+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
26687+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
26688+typedef unsigned int lzo_ptr_t;
26689+typedef int lzo_sptr_t;
26690+# define __LZO_HAVE_PTR_T
26691+# endif
26692+# endif
26693+#endif
26694+#if !defined(__LZO_HAVE_PTR_T)
26695+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
26696+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
26697+typedef unsigned short lzo_ptr_t;
26698+typedef short lzo_sptr_t;
26699+# define __LZO_HAVE_PTR_T
26700+# endif
26701+# endif
26702+#endif
26703+#if !defined(__LZO_HAVE_PTR_T)
26704+# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
26705+# error "no suitable type for lzo_ptr_t"
26706+# else
26707+typedef unsigned long lzo_ptr_t;
26708+typedef long lzo_sptr_t;
26709+# define __LZO_HAVE_PTR_T
26710+# endif
26711+#endif
26712+
26713+#define PTR(a) ((lzo_ptr_t) (a))
26714+#define PTR_LINEAR(a) PTR(a)
26715+#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
26716+#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
26717+#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
26718+#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
26719+
26720+#define PTR_LT(a,b) (PTR(a) < PTR(b))
26721+#define PTR_GE(a,b) (PTR(a) >= PTR(b))
26722+#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
26723+#define pd(a,b) ((lzo_uint) ((a)-(b)))
26724+
26725+typedef union {
26726+ char a_char;
26727+ unsigned char a_uchar;
26728+ short a_short;
26729+ unsigned short a_ushort;
26730+ int a_int;
26731+ unsigned int a_uint;
26732+ long a_long;
26733+ unsigned long a_ulong;
26734+ lzo_int a_lzo_int;
26735+ lzo_uint a_lzo_uint;
26736+ lzo_int32 a_lzo_int32;
26737+ lzo_uint32 a_lzo_uint32;
26738+ ptrdiff_t a_ptrdiff_t;
26739+ lzo_ptrdiff_t a_lzo_ptrdiff_t;
26740+ lzo_ptr_t a_lzo_ptr_t;
26741+ lzo_voidp a_lzo_voidp;
26742+ void *a_void_p;
26743+ lzo_bytep a_lzo_bytep;
26744+ lzo_bytepp a_lzo_bytepp;
26745+ lzo_uintp a_lzo_uintp;
26746+ lzo_uint *a_lzo_uint_p;
26747+ lzo_uint32p a_lzo_uint32p;
26748+ lzo_uint32 *a_lzo_uint32_p;
26749+ unsigned char *a_uchar_p;
26750+ char *a_char_p;
26751+} lzo_full_align_t;
26752+
26753+#endif
26754+#define LZO_DETERMINISTIC
26755+#define LZO_DICT_USE_PTR
26756+# define lzo_dict_t const lzo_bytep
26757+# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
26758+#if !defined(lzo_moff_t)
26759+#define lzo_moff_t lzo_uint
26760+#endif
26761+#endif
26762+static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
26763+{
26764+	return PTR_LINEAR(ptr);
26765+}
26766+
26767+static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
26768+{
26769+ lzo_ptr_t p, s, n;
26770+
26771+ assert("lzo-01", size > 0);
26772+
26773+ p = __lzo_ptr_linear(ptr);
26774+ s = (lzo_ptr_t) (size - 1);
26775+ n = (((p + s) / size) * size) - p;
26776+
26777+ assert("lzo-02", (long)n >= 0);
26778+ assert("lzo-03", n <= s);
26779+
26780+ return (unsigned)n;
26781+}
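__lzo_align_gap() returns how many bytes must be added to a pointer to round it up to the next multiple of size; note the formula needs no power-of-two assumption. An equivalent userspace sketch:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* illustrative reimplementation of __lzo_align_gap() */
static unsigned align_gap(const void *ptr, size_t size)
{
	uintptr_t p = (uintptr_t)ptr;
	uintptr_t s = size - 1;
	return (unsigned)((((p + s) / size) * size) - p);
}

int main(void)
{
	char buf[32];
	int i;
	for (i = 0; i < 4; i++)
		printf("buf+%d: gap to 8-byte boundary = %u\n",
		       i, align_gap(buf + i, 8));
	return 0;
}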
26782+
26783+#ifndef __LZO_UTIL_H
26784+#define __LZO_UTIL_H
26785+
26786+#ifndef __LZO_CONF_H
26787+#endif
26788+
26789+#if 1 && defined(HAVE_MEMCPY)
26790+#define MEMCPY8_DS(dest,src,len) \
26791+ memcpy(dest,src,len); \
26792+ dest += len; \
26793+ src += len
26794+#endif
26795+
26796+#if !defined(MEMCPY8_DS)
26797+
26798+#define MEMCPY8_DS(dest,src,len) \
26799+ { register lzo_uint __l = (len) / 8; \
26800+ do { \
26801+ *dest++ = *src++; \
26802+ *dest++ = *src++; \
26803+ *dest++ = *src++; \
26804+ *dest++ = *src++; \
26805+ *dest++ = *src++; \
26806+ *dest++ = *src++; \
26807+ *dest++ = *src++; \
26808+ *dest++ = *src++; \
26809+ } while (--__l > 0); }
26810+
26811+#endif
26812+
26813+#define MEMCPY_DS(dest,src,len) \
26814+ do *dest++ = *src++; \
26815+ while (--len > 0)
26816+
26817+#define MEMMOVE_DS(dest,src,len) \
26818+ do *dest++ = *src++; \
26819+ while (--len > 0)
26820+
26821+#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
26822+
26823+#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
26824+
26825+#else
26826+
26827+#define BZERO8_PTR(s,l,n) \
26828+ lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
26829+
26830+#endif
26831+#endif
26832+
26833+/* If you use the LZO library in a product, you *must* keep this
26834+ * copyright string in the executable of your product.
26835+ */
26836+
26837+static const lzo_byte __lzo_copyright[] =
26838+#if !defined(__LZO_IN_MINILZO)
26839+ LZO_VERSION_STRING;
26840+#else
26841+ "\n\n\n"
26842+ "LZO real-time data compression library.\n"
26843+ "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
26844+ "<markus.oberhumer@jk.uni-linz.ac.at>\n"
26845+ "http://www.oberhumer.com/opensource/lzo/\n"
26846+ "\n"
26847+ "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
26848+ "LZO build date: " __DATE__ " " __TIME__ "\n\n"
26849+ "LZO special compilation options:\n"
26850+#ifdef __cplusplus
26851+ " __cplusplus\n"
26852+#endif
26853+#if defined(__PIC__)
26854+ " __PIC__\n"
26855+#elif defined(__pic__)
26856+ " __pic__\n"
26857+#endif
26858+#if (UINT_MAX < LZO_0xffffffffL)
26859+ " 16BIT\n"
26860+#endif
26861+#if defined(__LZO_STRICT_16BIT)
26862+ " __LZO_STRICT_16BIT\n"
26863+#endif
26864+#if (UINT_MAX > LZO_0xffffffffL)
26865+ " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
26866+#endif
26867+#if (ULONG_MAX > LZO_0xffffffffL)
26868+ " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
26869+#endif
26870+#if defined(LZO_BYTE_ORDER)
26871+ " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
26872+#endif
26873+#if defined(LZO_UNALIGNED_OK_2)
26874+ " LZO_UNALIGNED_OK_2\n"
26875+#endif
26876+#if defined(LZO_UNALIGNED_OK_4)
26877+ " LZO_UNALIGNED_OK_4\n"
26878+#endif
26879+#if defined(LZO_ALIGNED_OK_4)
26880+ " LZO_ALIGNED_OK_4\n"
26881+#endif
26882+#if defined(LZO_DICT_USE_PTR)
26883+ " LZO_DICT_USE_PTR\n"
26884+#endif
26885+#if defined(__LZO_QUERY_COMPRESS)
26886+ " __LZO_QUERY_COMPRESS\n"
26887+#endif
26888+#if defined(__LZO_QUERY_DECOMPRESS)
26889+ " __LZO_QUERY_DECOMPRESS\n"
26890+#endif
26891+#if defined(__LZO_IN_MINILZO)
26892+ " __LZO_IN_MINILZO\n"
26893+#endif
26894+ "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
26895+#if defined(__GNUC__) && defined(__VERSION__)
26896+ " by gcc " __VERSION__
26897+#elif defined(__BORLANDC__)
26898+ " by Borland C " _LZO_MEXPAND(__BORLANDC__)
26899+#elif defined(_MSC_VER)
26900+ " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
26901+#elif defined(__PUREC__)
26902+ " by Pure C " _LZO_MEXPAND(__PUREC__)
26903+#elif defined(__SC__)
26904+ " by Symantec C " _LZO_MEXPAND(__SC__)
26905+#elif defined(__TURBOC__)
26906+ " by Turbo C " _LZO_MEXPAND(__TURBOC__)
26907+#elif defined(__WATCOMC__)
26908+ " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
26909+#endif
26910+ " $\n"
26911+ "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
26912+#endif
26913+
26914+#define LZO_BASE 65521u
26915+#define LZO_NMAX 5552
26916+
26917+#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
26918+#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
26919+#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
26920+#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
26921+#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
26922+
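These constants and unrolled steps are Adler-32, the checksum the compression plugins install as reiser4_adler32: s1 sums the bytes and s2 sums the running s1, both modulo LZO_BASE = 65521; LZO_NMAX = 5552 is the longest block for which the 32-bit sums cannot overflow before reduction. A compact reference version, as a sketch:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* call with adler = 1 for a fresh checksum */
static uint32_t adler32(uint32_t adler, const unsigned char *buf, size_t len)
{
	uint32_t s1 = adler & 0xffff;
	uint32_t s2 = (adler >> 16) & 0xffff;

	while (len > 0) {
		size_t k = len < 5552 ? len : 5552;	/* LZO_NMAX */
		len -= k;
		while (k--) {
			s1 += *buf++;			/* LZO_DO1 */
			s2 += s1;
		}
		s1 %= 65521u;				/* LZO_BASE */
		s2 %= 65521u;
	}
	return (s2 << 16) | s1;
}

int main(void)
{
	const unsigned char msg[] = "Wikipedia";
	printf("%08x\n", adler32(1, msg, sizeof(msg) - 1));	/* 11e60398 */
	return 0;
}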
26923+# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
26924+# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
26925+
26926+#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
26927+
26928+static lzo_bool schedule_insns_bug(void);
26929+static lzo_bool strength_reduce_bug(int *);
26930+
26931+# define __lzo_assert(x) ((x) ? 1 : 0)
26932+
26933+#undef COMPILE_TIME_ASSERT
26934+
26935+# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
26936+
26937+static lzo_bool basic_integral_check(void)
26938+{
26939+ lzo_bool r = 1;
26940+
26941+ COMPILE_TIME_ASSERT(CHAR_BIT == 8);
26942+ COMPILE_TIME_ASSERT(sizeof(char) == 1);
26943+ COMPILE_TIME_ASSERT(sizeof(short) >= 2);
26944+ COMPILE_TIME_ASSERT(sizeof(long) >= 4);
26945+ COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
26946+ COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
26947+
26948+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
26949+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
26950+
26951+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
26952+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
26953+#if defined(__LZO_STRICT_16BIT)
26954+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
26955+#else
26956+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
26957+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
26958+#endif
26959+
26960+#if (USHRT_MAX == 65535u)
26961+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
26962+#elif (USHRT_MAX == LZO_0xffffffffL)
26963+ COMPILE_TIME_ASSERT(sizeof(short) == 4);
26964+#elif (USHRT_MAX >= LZO_0xffffffffL)
26965+ COMPILE_TIME_ASSERT(sizeof(short) > 4);
26966+#endif
26967+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
26968+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
26969+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
26970+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
26971+ COMPILE_TIME_ASSERT(IS_SIGNED(short));
26972+ COMPILE_TIME_ASSERT(IS_SIGNED(int));
26973+ COMPILE_TIME_ASSERT(IS_SIGNED(long));
26974+
26975+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
26976+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
26977+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
26978+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
26979+
26980+ COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
26981+ COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
26982+ COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
26983+ COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
26984+ COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
26985+ COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
26986+ LZO_UTYPE_MAX(sizeof(lzo_uint32)));
26987+ COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
26988+
26989+ r &= __lzo_assert(LZO_BYTE(257) == 1);
26990+
26991+ return r;
26992+}
26993+
26994+static lzo_bool basic_ptr_check(void)
26995+{
26996+ lzo_bool r = 1;
26997+
26998+ COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
26999+ COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
27000+
27001+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
27002+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
27003+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
27004+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
27005+
27006+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
27007+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
27008+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
27009+
27010+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
27011+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
27012+
27013+ COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
27014+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
27015+
27016+#if defined(SIZEOF_CHAR_P)
27017+ COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
27018+#endif
27019+#if defined(SIZEOF_PTRDIFF_T)
27020+ COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
27021+#endif
27022+
27023+ COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
27024+ COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
27025+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
27026+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
27027+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
27028+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
27029+
27030+ return r;
27031+}
27032+
27033+static lzo_bool ptr_check(void)
27034+{
27035+ lzo_bool r = 1;
27036+ int i;
27037+ char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
27038+ lzo_bytep wrkmem;
27039+ lzo_bytepp dict;
27040+ unsigned char x[4 * sizeof(lzo_full_align_t)];
27041+ long d;
27042+ lzo_full_align_t a;
27043+ lzo_full_align_t u;
27044+
27045+ for (i = 0; i < (int)sizeof(x); i++)
27046+ x[i] = LZO_BYTE(i);
27047+
27048+ wrkmem =
27049+ LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
27050+
27051+ u.a_lzo_bytep = wrkmem;
27052+ dict = u.a_lzo_bytepp;
27053+
27054+ d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
27055+ r &= __lzo_assert(d >= 0);
27056+ r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
27057+
27058+ memset(&a, 0, sizeof(a));
27059+ r &= __lzo_assert(a.a_lzo_voidp == NULL);
27060+
27061+ memset(&a, 0xff, sizeof(a));
27062+ r &= __lzo_assert(a.a_ushort == USHRT_MAX);
27063+ r &= __lzo_assert(a.a_uint == UINT_MAX);
27064+ r &= __lzo_assert(a.a_ulong == ULONG_MAX);
27065+ r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
27066+ r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
27067+
27068+ if (r == 1) {
27069+ for (i = 0; i < 8; i++)
27070+ r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
27071+ (const
27072+ lzo_voidp)(&wrkmem[i *
27073+ sizeof(lzo_byte
27074+ *)]));
27075+ }
27076+
27077+ memset(&a, 0, sizeof(a));
27078+ r &= __lzo_assert(a.a_char_p == NULL);
27079+ r &= __lzo_assert(a.a_lzo_bytep == NULL);
27080+ r &= __lzo_assert(NULL == (void *)0);
27081+ if (r == 1) {
27082+ for (i = 0; i < 10; i++)
27083+ dict[i] = wrkmem;
27084+ BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
27085+ r &= __lzo_assert(dict[0] == wrkmem);
27086+ for (i = 1; i < 9; i++)
27087+ r &= __lzo_assert(dict[i] == NULL);
27088+ r &= __lzo_assert(dict[9] == wrkmem);
27089+ }
27090+
27091+ if (r == 1) {
27092+ unsigned k = 1;
27093+ const unsigned n = (unsigned)sizeof(lzo_uint32);
27094+ lzo_byte *p0;
27095+ lzo_byte *p1;
27096+
27097+ k += __lzo_align_gap(&x[k], n);
27098+ p0 = (lzo_bytep) & x[k];
27099+#if defined(PTR_LINEAR)
27100+ r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27101+#else
27102+ r &= __lzo_assert(n == 4);
27103+ r &= __lzo_assert(PTR_ALIGNED_4(p0));
27104+#endif
27105+
27106+ r &= __lzo_assert(k >= 1);
27107+ p1 = (lzo_bytep) & x[1];
27108+ r &= __lzo_assert(PTR_GE(p0, p1));
27109+
27110+ r &= __lzo_assert(k < 1 + n);
27111+ p1 = (lzo_bytep) & x[1 + n];
27112+ r &= __lzo_assert(PTR_LT(p0, p1));
27113+
27114+ if (r == 1) {
27115+ lzo_uint32 v0, v1;
27116+
27117+ u.a_uchar_p = &x[k];
27118+ v0 = *u.a_lzo_uint32_p;
27119+ u.a_uchar_p = &x[k + n];
27120+ v1 = *u.a_lzo_uint32_p;
27121+
27122+ r &= __lzo_assert(v0 > 0);
27123+ r &= __lzo_assert(v1 > 0);
27124+ }
27125+ }
27126+
27127+ return r;
27128+}
27129+
27130+static int _lzo_config_check(void)
27131+{
27132+ lzo_bool r = 1;
27133+ int i;
27134+ union {
27135+ lzo_uint32 a;
27136+ unsigned short b;
27137+ lzo_uint32 aa[4];
27138+ unsigned char x[4 * sizeof(lzo_full_align_t)];
27139+	} u;
27140+
27141+ COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27142+ COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27143+ < 0);
27144+
27145+ r &= basic_integral_check();
27146+ r &= basic_ptr_check();
27147+ if (r != 1)
27148+ return LZO_E_ERROR;
27149+
27150+ u.a = 0;
27151+ u.b = 0;
27152+ for (i = 0; i < (int)sizeof(u.x); i++)
27153+ u.x[i] = LZO_BYTE(i);
27154+
27155+#if defined(LZO_BYTE_ORDER)
27156+ if (r == 1) {
27157+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27158+ lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27159+ unsigned short b = (unsigned short)(u.b & 0xffff);
27160+ r &= __lzo_assert(a == 0x03020100L);
27161+ r &= __lzo_assert(b == 0x0100);
27162+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27163+ lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27164+ unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27165+ r &= __lzo_assert(a == 0x00010203L);
27166+ r &= __lzo_assert(b == 0x0001);
27167+# else
27168+# error "invalid LZO_BYTE_ORDER"
27169+# endif
27170+ }
27171+#endif
27172+
27173+#if defined(LZO_UNALIGNED_OK_2)
27174+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
27175+ if (r == 1) {
27176+ unsigned short b[4];
27177+
27178+ for (i = 0; i < 4; i++)
27179+ b[i] = *(const unsigned short *)&u.x[i];
27180+
27181+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27182+ r &= __lzo_assert(b[0] == 0x0100);
27183+ r &= __lzo_assert(b[1] == 0x0201);
27184+ r &= __lzo_assert(b[2] == 0x0302);
27185+ r &= __lzo_assert(b[3] == 0x0403);
27186+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27187+ r &= __lzo_assert(b[0] == 0x0001);
27188+ r &= __lzo_assert(b[1] == 0x0102);
27189+ r &= __lzo_assert(b[2] == 0x0203);
27190+ r &= __lzo_assert(b[3] == 0x0304);
27191+# endif
27192+ }
27193+#endif
27194+
27195+#if defined(LZO_UNALIGNED_OK_4)
27196+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27197+ if (r == 1) {
27198+ lzo_uint32 a[4];
27199+
27200+ for (i = 0; i < 4; i++)
27201+ a[i] = *(const lzo_uint32 *)&u.x[i];
27202+
27203+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27204+ r &= __lzo_assert(a[0] == 0x03020100L);
27205+ r &= __lzo_assert(a[1] == 0x04030201L);
27206+ r &= __lzo_assert(a[2] == 0x05040302L);
27207+ r &= __lzo_assert(a[3] == 0x06050403L);
27208+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27209+ r &= __lzo_assert(a[0] == 0x00010203L);
27210+ r &= __lzo_assert(a[1] == 0x01020304L);
27211+ r &= __lzo_assert(a[2] == 0x02030405L);
27212+ r &= __lzo_assert(a[3] == 0x03040506L);
27213+# endif
27214+ }
27215+#endif
27216+
27217+#if defined(LZO_ALIGNED_OK_4)
27218+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27219+#endif
27220+
27221+ COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27222+
27223+ if (r == 1) {
27224+ r &= __lzo_assert(!schedule_insns_bug());
27225+ }
27226+
27227+ if (r == 1) {
27228+ static int x[3];
27229+ static unsigned xn = 3;
27230+ register unsigned j;
27231+
27232+ for (j = 0; j < xn; j++)
27233+ x[j] = (int)j - 3;
27234+ r &= __lzo_assert(!strength_reduce_bug(x));
27235+ }
27236+
27237+ if (r == 1) {
27238+ r &= ptr_check();
27239+ }
27240+
27241+ return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27242+}
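The byte-order portion of _lzo_config_check() reduces to: fill a union byte by byte, then read it back as a 32-bit word. As a standalone probe:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	union { uint32_t a; unsigned char x[4]; } u;
	int i;

	for (i = 0; i < 4; i++)
		u.x[i] = (unsigned char)i;

	if (u.a == 0x03020100u)
		puts("little-endian");
	else if (u.a == 0x00010203u)
		puts("big-endian");
	else
		puts("unusual byte order");
	return 0;
}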
27243+
27244+static lzo_bool schedule_insns_bug(void)
27245+{
27246+#if defined(__LZO_CHECKER)
27247+ return 0;
27248+#else
27249+ const int clone[] = { 1, 2, 0 };
27250+ const int *q;
27251+ q = clone;
27252+ return (*q) ? 0 : 1;
27253+#endif
27254+}
27255+
27256+static lzo_bool strength_reduce_bug(int *x)
27257+{
27258+ return x[0] != -3 || x[1] != -2 || x[2] != -1;
27259+}
27260+
27261+#undef COMPILE_TIME_ASSERT
27262+
27263+int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27264+ int s6, int s7, int s8, int s9)
27265+{
27266+ int r;
27267+
27268+ if (v == 0)
27269+ return LZO_E_ERROR;
27270+
27271+ r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27272+ (s2 == -1 || s2 == (int)sizeof(int)) &&
27273+ (s3 == -1 || s3 == (int)sizeof(long)) &&
27274+ (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27275+ (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27276+ (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27277+ (s7 == -1 || s7 == (int)sizeof(char *)) &&
27278+ (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27279+ (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27280+ if (!r)
27281+ return LZO_E_ERROR;
27282+
27283+ r = _lzo_config_check();
27284+ if (r != LZO_E_OK)
27285+ return r;
27286+
27287+ return r;
27288+}
27289+
27290+#define do_compress _lzo1x_1_do_compress
27291+
27292+#define LZO_NEED_DICT_H
27293+#define D_BITS 14
27294+#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27295+#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27296+
27297+#ifndef __LZO_CONFIG1X_H
27298+#define __LZO_CONFIG1X_H
27299+
27300+#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27301+# define LZO1X
27302+#endif
27303+
27304+#define LZO_EOF_CODE
27305+#undef LZO_DETERMINISTIC
27306+
27307+#define M1_MAX_OFFSET 0x0400
27308+#ifndef M2_MAX_OFFSET
27309+#define M2_MAX_OFFSET 0x0800
27310+#endif
27311+#define M3_MAX_OFFSET 0x4000
27312+#define M4_MAX_OFFSET 0xbfff
27313+
27314+#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27315+
27316+#define M1_MIN_LEN 2
27317+#define M1_MAX_LEN 2
27318+#define M2_MIN_LEN 3
27319+#ifndef M2_MAX_LEN
27320+#define M2_MAX_LEN 8
27321+#endif
27322+#define M3_MIN_LEN 3
27323+#define M3_MAX_LEN 33
27324+#define M4_MIN_LEN 3
27325+#define M4_MAX_LEN 9
27326+
27327+#define M1_MARKER 0
27328+#define M2_MARKER 64
27329+#define M3_MARKER 32
27330+#define M4_MARKER 16
27331+
27332+#ifndef MIN_LOOKAHEAD
27333+#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27334+#endif
27335+
27336+#if defined(LZO_NEED_DICT_H)
27337+
27338+#ifndef LZO_HASH
27339+#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27340+#endif
27341+#define DL_MIN_LEN M2_MIN_LEN
27342+
27343+#ifndef __LZO_DICT_H
27344+#define __LZO_DICT_H
27345+
27346+#if !defined(D_BITS) && defined(DBITS)
27347+# define D_BITS DBITS
27348+#endif
27349+#if !defined(D_BITS)
27350+# error "D_BITS is not defined"
27351+#endif
27352+#if (D_BITS < 16)
27353+# define D_SIZE LZO_SIZE(D_BITS)
27354+# define D_MASK LZO_MASK(D_BITS)
27355+#else
27356+# define D_SIZE LZO_USIZE(D_BITS)
27357+# define D_MASK LZO_UMASK(D_BITS)
27358+#endif
27359+#define D_HIGH ((D_MASK >> 1) + 1)
27360+
27361+#if !defined(DD_BITS)
27362+# define DD_BITS 0
27363+#endif
27364+#define DD_SIZE LZO_SIZE(DD_BITS)
27365+#define DD_MASK LZO_MASK(DD_BITS)
27366+
27367+#if !defined(DL_BITS)
27368+# define DL_BITS (D_BITS - DD_BITS)
27369+#endif
27370+#if (DL_BITS < 16)
27371+# define DL_SIZE LZO_SIZE(DL_BITS)
27372+# define DL_MASK LZO_MASK(DL_BITS)
27373+#else
27374+# define DL_SIZE LZO_USIZE(DL_BITS)
27375+# define DL_MASK LZO_UMASK(DL_BITS)
27376+#endif
27377+
27378+#if (D_BITS != DL_BITS + DD_BITS)
27379+# error "D_BITS does not match"
27380+#endif
27381+#if (D_BITS < 8 || D_BITS > 18)
27382+# error "invalid D_BITS"
27383+#endif
27384+#if (DL_BITS < 8 || DL_BITS > 20)
27385+# error "invalid DL_BITS"
27386+#endif
27387+#if (DD_BITS < 0 || DD_BITS > 6)
27388+# error "invalid DD_BITS"
27389+#endif
27390+
27391+#if !defined(DL_MIN_LEN)
27392+# define DL_MIN_LEN 3
27393+#endif
27394+#if !defined(DL_SHIFT)
27395+# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27396+#endif
27397+
27398+#define LZO_HASH_GZIP 1
27399+#define LZO_HASH_GZIP_INCREMENTAL 2
27400+#define LZO_HASH_LZO_INCREMENTAL_A 3
27401+#define LZO_HASH_LZO_INCREMENTAL_B 4
27402+
27403+#if !defined(LZO_HASH)
27404+# error "choose a hashing strategy"
27405+#endif
27406+
27407+#if (DL_MIN_LEN == 3)
27408+# define _DV2_A(p,shift1,shift2) \
27409+ (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27410+# define _DV2_B(p,shift1,shift2) \
27411+ (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27412+# define _DV3_B(p,shift1,shift2,shift3) \
27413+ ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27414+#elif (DL_MIN_LEN == 2)
27415+# define _DV2_A(p,shift1,shift2) \
27416+ (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27417+# define _DV2_B(p,shift1,shift2) \
27418+ (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27419+#else
27420+# error "invalid DL_MIN_LEN"
27421+#endif
27422+#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27423+#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27424+#define DA2(p,s1,s2) \
27425+ (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27426+#define DS2(p,s1,s2) \
27427+ (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27428+#define DX2(p,s1,s2) \
27429+ (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27430+#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27431+#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27432+#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27433+#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27434+#define DM(v) DMS(v,0)
27435+
27436+#if (LZO_HASH == LZO_HASH_GZIP)
27437+# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27438+
27439+#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27440+# define __LZO_HASH_INCREMENTAL
27441+# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27442+# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27443+# define _DINDEX(dv,p) (dv)
27444+# define DVAL_LOOKAHEAD DL_MIN_LEN
27445+
27446+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27447+# define __LZO_HASH_INCREMENTAL
27448+# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27449+# define DVAL_NEXT(dv,p) \
27450+ dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27451+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27452+# define DVAL_LOOKAHEAD DL_MIN_LEN
27453+
27454+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27455+# define __LZO_HASH_INCREMENTAL
27456+# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27457+# define DVAL_NEXT(dv,p) \
27458+ dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27459+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27460+# define DVAL_LOOKAHEAD DL_MIN_LEN
27461+
27462+#else
27463+# error "choose a hashing strategy"
27464+#endif
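With DL_MIN_LEN == 3, the LZO_HASH_LZO_INCREMENTAL_B strategy selected here folds three input bytes into dv (_DV2_B with two 5-bit shifts) and spreads them with the multiplier 0x9f5f; masked to the dictionary width, that picks the hash slot, so repeated trigrams land in the same slot. A standalone sketch (D_BITS = 14 mirrors the #define earlier in this file):

#include <stdio.h>
#include <stdint.h>

#define DEMO_D_BITS 14
#define DEMO_D_MASK ((1u << DEMO_D_BITS) - 1)

/* dv = _DV2_B(p,5,5); index = ((0x9f5f * dv) >> 5) & mask */
static unsigned dindex(const unsigned char *p)
{
	uint32_t dv = ((((uint32_t)p[2] << 5) ^ p[1]) << 5) ^ p[0];
	return (unsigned)(((0x9f5f * dv) >> 5) & DEMO_D_MASK);
}

int main(void)
{
	const char *s = "abcabcx";
	int i;
	for (i = 0; i + 3 <= 7; i++)
		printf("\"%.3s\" -> slot %u\n", s + i,
		       dindex((const unsigned char *)s + i));
	return 0;
}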
27465+
27466+#ifndef DINDEX
27467+#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27468+#endif
27469+#if !defined(DINDEX1) && defined(D_INDEX1)
27470+#define DINDEX1 D_INDEX1
27471+#endif
27472+#if !defined(DINDEX2) && defined(D_INDEX2)
27473+#define DINDEX2 D_INDEX2
27474+#endif
27475+
27476+#if !defined(__LZO_HASH_INCREMENTAL)
27477+# define DVAL_FIRST(dv,p) ((void) 0)
27478+# define DVAL_NEXT(dv,p) ((void) 0)
27479+# define DVAL_LOOKAHEAD 0
27480+#endif
27481+
27482+#if !defined(DVAL_ASSERT)
27483+#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27484+static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p)
27485+{
27486+ lzo_uint32 df;
27487+ DVAL_FIRST(df, (p));
27488+ assert(DINDEX(dv, p) == DINDEX(df, p));
27489+}
27490+#else
27491+# define DVAL_ASSERT(dv,p) ((void) 0)
27492+#endif
27493+#endif
27494+
27495+# define DENTRY(p,in) (p)
27496+# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27497+
27498+#if (DD_BITS == 0)
27499+
27500+# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27501+# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27502+# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27503+
27504+#else
27505+
27506+# define UPDATE_D(dict,drun,dv,p,in) \
27507+ dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27508+# define UPDATE_I(dict,drun,index,p,in) \
27509+ dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27510+# define UPDATE_P(ptr,drun,p,in) \
27511+ (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
27512+
27513+#endif
27514+
27515+#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
27516+ (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
27517+
27518+#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
27519+ (BOUNDS_CHECKING_OFF_IN_EXPR( \
27520+ (PTR_LT(m_pos,in) || \
27521+ (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
27522+ m_off > max_offset) ))
27523+
27524+#if defined(LZO_DETERMINISTIC)
27525+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
27526+#else
27527+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
27528+#endif
27529+#endif
27530+#endif
27531+#endif
27532+#define DO_COMPRESS lzo1x_1_compress
27533+static
27534+lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
27535+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27536+{
27537+ register const lzo_byte *ip;
27538+ lzo_byte *op;
27539+ const lzo_byte *const in_end = in + in_len;
27540+ const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
27541+ const lzo_byte *ii;
27542+ lzo_dict_p const dict = (lzo_dict_p) wrkmem;
27543+
27544+ op = out;
27545+ ip = in;
27546+ ii = ip;
27547+
27548+ ip += 4;
27549+ for (;;) {
27550+ register const lzo_byte *m_pos;
27551+
27552+ lzo_moff_t m_off;
27553+ lzo_uint m_len;
27554+ lzo_uint dindex;
27555+
27556+ DINDEX1(dindex, ip);
27557+ GINDEX(m_pos, m_off, dict, dindex, in);
27558+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27559+ goto literal;
27560+#if 1
27561+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27562+ goto try_match;
27563+ DINDEX2(dindex, ip);
27564+#endif
27565+ GINDEX(m_pos, m_off, dict, dindex, in);
27566+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27567+ goto literal;
27568+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27569+ goto try_match;
27570+ goto literal;
27571+
27572+ try_match:
27573+#if 1 && defined(LZO_UNALIGNED_OK_2)
27574+ if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
27575+#else
27576+ if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
27577+#endif
27578+ ;
27579+ } else {
27580+ if (m_pos[2] == ip[2]) {
27581+ goto match;
27582+ } else {
27583+ ;
27584+ }
27585+ }
27586+
27587+ literal:
27588+ UPDATE_I(dict, 0, dindex, ip, in);
27589+ ++ip;
27590+ if (ip >= ip_end)
27591+ break;
27592+ continue;
27593+
27594+ match:
27595+ UPDATE_I(dict, 0, dindex, ip, in);
27596+ if (pd(ip, ii) > 0) {
27597+ register lzo_uint t = pd(ip, ii);
27598+
27599+ if (t <= 3) {
27600+ assert("lzo-04", op - 2 > out);
27601+ op[-2] |= LZO_BYTE(t);
27602+ } else if (t <= 18)
27603+ *op++ = LZO_BYTE(t - 3);
27604+ else {
27605+ register lzo_uint tt = t - 18;
27606+
27607+ *op++ = 0;
27608+ while (tt > 255) {
27609+ tt -= 255;
27610+ *op++ = 0;
27611+ }
27612+ assert("lzo-05", tt > 0);
27613+ *op++ = LZO_BYTE(tt);
27614+ }
27615+ do
27616+ *op++ = *ii++;
27617+ while (--t > 0);
27618+ }
27619+
27620+ assert("lzo-06", ii == ip);
27621+ ip += 3;
27622+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
27623+ || m_pos[6] != *ip++ || m_pos[7] != *ip++
27624+ || m_pos[8] != *ip++
27625+#ifdef LZO1Y
27626+ || m_pos[9] != *ip++ || m_pos[10] != *ip++
27627+ || m_pos[11] != *ip++ || m_pos[12] != *ip++
27628+ || m_pos[13] != *ip++ || m_pos[14] != *ip++
27629+#endif
27630+ ) {
27631+ --ip;
27632+ m_len = ip - ii;
27633+ assert("lzo-07", m_len >= 3);
27634+ assert("lzo-08", m_len <= M2_MAX_LEN);
27635+
27636+ if (m_off <= M2_MAX_OFFSET) {
27637+ m_off -= 1;
27638+#if defined(LZO1X)
27639+ *op++ =
27640+ LZO_BYTE(((m_len -
27641+ 1) << 5) | ((m_off & 7) << 2));
27642+ *op++ = LZO_BYTE(m_off >> 3);
27643+#elif defined(LZO1Y)
27644+ *op++ =
27645+ LZO_BYTE(((m_len +
27646+ 1) << 4) | ((m_off & 3) << 2));
27647+ *op++ = LZO_BYTE(m_off >> 2);
27648+#endif
27649+ } else if (m_off <= M3_MAX_OFFSET) {
27650+ m_off -= 1;
27651+ *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
27652+ goto m3_m4_offset;
27653+ } else
27654+#if defined(LZO1X)
27655+ {
27656+ m_off -= 0x4000;
27657+ assert("lzo-09", m_off > 0);
27658+ assert("lzo-10", m_off <= 0x7fff);
27659+ *op++ = LZO_BYTE(M4_MARKER |
27660+ ((m_off & 0x4000) >> 11) |
27661+ (m_len - 2));
27662+ goto m3_m4_offset;
27663+ }
27664+#elif defined(LZO1Y)
27665+ goto m4_match;
27666+#endif
27667+ } else {
27668+ {
27669+ const lzo_byte *end = in_end;
27670+ const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
27671+ while (ip < end && *m == *ip)
27672+ m++, ip++;
27673+ m_len = (ip - ii);
27674+ }
27675+ assert("lzo-11", m_len > M2_MAX_LEN);
27676+
27677+ if (m_off <= M3_MAX_OFFSET) {
27678+ m_off -= 1;
27679+ if (m_len <= 33)
27680+ *op++ =
27681+ LZO_BYTE(M3_MARKER | (m_len - 2));
27682+ else {
27683+ m_len -= 33;
27684+ *op++ = M3_MARKER | 0;
27685+ goto m3_m4_len;
27686+ }
27687+ } else {
27688+#if defined(LZO1Y)
27689+ m4_match:
27690+#endif
27691+ m_off -= 0x4000;
27692+ assert("lzo-12", m_off > 0);
27693+ assert("lzo-13", m_off <= 0x7fff);
27694+ if (m_len <= M4_MAX_LEN)
27695+ *op++ = LZO_BYTE(M4_MARKER |
27696+ ((m_off & 0x4000) >>
27697+ 11) | (m_len - 2));
27698+ else {
27699+ m_len -= M4_MAX_LEN;
27700+ *op++ =
27701+ LZO_BYTE(M4_MARKER |
27702+ ((m_off & 0x4000) >> 11));
27703+ m3_m4_len:
27704+ while (m_len > 255) {
27705+ m_len -= 255;
27706+ *op++ = 0;
27707+ }
27708+ assert("lzo-14", m_len > 0);
27709+ *op++ = LZO_BYTE(m_len);
27710+ }
27711+ }
27712+
27713+ m3_m4_offset:
27714+ *op++ = LZO_BYTE((m_off & 63) << 2);
27715+ *op++ = LZO_BYTE(m_off >> 6);
27716+ }
27717+
27718+ ii = ip;
27719+ if (ip >= ip_end)
27720+ break;
27721+ }
27722+
27723+ *out_len = op - out;
27724+ return pd(in_end, ii);
27725+}
27726+
27727+int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
27728+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27729+{
27730+ lzo_byte *op = out;
27731+ lzo_uint t;
27732+
27733+#if defined(__LZO_QUERY_COMPRESS)
27734+ if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27735+ return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
27736+ D_SIZE, lzo_sizeof(lzo_dict_t));
27737+#endif
27738+
27739+ if (in_len <= M2_MAX_LEN + 5)
27740+ t = in_len;
27741+ else {
27742+ t = do_compress(in, in_len, op, out_len, wrkmem);
27743+ op += *out_len;
27744+ }
27745+
27746+ if (t > 0) {
27747+ const lzo_byte *ii = in + in_len - t;
27748+
27749+ if (op == out && t <= 238)
27750+ *op++ = LZO_BYTE(17 + t);
27751+ else if (t <= 3)
27752+ op[-2] |= LZO_BYTE(t);
27753+ else if (t <= 18)
27754+ *op++ = LZO_BYTE(t - 3);
27755+ else {
27756+ lzo_uint tt = t - 18;
27757+
27758+ *op++ = 0;
27759+ while (tt > 255) {
27760+ tt -= 255;
27761+ *op++ = 0;
27762+ }
27763+ assert("lzo-15", tt > 0);
27764+ *op++ = LZO_BYTE(tt);
27765+ }
27766+ do
27767+ *op++ = *ii++;
27768+ while (--t > 0);
27769+ }
27770+
27771+ *op++ = M4_MARKER | 1;
27772+ *op++ = 0;
27773+ *op++ = 0;
27774+
27775+ *out_len = op - out;
27776+ return LZO_E_OK;
27777+}
27778+
27779+#undef do_compress
27780+#undef DO_COMPRESS
27781+#undef LZO_HASH
27782+
27783+#undef LZO_TEST_DECOMPRESS_OVERRUN
27784+#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
27785+#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
27786+#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27787+#undef DO_DECOMPRESS
27788+#define DO_DECOMPRESS lzo1x_decompress
27789+
27790+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
27791+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27792+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
27793+# endif
27794+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27795+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
27796+# endif
27797+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27798+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
27799+# endif
27800+#endif
27801+
27802+#undef TEST_IP
27803+#undef TEST_OP
27804+#undef TEST_LOOKBEHIND
27805+#undef NEED_IP
27806+#undef NEED_OP
27807+#undef HAVE_TEST_IP
27808+#undef HAVE_TEST_OP
27809+#undef HAVE_NEED_IP
27810+#undef HAVE_NEED_OP
27811+#undef HAVE_ANY_IP
27812+#undef HAVE_ANY_OP
27813+
27814+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
27815+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
27816+# define TEST_IP (ip < ip_end)
27817+# endif
27818+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
27819+# define NEED_IP(x) \
27820+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
27821+# endif
27822+#endif
27823+
27824+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
27825+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
27826+# define TEST_OP (op <= op_end)
27827+# endif
27828+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
27829+# undef TEST_OP
27830+# define NEED_OP(x) \
27831+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
27832+# endif
27833+#endif
27834+
27835+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
27836+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
27837+#else
27838+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
27839+#endif
27840+
27841+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
27842+# define TEST_IP (ip < ip_end)
27843+#endif
27844+
27845+#if defined(TEST_IP)
27846+# define HAVE_TEST_IP
27847+#else
27848+# define TEST_IP 1
27849+#endif
27850+#if defined(TEST_OP)
27851+# define HAVE_TEST_OP
27852+#else
27853+# define TEST_OP 1
27854+#endif
27855+
27856+#if defined(NEED_IP)
27857+# define HAVE_NEED_IP
27858+#else
27859+# define NEED_IP(x) ((void) 0)
27860+#endif
27861+#if defined(NEED_OP)
27862+# define HAVE_NEED_OP
27863+#else
27864+# define NEED_OP(x) ((void) 0)
27865+#endif
27866+
27867+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
27868+# define HAVE_ANY_IP
27869+#endif
27870+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
27871+# define HAVE_ANY_OP
27872+#endif
27873+
27874+#undef __COPY4
27875+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
27876+
27877+#undef COPY4
27878+#if defined(LZO_UNALIGNED_OK_4)
27879+# define COPY4(dst,src) __COPY4(dst,src)
27880+#elif defined(LZO_ALIGNED_OK_4)
27881+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
27882+#endif
27883+
27884+#if defined(DO_DECOMPRESS)
27885+int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
27886+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27887+#endif
27888+{
27889+ register lzo_byte *op;
27890+ register const lzo_byte *ip;
27891+ register lzo_uint t;
27892+#if defined(COPY_DICT)
27893+ lzo_uint m_off;
27894+ const lzo_byte *dict_end;
27895+#else
27896+ register const lzo_byte *m_pos;
27897+#endif
27898+
27899+ const lzo_byte *const ip_end = in + in_len;
27900+#if defined(HAVE_ANY_OP)
27901+ lzo_byte *const op_end = out + *out_len;
27902+#endif
27903+#if defined(LZO1Z)
27904+ lzo_uint last_m_off = 0;
27905+#endif
27906+
27907+ LZO_UNUSED(wrkmem);
27908+
27909+#if defined(__LZO_QUERY_DECOMPRESS)
27910+ if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
27911+ return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
27912+ 0, 0);
27913+#endif
27914+
27915+#if defined(COPY_DICT)
27916+ if (dict) {
27917+ if (dict_len > M4_MAX_OFFSET) {
27918+ dict += dict_len - M4_MAX_OFFSET;
27919+ dict_len = M4_MAX_OFFSET;
27920+ }
27921+ dict_end = dict + dict_len;
27922+ } else {
27923+ dict_len = 0;
27924+ dict_end = NULL;
27925+ }
27926+#endif
27927+
27928+ *out_len = 0;
27929+
27930+ op = out;
27931+ ip = in;
27932+
27933+ if (*ip > 17) {
27934+ t = *ip++ - 17;
27935+ if (t < 4)
27936+ goto match_next;
27937+ assert("lzo-16", t > 0);
27938+ NEED_OP(t);
27939+ NEED_IP(t + 1);
27940+ do
27941+ *op++ = *ip++;
27942+ while (--t > 0);
27943+ goto first_literal_run;
27944+ }
27945+
27946+ while (TEST_IP && TEST_OP) {
27947+ t = *ip++;
27948+ if (t >= 16)
27949+ goto match;
27950+ if (t == 0) {
27951+ NEED_IP(1);
27952+ while (*ip == 0) {
27953+ t += 255;
27954+ ip++;
27955+ NEED_IP(1);
27956+ }
27957+ t += 15 + *ip++;
27958+ }
27959+ assert("lzo-17", t > 0);
27960+ NEED_OP(t + 3);
27961+ NEED_IP(t + 4);
27962+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
27963+#if !defined(LZO_UNALIGNED_OK_4)
27964+ if (PTR_ALIGNED2_4(op, ip)) {
27965+#endif
27966+ COPY4(op, ip);
27967+ op += 4;
27968+ ip += 4;
27969+ if (--t > 0) {
27970+ if (t >= 4) {
27971+ do {
27972+ COPY4(op, ip);
27973+ op += 4;
27974+ ip += 4;
27975+ t -= 4;
27976+ } while (t >= 4);
27977+ if (t > 0)
27978+ do
27979+ *op++ = *ip++;
27980+ while (--t > 0);
27981+ } else
27982+ do
27983+ *op++ = *ip++;
27984+ while (--t > 0);
27985+ }
27986+#if !defined(LZO_UNALIGNED_OK_4)
27987+ } else
27988+#endif
27989+#endif
27990+#if !defined(LZO_UNALIGNED_OK_4)
27991+ {
27992+ *op++ = *ip++;
27993+ *op++ = *ip++;
27994+ *op++ = *ip++;
27995+ do
27996+ *op++ = *ip++;
27997+ while (--t > 0);
27998+ }
27999+#endif
28000+
28001+ first_literal_run:
28002+
28003+ t = *ip++;
28004+ if (t >= 16)
28005+ goto match;
28006+#if defined(COPY_DICT)
28007+#if defined(LZO1Z)
28008+ m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28009+ last_m_off = m_off;
28010+#else
28011+ m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
28012+#endif
28013+ NEED_OP(3);
28014+ t = 3;
28015+ COPY_DICT(t, m_off)
28016+#else
28017+#if defined(LZO1Z)
28018+ t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28019+ m_pos = op - t;
28020+ last_m_off = t;
28021+#else
28022+ m_pos = op - (1 + M2_MAX_OFFSET);
28023+ m_pos -= t >> 2;
28024+ m_pos -= *ip++ << 2;
28025+#endif
28026+ TEST_LOOKBEHIND(m_pos, out);
28027+ NEED_OP(3);
28028+ *op++ = *m_pos++;
28029+ *op++ = *m_pos++;
28030+ *op++ = *m_pos;
28031+#endif
28032+ goto match_done;
28033+
28034+ while (TEST_IP && TEST_OP) {
28035+ match:
28036+ if (t >= 64) {
28037+#if defined(COPY_DICT)
28038+#if defined(LZO1X)
28039+ m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
28040+ t = (t >> 5) - 1;
28041+#elif defined(LZO1Y)
28042+ m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
28043+ t = (t >> 4) - 3;
28044+#elif defined(LZO1Z)
28045+ m_off = t & 0x1f;
28046+ if (m_off >= 0x1c)
28047+ m_off = last_m_off;
28048+ else {
28049+ m_off = 1 + (m_off << 6) + (*ip++ >> 2);
28050+ last_m_off = m_off;
28051+ }
28052+ t = (t >> 5) - 1;
28053+#endif
28054+#else
28055+#if defined(LZO1X)
28056+ m_pos = op - 1;
28057+ m_pos -= (t >> 2) & 7;
28058+ m_pos -= *ip++ << 3;
28059+ t = (t >> 5) - 1;
28060+#elif defined(LZO1Y)
28061+ m_pos = op - 1;
28062+ m_pos -= (t >> 2) & 3;
28063+ m_pos -= *ip++ << 2;
28064+ t = (t >> 4) - 3;
28065+#elif defined(LZO1Z)
28066+ {
28067+ lzo_uint off = t & 0x1f;
28068+ m_pos = op;
28069+ if (off >= 0x1c) {
28070+ assert(last_m_off > 0);
28071+ m_pos -= last_m_off;
28072+ } else {
28073+ off =
28074+ 1 + (off << 6) +
28075+ (*ip++ >> 2);
28076+ m_pos -= off;
28077+ last_m_off = off;
28078+ }
28079+ }
28080+ t = (t >> 5) - 1;
28081+#endif
28082+ TEST_LOOKBEHIND(m_pos, out);
28083+ assert("lzo-18", t > 0);
28084+ NEED_OP(t + 3 - 1);
28085+ goto copy_match;
28086+#endif
28087+ } else if (t >= 32) {
28088+ t &= 31;
28089+ if (t == 0) {
28090+ NEED_IP(1);
28091+ while (*ip == 0) {
28092+ t += 255;
28093+ ip++;
28094+ NEED_IP(1);
28095+ }
28096+ t += 31 + *ip++;
28097+ }
28098+#if defined(COPY_DICT)
28099+#if defined(LZO1Z)
28100+ m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28101+ last_m_off = m_off;
28102+#else
28103+ m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28104+#endif
28105+#else
28106+#if defined(LZO1Z)
28107+ {
28108+ lzo_uint off =
28109+ 1 + (ip[0] << 6) + (ip[1] >> 2);
28110+ m_pos = op - off;
28111+ last_m_off = off;
28112+ }
28113+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28114+ m_pos = op - 1;
28115+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28116+#else
28117+ m_pos = op - 1;
28118+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28119+#endif
28120+#endif
28121+ ip += 2;
28122+ } else if (t >= 16) {
28123+#if defined(COPY_DICT)
28124+ m_off = (t & 8) << 11;
28125+#else
28126+ m_pos = op;
28127+ m_pos -= (t & 8) << 11;
28128+#endif
28129+ t &= 7;
28130+ if (t == 0) {
28131+ NEED_IP(1);
28132+ while (*ip == 0) {
28133+ t += 255;
28134+ ip++;
28135+ NEED_IP(1);
28136+ }
28137+ t += 7 + *ip++;
28138+ }
28139+#if defined(COPY_DICT)
28140+#if defined(LZO1Z)
28141+ m_off += (ip[0] << 6) + (ip[1] >> 2);
28142+#else
28143+ m_off += (ip[0] >> 2) + (ip[1] << 6);
28144+#endif
28145+ ip += 2;
28146+ if (m_off == 0)
28147+ goto eof_found;
28148+ m_off += 0x4000;
28149+#if defined(LZO1Z)
28150+ last_m_off = m_off;
28151+#endif
28152+#else
28153+#if defined(LZO1Z)
28154+ m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28155+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28156+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28157+#else
28158+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28159+#endif
28160+ ip += 2;
28161+ if (m_pos == op)
28162+ goto eof_found;
28163+ m_pos -= 0x4000;
28164+#if defined(LZO1Z)
28165+ last_m_off = op - m_pos;
28166+#endif
28167+#endif
28168+ } else {
28169+#if defined(COPY_DICT)
28170+#if defined(LZO1Z)
28171+ m_off = 1 + (t << 6) + (*ip++ >> 2);
28172+ last_m_off = m_off;
28173+#else
28174+ m_off = 1 + (t >> 2) + (*ip++ << 2);
28175+#endif
28176+ NEED_OP(2);
28177+ t = 2;
28178+ COPY_DICT(t, m_off)
28179+#else
28180+#if defined(LZO1Z)
28181+ t = 1 + (t << 6) + (*ip++ >> 2);
28182+ m_pos = op - t;
28183+ last_m_off = t;
28184+#else
28185+ m_pos = op - 1;
28186+ m_pos -= t >> 2;
28187+ m_pos -= *ip++ << 2;
28188+#endif
28189+ TEST_LOOKBEHIND(m_pos, out);
28190+ NEED_OP(2);
28191+ *op++ = *m_pos++;
28192+ *op++ = *m_pos;
28193+#endif
28194+ goto match_done;
28195+ }
28196+
28197+#if defined(COPY_DICT)
28198+
28199+ NEED_OP(t + 3 - 1);
28200+ t += 3 - 1;
28201+ COPY_DICT(t, m_off)
28202+#else
28203+
28204+ TEST_LOOKBEHIND(m_pos, out);
28205+ assert("lzo-19", t > 0);
28206+ NEED_OP(t + 3 - 1);
28207+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28208+#if !defined(LZO_UNALIGNED_OK_4)
28209+ if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28210+ assert((op - m_pos) >= 4);
28211+#else
28212+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28213+#endif
28214+ COPY4(op, m_pos);
28215+ op += 4;
28216+ m_pos += 4;
28217+ t -= 4 - (3 - 1);
28218+ do {
28219+ COPY4(op, m_pos);
28220+ op += 4;
28221+ m_pos += 4;
28222+ t -= 4;
28223+ } while (t >= 4);
28224+ if (t > 0)
28225+ do
28226+ *op++ = *m_pos++;
28227+ while (--t > 0);
28228+ } else
28229+#endif
28230+ {
28231+ copy_match:
28232+ *op++ = *m_pos++;
28233+ *op++ = *m_pos++;
28234+ do
28235+ *op++ = *m_pos++;
28236+ while (--t > 0);
28237+ }
28238+
28239+#endif
28240+
28241+ match_done:
28242+#if defined(LZO1Z)
28243+ t = ip[-1] & 3;
28244+#else
28245+ t = ip[-2] & 3;
28246+#endif
28247+ if (t == 0)
28248+ break;
28249+
28250+ match_next:
28251+ assert("lzo-20", t > 0);
28252+ NEED_OP(t);
28253+ NEED_IP(t + 1);
28254+ do
28255+ *op++ = *ip++;
28256+ while (--t > 0);
28257+ t = *ip++;
28258+ }
28259+ }
28260+
28261+#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28262+ *out_len = op - out;
28263+ return LZO_E_EOF_NOT_FOUND;
28264+#endif
28265+
28266+ eof_found:
28267+ assert("lzo-21", t == 1);
28268+ *out_len = op - out;
28269+ return (ip == ip_end ? LZO_E_OK :
28270+ (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28271+
28272+#if defined(HAVE_NEED_IP)
28273+ input_overrun:
28274+ *out_len = op - out;
28275+ return LZO_E_INPUT_OVERRUN;
28276+#endif
28277+
28278+#if defined(HAVE_NEED_OP)
28279+ output_overrun:
28280+ *out_len = op - out;
28281+ return LZO_E_OUTPUT_OVERRUN;
28282+#endif
28283+
28284+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28285+ lookbehind_overrun:
28286+ *out_len = op - out;
28287+ return LZO_E_LOOKBEHIND_OVERRUN;
28288+#endif
28289+}
28290+
28291+#define LZO_TEST_DECOMPRESS_OVERRUN
28292+#undef DO_DECOMPRESS
28293+#define DO_DECOMPRESS lzo1x_decompress_safe
28294+
28295+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28296+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28297+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28298+# endif
28299+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28300+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28301+# endif
28302+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28303+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28304+# endif
28305+#endif
28306+
28307+#undef TEST_IP
28308+#undef TEST_OP
28309+#undef TEST_LOOKBEHIND
28310+#undef NEED_IP
28311+#undef NEED_OP
28312+#undef HAVE_TEST_IP
28313+#undef HAVE_TEST_OP
28314+#undef HAVE_NEED_IP
28315+#undef HAVE_NEED_OP
28316+#undef HAVE_ANY_IP
28317+#undef HAVE_ANY_OP
28318+
28319+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28320+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28321+# define TEST_IP (ip < ip_end)
28322+# endif
28323+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28324+# define NEED_IP(x) \
28325+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28326+# endif
28327+#endif
28328+
28329+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28330+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28331+# define TEST_OP (op <= op_end)
28332+# endif
28333+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28334+# undef TEST_OP
28335+# define NEED_OP(x) \
28336+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28337+# endif
28338+#endif
28339+
28340+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28341+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28342+#else
28343+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28344+#endif
28345+
28346+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28347+# define TEST_IP (ip < ip_end)
28348+#endif
28349+
28350+#if defined(TEST_IP)
28351+# define HAVE_TEST_IP
28352+#else
28353+# define TEST_IP 1
28354+#endif
28355+#if defined(TEST_OP)
28356+# define HAVE_TEST_OP
28357+#else
28358+# define TEST_OP 1
28359+#endif
28360+
28361+#if defined(NEED_IP)
28362+# define HAVE_NEED_IP
28363+#else
28364+# define NEED_IP(x) ((void) 0)
28365+#endif
28366+#if defined(NEED_OP)
28367+# define HAVE_NEED_OP
28368+#else
28369+# define NEED_OP(x) ((void) 0)
28370+#endif
28371+
28372+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28373+# define HAVE_ANY_IP
28374+#endif
28375+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28376+# define HAVE_ANY_OP
28377+#endif
28378+
28379+#undef __COPY4
28380+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28381+
28382+#undef COPY4
28383+#if defined(LZO_UNALIGNED_OK_4)
28384+# define COPY4(dst,src) __COPY4(dst,src)
28385+#elif defined(LZO_ALIGNED_OK_4)
28386+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28387+#endif
28388+
28389+/***** End of minilzo.c *****/
28390diff -urN linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.h linux-2.6.22/fs/reiser4/plugin/compress/minilzo.h
28391--- linux-2.6.22.orig/fs/reiser4/plugin/compress/minilzo.h 1970-01-01 03:00:00.000000000 +0300
28392+++ linux-2.6.22/fs/reiser4/plugin/compress/minilzo.h 2007-07-29 00:25:34.900702689 +0400
28393@@ -0,0 +1,70 @@
28394+/* minilzo.h -- mini subset of the LZO real-time data compression library
28395+ adapted for the reiser4 compression transform plugin.
28396+
28397+ This file is part of the LZO real-time data compression library
28398+ and not included in any proprietary licenses of reiser4.
28399+
28400+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28401+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28402+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28403+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28404+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28405+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28406+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28407+ All Rights Reserved.
28408+
28409+ The LZO library is free software; you can redistribute it and/or
28410+ modify it under the terms of the GNU General Public License as
28411+ published by the Free Software Foundation; either version 2 of
28412+ the License, or (at your option) any later version.
28413+
28414+ The LZO library is distributed in the hope that it will be useful,
28415+ but WITHOUT ANY WARRANTY; without even the implied warranty of
28416+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28417+ GNU General Public License for more details.
28418+
28419+ You should have received a copy of the GNU General Public License
28420+ along with the LZO library; see the file COPYING.
28421+ If not, write to the Free Software Foundation, Inc.,
28422+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28423+
28424+ Markus F.X.J. Oberhumer
28425+ <markus@oberhumer.com>
28426+ http://www.oberhumer.com/opensource/lzo/
28427+ */
28428+
28429+/*
28430+ * NOTE:
28431+ * the full LZO package can be found at
28432+ * http://www.oberhumer.com/opensource/lzo/
28433+ */
28434+
28435+#ifndef __MINILZO_H
28436+#define __MINILZO_H
28437+
28438+#define MINILZO_VERSION 0x1080
28439+
28440+#include "lzoconf.h"
28441+
28442+/* Memory required for the wrkmem parameter.
28443+ * When the required size is 0, you can also pass a NULL pointer.
28444+ */
28445+
28446+#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28447+#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28448+#define LZO1X_MEM_DECOMPRESS (0)
28449+
28450+/* compression */
28451+extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28452+ lzo_byte * dst, lzo_uintp dst_len,
28453+ lzo_voidp wrkmem);
28454+/* decompression */
28455+extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28456+ lzo_byte * dst, lzo_uintp dst_len,
28457+ lzo_voidp wrkmem /* NOT USED */);
28458+/* safe decompression with overrun testing */
28459+extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
28460+ lzo_byte * dst, lzo_uintp dst_len,
28461+ lzo_voidp wrkmem /* NOT USED */ );
28462+
28463+#endif /* already included */
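
For orientation, here is a minimal round-trip sketch of the three entry
points declared above. It is illustrative only and not part of the patch:
the helper name, the vmalloc-based buffers, and the commonly quoted LZO1X
worst-case output bound (in_len + in_len/16 + 64 + 3) are assumptions.

    #include <linux/vmalloc.h>
    #include <linux/string.h>
    #include "minilzo.h"

    static int lzo_roundtrip_sketch(const lzo_byte *src, lzo_uint src_len)
    {
            lzo_byte *dst, *back;
            lzo_uint dst_len, back_len = src_len; /* capacity for the safe variant */
            lzo_voidp wrkmem;
            int ret = -1;

            dst = vmalloc(src_len + src_len / 16 + 64 + 3);
            back = vmalloc(src_len);
            wrkmem = vmalloc(LZO1X_1_MEM_COMPRESS); /* compressor scratch space */
            if (!dst || !back || !wrkmem)
                    goto out;
            if (lzo1x_1_compress(src, src_len, dst, &dst_len, wrkmem) != LZO_E_OK)
                    goto out;
            /* decompression needs no work memory (LZO1X_MEM_DECOMPRESS is 0);
               the _safe variant checks every read and write against the
               buffer bounds, so back_len must be preset to the capacity */
            if (lzo1x_decompress_safe(dst, dst_len, back, &back_len, NULL) != LZO_E_OK
                || back_len != src_len || memcmp(back, src, src_len) != 0)
                    goto out;
            ret = 0;
    out:
            vfree(dst);
            vfree(back);
            vfree(wrkmem);
            return ret;
    }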
28464diff -urN linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.22/fs/reiser4/plugin/crypto/cipher.c
28465--- linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
28466+++ linux-2.6.22/fs/reiser4/plugin/crypto/cipher.c 2007-07-29 00:25:34.900702689 +0400
28467@@ -0,0 +1,37 @@
28468+/* Copyright 2001, 2002, 2003 by Hans Reiser,
28469+ licensing governed by reiser4/README */
28470+/* Reiser4 cipher transform plugins */
28471+
28472+#include "../../debug.h"
28473+#include "../plugin.h"
28474+
28475+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
28476+ [NONE_CIPHER_ID] = {
28477+ .h = {
28478+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
28479+ .id = NONE_CIPHER_ID,
28480+ .pops = NULL,
28481+ .label = "none",
28482+ .desc = "no cipher transform",
28483+ .linkage = {NULL, NULL}
28484+ },
28485+ .alloc = NULL,
28486+ .free = NULL,
28487+ .scale = NULL,
28488+ .align_stream = NULL,
28489+ .setkey = NULL,
28490+ .encrypt = NULL,
28491+ .decrypt = NULL
28492+ }
28493+};
28494+
28495+/* Make Linus happy.
28496+ Local variables:
28497+ c-indentation-style: "K&R"
28498+ mode-name: "LC"
28499+ c-basic-offset: 8
28500+ tab-width: 8
28501+ fill-column: 120
28502+ scroll-step: 1
28503+ End:
28504+*/
28505diff -urN linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.22/fs/reiser4/plugin/crypto/cipher.h
28506--- linux-2.6.22.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
28507+++ linux-2.6.22/fs/reiser4/plugin/crypto/cipher.h 2007-07-29 00:25:34.900702689 +0400
28508@@ -0,0 +1,55 @@
28509+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28510+/* This file contains definitions for the objects operated on
28511+ by the reiser4 key manager, which is something like a keyring
28512+ wrapped by an appropriate reiser4 plugin */
28513+
28514+#if !defined( __FS_REISER4_CRYPT_H__ )
28515+#define __FS_REISER4_CRYPT_H__
28516+
28517+#include <linux/crypto.h>
28518+
28519+/* key info imported from user space */
28520+struct reiser4_crypto_data {
28521+ int keysize; /* uninstantiated key size */
28522+ __u8 * key; /* uninstantiated key */
28523+ int keyid_size; /* size of passphrase */
28524+ __u8 * keyid; /* passphrase */
28525+};
28526+
28527+/* This object contains all the infrastructure needed to implement
28528+ a cipher transform. It is operated on (allocated, inherited,
28529+ validated, bound to a host inode, etc.) by the reiser4 key manager.
28530+
28531+ This info can be allocated in two cases:
28532+ 1. importing a key from user space.
28533+ 2. reading inode from disk */
28534+struct reiser4_crypto_info {
28535+ struct inode * host;
28536+ struct crypto_hash * digest;
28537+ struct crypto_blkcipher * cipher;
28538+#if 0
28539+ cipher_key_plugin * kplug; /* key manager */
28540+#endif
28541+ __u8 * keyid; /* key fingerprint, created by digest plugin,
28542+ using uninstantiated key and passphrase.
28543+ supposed to be stored in disk stat-data */
28544+ int inst; /* this indicates if the cipher key is
28545+ instantiated (case 1 above) */
28546+ int keysize; /* uninstantiated key size (bytes), supposed
28547+ to be stored in disk stat-data */
28548+ int keyload_count; /* number of objects which have this
28549+ crypto-stat attached */
28550+};
28551+
28552+#endif /* __FS_REISER4_CRYPT_H__ */
28553+
28554+/*
28555+ Local variables:
28556+ c-indentation-style: "K&R"
28557+ mode-name: "LC"
28558+ c-basic-offset: 8
28559+ tab-width: 8
28560+ fill-column: 120
28561+ scroll-step: 1
28562+ End:
28563+*/
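
To make case 1 above concrete, the snippet below shows roughly what the
key manager would receive from user space. It is illustrative only; the
variable names and values are made up, while the struct and the meaning
of its fields are the ones defined in this hunk.

    static __u8 demo_key[16];                    /* raw uninstantiated key */
    static __u8 demo_pass[] = "passphrase";

    static struct reiser4_crypto_data demo_crypto_data = {
            .keysize    = sizeof(demo_key),      /* key size in bytes */
            .key        = demo_key,
            .keyid_size = sizeof(demo_pass) - 1, /* passphrase length */
            .keyid      = demo_pass,
    };

From this the key manager builds a reiser4_crypto_info: the digest plugin
condenses key and passphrase into ->keyid, ->inst marks the cipher key as
instantiated, and ->keysize is what later lands in the disk stat-data.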
28564diff -urN linux-2.6.22.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.22/fs/reiser4/plugin/crypto/digest.c
28565--- linux-2.6.22.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
28566+++ linux-2.6.22/fs/reiser4/plugin/crypto/digest.c 2007-07-29 00:25:34.900702689 +0400
28567@@ -0,0 +1,58 @@
28568+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28569+
28570+/* reiser4 digest transform plugin (used by the cryptcompress object plugin) */
28571+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
28572+#include "../../debug.h"
28573+#include "../plugin_header.h"
28574+#include "../plugin.h"
28575+#include "../file/cryptcompress.h"
28576+
28577+#include <linux/types.h>
28578+
28579+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
28580+
28581+static struct crypto_hash * alloc_sha256 (void)
28582+{
28583+#if REISER4_SHA256
28584+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
28585+#else
28586+ warning("edward-1418", "sha256 unsupported");
28587+ return ERR_PTR(-EINVAL);
28588+#endif
28589+}
28590+
28591+static void free_sha256 (struct crypto_hash * tfm)
28592+{
28593+#if REISER4_SHA256
28594+ crypto_free_hash(tfm);
28595+#endif
28596+ return;
28597+}
28598+
28599+/* digest plugins */
28600+digest_plugin digest_plugins[LAST_DIGEST_ID] = {
28601+ [SHA256_32_DIGEST_ID] = {
28602+ .h = {
28603+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
28604+ .id = SHA256_32_DIGEST_ID,
28605+ .pops = NULL,
28606+ .label = "sha256_32",
28607+ .desc = "sha256_32 digest transform",
28608+ .linkage = {NULL, NULL}
28609+ },
28610+ .fipsize = sizeof(__u32),
28611+ .alloc = alloc_sha256,
28612+ .free = free_sha256
28613+ }
28614+};
28615+
28616+/*
28617+ Local variables:
28618+ c-indentation-style: "K&R"
28619+ mode-name: "LC"
28620+ c-basic-offset: 8
28621+ tab-width: 8
28622+ fill-column: 120
28623+ scroll-step: 1
28624+ End:
28625+*/
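
The sketch below shows how a tfm returned by the plugin's ->alloc() could
be driven, assuming the 2.6.22-era crypto hash API (struct hash_desc,
sg_init_one, crypto_hash_digest). The helper name and its use for key
fingerprinting are illustrative; the patch's actual call sites live in
the cryptcompress code.

    #include <linux/crypto.h>
    #include <linux/scatterlist.h>

    static int sha256_digest_sketch(struct crypto_hash *tfm,
                                    const u8 *buf, unsigned int len, u8 *out)
    {
            struct hash_desc desc = { .tfm = tfm, .flags = 0 };
            struct scatterlist sg;

            sg_init_one(&sg, buf, len);
            /* out must hold crypto_hash_digestsize(tfm) bytes (32 for
               sha256); per .fipsize above, reiser4 keeps only the first
               sizeof(__u32) of them as the fingerprint */
            return crypto_hash_digest(&desc, &sg, len, out);
    }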
28626diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.22/fs/reiser4/plugin/dir/dir.h
28627--- linux-2.6.22.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 03:00:00.000000000 +0300
28628+++ linux-2.6.22/fs/reiser4/plugin/dir/dir.h 2007-07-29 00:25:34.900702689 +0400
28629@@ -0,0 +1,36 @@
28630+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28631+ * reiser4/README */
28632+
28633+/* this file contains declarations of methods implementing directory plugins */
28634+
28635+#if !defined( __REISER4_DIR_H__ )
28636+#define __REISER4_DIR_H__
28637+
28638+/*#include "../../key.h"
28639+
28640+#include <linux/fs.h>*/
28641+
28642+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
28643+
28644+/* "hashed" directory methods of dir plugin */
28645+void build_entry_key_hashed(const struct inode *, const struct qstr *,
28646+ reiser4_key *);
28647+
28648+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
28649+
28650+/* "seekable" directory methods of dir plugin */
28651+void build_entry_key_seekable(const struct inode *, const struct qstr *,
28652+ reiser4_key *);
28653+
28654+/* __REISER4_DIR_H__ */
28655+#endif
28656+
28657+/*
28658+ Local variables:
28659+ c-indentation-style: "K&R"
28660+ mode-name: "LC"
28661+ c-basic-offset: 8
28662+ tab-width: 8
28663+ fill-column: 120
28664+ End:
28665+*/
28666diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.22/fs/reiser4/plugin/dir/hashed_dir.c
28667--- linux-2.6.22.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 03:00:00.000000000 +0300
28668+++ linux-2.6.22/fs/reiser4/plugin/dir/hashed_dir.c 2007-07-29 00:25:34.900702689 +0400
44254afd
MT
28669@@ -0,0 +1,81 @@
28670+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
28671+ * reiser4/README */
28672+
28673+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
28674+ names to the files. */
28675+
28676+/*
28677+ * Hashed directory logically consists of persistent directory
28678+ * entries. Directory entry is a pair of a file name and a key of stat-data of
28679+ * a file that has this name in the given directory.
28680+ *
28681+ * Directory entries are stored in the tree in the form of directory
28682+ * items. Directory item should implement dir_entry_ops portion of item plugin
28683+ * interface (see plugin/item/item.h). Hashed directory interacts with
28684+ * directory item plugin exclusively through dir_entry_ops operations.
28685+ *
28686+ * Currently there are two implementations of directory items: "simple
28687+ * directory item" (plugin/item/sde.[ch]), and "compound directory item"
28688+ * (plugin/item/cde.[ch]) with the latter being the default.
28689+ *
28690+ * There is, however, a delicate way in which directory code interacts
28691+ * with the item plugin: key assignment policy. A key for a directory item is
28692+ * chosen by directory code, and as described in kassign.c, this key contains
28693+ * a portion of file name. Directory item uses this knowledge to avoid storing
28694+ * this portion of file name twice: in the key and in the directory item body.
28695+ *
28696+ */
28697+
28698+#include "../../inode.h"
28699+
28700+void complete_entry_key(const struct inode *, const char *name,
28701+ int len, reiser4_key * result);
28702+
28703+/* this is implementation of build_entry_key method of dir
28704+ plugin for HASHED_DIR_PLUGIN_ID
28705+ */
28706+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
28707+ * (or will be) in.*/
28708+ const struct qstr *qname, /* name of file referenced
28709+ * by this entry */
28710+ reiser4_key * result /* resulting key of directory
28711+ * entry */ )
28712+{
28713+ const char *name;
28714+ int len;
28715+
28716+ assert("nikita-1139", dir != NULL);
28717+ assert("nikita-1140", qname != NULL);
28718+ assert("nikita-1141", qname->name != NULL);
28719+ assert("nikita-1142", result != NULL);
28720+
28721+ name = qname->name;
28722+ len = qname->len;
28723+
28724+ assert("nikita-2867", strlen(name) == len);
28725+
28726+ reiser4_key_init(result);
28727+ /* locality of directory entry's key is objectid of parent
28728+ directory */
28729+ set_key_locality(result, get_inode_oid(dir));
28730+ /* minor packing locality is constant */
28731+ set_key_type(result, KEY_FILE_NAME_MINOR);
28732+ /* dot is a special case---we always want it to be the first
28733+ entry in a directory; actually, we just want it to have the
28734+ smallest directory entry key.
28735+ */
28736+ if (len == 1 && name[0] == '.')
28737+ return;
28738+
28739+ /* initialize part of entry key which depends on file name */
28740+ complete_entry_key(dir, name, len, result);
28741+}
28742+
28743+/* Local variables:
28744+ c-indentation-style: "K&R"
28745+ mode-name: "LC"
28746+ c-basic-offset: 8
28747+ tab-width: 8
28748+ fill-column: 120
28749+ End:
28750+*/
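
For illustration, here is what the two cases handled above look like from
a caller's point of view. The helper is hypothetical, and the exact
name-dependent encoding is delegated to complete_entry_key() in kassign.c,
which this hunk does not spell out.

    static void entry_key_sketch(struct inode *dir)
    {
            reiser4_key k;
            struct qstr dot  = { .name = (const unsigned char *)".",    .len = 1 };
            struct qstr file = { .name = (const unsigned char *)"data", .len = 4 };

            /* "." gets only the locality (= parent oid) and the file-name
               minor type; the all-zero remainder makes it the smallest
               possible entry key in this directory */
            build_entry_key_hashed(dir, &dot, &k);

            /* any other name also gets a name-derived portion from
               complete_entry_key(), so entries sort by their names'
               encoding/hash */
            build_entry_key_hashed(dir, &file, &k);
    }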
28751diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.22/fs/reiser4/plugin/dir/Makefile
28752--- linux-2.6.22.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 03:00:00.000000000 +0300
28753+++ linux-2.6.22/fs/reiser4/plugin/dir/Makefile 2007-07-29 00:25:34.900702689 +0400
28754@@ -0,0 +1,5 @@
28755+obj-$(CONFIG_REISER4_FS) += dir_plugins.o
28756+
28757+dir_plugins-objs := \
28758+ hashed_dir.o \
28759+ seekable_dir.o
28760diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.22/fs/reiser4/plugin/dir/seekable_dir.c
28761--- linux-2.6.22.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 03:00:00.000000000 +0300
28762+++ linux-2.6.22/fs/reiser4/plugin/dir/seekable_dir.c 2007-07-29 00:25:34.904703724 +0400
28763@@ -0,0 +1,46 @@
28764+/* Copyright 2005 by Hans Reiser, licensing governed by
28765+ * reiser4/README */
28766+
28767+#include "../../inode.h"
28768+
28769+/* this is implementation of build_entry_key method of dir
28770+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
28771+ This is for directories where we want repeatable and restartable readdir()
28772+ even in the case of a 32-bit user-level struct dirent (readdir(3)).
28773+*/
28774+void
28775+build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
28776+ reiser4_key * result)
28777+{
28778+ oid_t objectid;
28779+
28780+ assert("nikita-2283", dir != NULL);
28781+ assert("nikita-2284", name != NULL);
28782+ assert("nikita-2285", name->name != NULL);
28783+ assert("nikita-2286", result != NULL);
28784+
28785+ reiser4_key_init(result);
28786+ /* locality of directory entry's key is objectid of parent
28787+ directory */
28788+ set_key_locality(result, get_inode_oid(dir));
28789+ /* minor packing locality is constant */
28790+ set_key_type(result, KEY_FILE_NAME_MINOR);
28791+ /* dot is a special case---we always want it to be the first
28792+ entry in a directory; actually, we just want it to have the
28793+ smallest directory entry key.
28794+ */
28795+ if ((name->len == 1) && (name->name[0] == '.'))
28796+ return;
28797+
28798+ /* objectid of key is 31 lowest bits of hash. */
28799+ objectid =
28800+ inode_hash_plugin(dir)->hash(name->name,
28801+ (int)name->len) & 0x7fffffff;
28802+
28803+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
28804+ set_key_objectid(result, objectid);
28805+
28806+ /* offset is always 0. */
28807+ set_key_offset(result, (__u64) 0);
28808+ return;
28809+}
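
The 31-bit mask is presumably what makes the scheme hold up for the 32-bit
readdir() users mentioned above: the hash-derived objectid feeds the
readdir position, and clearing bit 31 keeps it representable as a positive
32-bit offset. A minimal sketch (helper name is hypothetical):

    static oid_t seekable_objectid_sketch(__u64 hash)
    {
            /* keep the 31 low bits, e.g. 0xdeadbeef -> 0x5eadbeef */
            return (oid_t)(hash & 0x7fffffff);
    }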
28810diff -urN linux-2.6.22.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.22/fs/reiser4/plugin/dir_plugin_common.c
28811--- linux-2.6.22.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
28812+++ linux-2.6.22/fs/reiser4/plugin/dir_plugin_common.c 2007-07-29 00:25:34.904703724 +0400
28813@@ -0,0 +1,872 @@
28814+/* Copyright 2005 by Hans Reiser, licensing governed by
28815+ reiser4/README */
28816+
28817+/* this file contains typical implementations for most of methods of
28818+ directory plugin
28819+*/
28820+
28821+#include "../inode.h"
28822+
28823+int reiser4_find_entry(struct inode *dir, struct dentry *name,
28824+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
28825+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
28826+void check_light_weight(struct inode *inode, struct inode *parent);
28827+
28828+/* this is common implementation of get_parent method of dir plugin
28829+ this is used by NFS kernel server to "climb" up directory tree to
28830+ check permissions
28831+ */
28832+struct dentry *get_parent_common(struct inode *child)
28833+{
28834+ struct super_block *s;
28835+ struct inode *parent;
28836+ struct dentry dotdot;
28837+ struct dentry *dentry;
28838+ reiser4_key key;
28839+ int result;
28840+
28841+ /*
28842+ * lookup dotdot entry.
28843+ */
28844+
28845+ s = child->i_sb;
28846+ memset(&dotdot, 0, sizeof(dotdot));
28847+ dotdot.d_name.name = "..";
28848+ dotdot.d_name.len = 2;
28849+ dotdot.d_op = &get_super_private(s)->ops.dentry;
28850+
28851+ result = reiser4_lookup_name(child, &dotdot, &key);
28852+ if (result != 0)
28853+ return ERR_PTR(result);
28854+
28855+ parent = reiser4_iget(s, &key, 1);
28856+ if (!IS_ERR(parent)) {
28857+ /*
28858+ * FIXME-NIKITA dubious: attributes are inherited from @child
28859+ * to @parent. But:
28860+ *
28861+ * (*) this is the only thing we can do
28862+ *
28863+ * (*) attributes of light-weight object are inherited
28864+ * from a parent through which object was looked up first,
28865+ * so it is ambiguous anyway.
28866+ *
28867+ */
28868+ check_light_weight(parent, child);
28869+ reiser4_iget_complete(parent);
28870+ dentry = d_alloc_anon(parent);
28871+ if (dentry == NULL) {
28872+ iput(parent);
28873+ dentry = ERR_PTR(RETERR(-ENOMEM));
28874+ } else
28875+ dentry->d_op = &get_super_private(s)->ops.dentry;
28876+ } else if (PTR_ERR(parent) == -ENOENT)
28877+ dentry = ERR_PTR(RETERR(-ESTALE));
28878+ else
28879+ dentry = (void *)parent;
28880+ return dentry;
28881+}
28882+
28883+/* this is common implementation of is_name_acceptable method of dir
28884+ plugin
28885+ */
28886+int is_name_acceptable_common(const struct inode *inode, /* directory to check */
28887+ const char *name UNUSED_ARG, /* name to check */
28888+ int len /* @name's length */ )
28889+{
28890+ assert("nikita-733", inode != NULL);
28891+ assert("nikita-734", name != NULL);
28892+ assert("nikita-735", len > 0);
28893+
28894+ return len <= reiser4_max_filename_len(inode);
28895+}
28896+
28897+/* there is no common implementation of build_entry_key method of dir
28898+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
28899+ plugin/dir/seekable_dir.c:build_entry_key_seekable() for example
28900+*/
28901+
28902+/* this is common implementation of build_readdir_key method of dir
28903+ plugin
28904+ see reiser4_readdir_common for more details
28905+*/
28906+int build_readdir_key_common(struct file *dir /* directory being read */ ,
28907+ reiser4_key * result /* where to store key */ )
28908+{
28909+ reiser4_file_fsdata *fdata;
28910+ struct inode *inode;
28911+
28912+ assert("nikita-1361", dir != NULL);
28913+ assert("nikita-1362", result != NULL);
28914+ assert("nikita-1363", dir->f_dentry != NULL);
28915+ inode = dir->f_dentry->d_inode;
28916+ assert("nikita-1373", inode != NULL);
28917+
28918+ fdata = reiser4_get_file_fsdata(dir);
28919+ if (IS_ERR(fdata))
28920+ return PTR_ERR(fdata);
28921+ assert("nikita-1364", fdata != NULL);
28922+ return extract_key_from_de_id(get_inode_oid(inode),
28923+ &fdata->dir.readdir.position.
28924+ dir_entry_key, result);
28925+
28926+}
28927+
28928+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
28929+ int adj);
28930+
28931+/* this is common implementation of add_entry method of dir plugin
28932+*/
28933+int reiser4_add_entry_common(struct inode *object, /* directory to add new name
28934+ * in */
28935+ struct dentry *where, /* new name */
28936+ reiser4_object_create_data * data, /* parameters of
28937+ * new object */
28938+ reiser4_dir_entry_desc * entry /* parameters of
28939+ * new directory
28940+ * entry */)
44254afd
MT
28941+{
28942+ int result;
28943+ coord_t *coord;
28944+ lock_handle lh;
28945+ struct reiser4_dentry_fsdata *fsdata;
28946+ reiser4_block_nr reserve;
28947+
28948+ assert("nikita-1114", object != NULL);
28949+ assert("nikita-1250", where != NULL);
28950+
28951+ fsdata = reiser4_get_dentry_fsdata(where);
28952+ if (unlikely(IS_ERR(fsdata)))
28953+ return PTR_ERR(fsdata);
28954+
28955+ reserve = inode_dir_plugin(object)->estimate.add_entry(object);
28956+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
28957+ return RETERR(-ENOSPC);
28958+
28959+ init_lh(&lh);
28960+ coord = &fsdata->dec.entry_coord;
28961+ coord_clear_iplug(coord);
28962+
28963+ /* check for this entry in a directory. This is plugin method. */
28964+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
28965+ entry);
44254afd
MT
28966+ if (likely(result == -ENOENT)) {
28967+ /* add new entry. Just pass control to the directory
28968+ item plugin. */
28969+ assert("nikita-1709", inode_dir_item_plugin(object));
28970+ assert("nikita-2230", coord->node == lh.node);
28971+ reiser4_seal_done(&fsdata->dec.entry_seal);
28972+ result =
28973+ inode_dir_item_plugin(object)->s.dir.add_entry(object,
28974+ coord, &lh,
28975+ where,
28976+ entry);
28977+ if (result == 0) {
28978+ reiser4_adjust_dir_file(object, where,
28979+ fsdata->dec.pos + 1, +1);
28980+ INODE_INC_FIELD(object, i_size);
28981+ }
28982+ } else if (result == 0) {
28983+ assert("nikita-2232", coord->node == lh.node);
28984+ result = RETERR(-EEXIST);
28985+ }
28986+ done_lh(&lh);
28987+
28988+ return result;
28989+}
28990+
28991+/**
28992+ * rem_entry - remove entry from directory item
28993+ * @dir:
28994+ * @dentry:
28995+ * @entry:
28996+ * @coord:
28997+ * @lh:
28998+ *
28999+ * Checks that coordinate @coord is set properly and calls item plugin
29000+ * method to cut entry.
29001+ */
29002+static int
29003+rem_entry(struct inode *dir, struct dentry *dentry,
29004+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
29005+{
29006+ item_plugin *iplug;
29007+ struct inode *child;
29008+
29009+ iplug = inode_dir_item_plugin(dir);
29010+ child = dentry->d_inode;
29011+ assert("nikita-3399", child != NULL);
29012+
29013+ /* check that we are really destroying an entry for @child */
29014+ if (REISER4_DEBUG) {
29015+ int result;
29016+ reiser4_key key;
29017+
29018+ result = iplug->s.dir.extract_key(coord, &key);
29019+ if (result != 0)
29020+ return result;
29021+ if (get_key_objectid(&key) != get_inode_oid(child)) {
29022+ warning("nikita-3397",
29023+ "rem_entry: %#llx != %#llx\n",
29024+ get_key_objectid(&key),
29025+ (unsigned long long)get_inode_oid(child));
29026+ return RETERR(-EIO);
29027+ }
29028+ }
29029+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
29030+}
29031+
29032+/**
29033+ * reiser4_rem_entry_common - remove entry from a directory
29034+ * @dir: directory to remove entry from
29035+ * @dentry: name that is being removed
29036+ * @entry: description of entry being removed
29037+ *
29038+ * This is common implementation of rem_entry method of dir plugin.
29039+ */
29040+int reiser4_rem_entry_common(struct inode *dir,
29041+ struct dentry *dentry,
29042+ reiser4_dir_entry_desc *entry)
29043+{
29044+ int result;
29045+ coord_t *coord;
29046+ lock_handle lh;
29047+ struct reiser4_dentry_fsdata *fsdata;
29048+ __u64 tograb;
29049+
29050+ assert("nikita-1124", dir != NULL);
29051+ assert("nikita-1125", dentry != NULL);
29052+
29053+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
29054+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
29055+ if (result != 0)
29056+ return RETERR(-ENOSPC);
29057+
29058+ init_lh(&lh);
29059+
29060+ /* check for this entry in a directory. This is plugin method. */
29061+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
29062+ fsdata = reiser4_get_dentry_fsdata(dentry);
29063+ if (IS_ERR(fsdata)) {
29064+ done_lh(&lh);
29065+ return PTR_ERR(fsdata);
29066+ }
29067+
29068+ coord = &fsdata->dec.entry_coord;
29069+
29070+ assert("nikita-3404",
29071+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
29072+ dir->i_size <= 1);
29073+
29074+ coord_clear_iplug(coord);
29075+ if (result == 0) {
29076+ /* remove entry. Just pass control to the directory item
29077+ plugin. */
29078+ assert("vs-542", inode_dir_item_plugin(dir));
29079+ reiser4_seal_done(&fsdata->dec.entry_seal);
29080+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
29081+ result =
29082+ WITH_COORD(coord,
29083+ rem_entry(dir, dentry, entry, coord, &lh));
29084+ if (result == 0) {
29085+ if (dir->i_size >= 1)
29086+ INODE_DEC_FIELD(dir, i_size);
29087+ else {
29088+ warning("nikita-2509", "Dir %llu is runt",
29089+ (unsigned long long)
29090+ get_inode_oid(dir));
29091+ result = RETERR(-EIO);
29092+ }
29093+
29094+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
29095+ dentry->d_inode->i_size != 2 ||
29096+ inode_dir_plugin(dentry->d_inode) == NULL);
29097+ }
29098+ }
29099+ done_lh(&lh);
29100+
29101+ return result;
29102+}
29103+
29104+static reiser4_block_nr estimate_init(struct inode *parent,
29105+ struct inode *object);
29106+static int create_dot_dotdot(struct inode *object, struct inode *parent);
29107+
29108+/* this is common implementation of init method of dir plugin
29109+ create "." and ".." entries
29110+*/
29111+int reiser4_dir_init_common(struct inode *object, /* new directory */
29112+ struct inode *parent, /* parent directory */
29113+ reiser4_object_create_data * data /* info passed
29114+ * to us, this
29115+ * is filled by
29116+ * reiser4()
29117+ * syscall in
29118+ * particular */)
29119+{
29120+ reiser4_block_nr reserve;
29121+
29122+ assert("nikita-680", object != NULL);
29123+ assert("nikita-681", S_ISDIR(object->i_mode));
29124+ assert("nikita-682", parent != NULL);
29125+ assert("nikita-684", data != NULL);
29126+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29127+ assert("nikita-687", object->i_mode & S_IFDIR);
29128+
29129+ reserve = estimate_init(parent, object);
29130+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29131+ return RETERR(-ENOSPC);
29132+
29133+ return create_dot_dotdot(object, parent);
29134+}
29135+
29136+/* this is common implementation of done method of dir plugin
29137+ remove "." entry
29138+*/
29139+int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
29140+{
29141+ int result;
29142+ reiser4_block_nr reserve;
29143+ struct dentry goodby_dots;
29144+ reiser4_dir_entry_desc entry;
29145+
29146+ assert("nikita-1449", object != NULL);
29147+
29148+ if (reiser4_inode_get_flag(object, REISER4_NO_SD))
29149+ return 0;
29150+
29151+ /* of course, this can be rewritten to sweep everything in one
29152+ reiser4_cut_tree(). */
29153+ memset(&entry, 0, sizeof entry);
29154+
29155+ /* FIXME: this done method is called from reiser4_delete_dir_common which
29156+ * reserved space already */
29157+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29158+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29159+ return RETERR(-ENOSPC);
29160+
29161+ memset(&goodby_dots, 0, sizeof goodby_dots);
29162+ entry.obj = goodby_dots.d_inode = object;
29163+ goodby_dots.d_name.name = ".";
29164+ goodby_dots.d_name.len = 1;
29165+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29166+ reiser4_free_dentry_fsdata(&goodby_dots);
29167+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29168+ /* only worth a warning
29169+
29170+ "values of \ eB\ f will give rise to dom!\n"
29171+ -- v6src/s2/mv.c:89
29172+ */
29173+ warning("nikita-2252", "Cannot remove dot of %lli: %i",
29174+ (unsigned long long)get_inode_oid(object), result);
29175+ return 0;
29176+}
29177+
29178+/* this is common implementation of attach method of dir plugin
29179+*/
29180+int reiser4_attach_common(struct inode *child UNUSED_ARG,
29181+ struct inode *parent UNUSED_ARG)
29182+{
29183+ assert("nikita-2647", child != NULL);
29184+ assert("nikita-2648", parent != NULL);
29185+
29186+ return 0;
29187+}
29188+
29189+/* this is common implementation of detach method of dir plugin
29190+ remove "..", decrease nlink on parent
29191+*/
29192+int reiser4_detach_common(struct inode *object, struct inode *parent)
29193+{
29194+ int result;
29195+ struct dentry goodby_dots;
29196+ reiser4_dir_entry_desc entry;
29197+
29198+ assert("nikita-2885", object != NULL);
29199+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
29200+
29201+ memset(&entry, 0, sizeof entry);
29202+
29203+ /* NOTE-NIKITA this only works if @parent is -the- parent of
29204+ @object, viz. object whose key is stored in dotdot
29205+ entry. Wouldn't work with hard-links on directories. */
29206+ memset(&goodby_dots, 0, sizeof goodby_dots);
29207+ entry.obj = goodby_dots.d_inode = parent;
29208+ goodby_dots.d_name.name = "..";
29209+ goodby_dots.d_name.len = 2;
29210+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29211+ reiser4_free_dentry_fsdata(&goodby_dots);
29212+ if (result == 0) {
29213+ /* the dot should be the only entry remaining at this time... */
29214+ assert("nikita-3400",
29215+ object->i_size == 1 && object->i_nlink <= 2);
29216+#if 0
29217+ /* and, together with the only name directory can have, they
29218+ * provides for the last 2 remaining references. If we get
29219+ * here as part of error handling during mkdir, @object
29220+ * possibly has no name yet, so its nlink == 1. If we get here
29221+ * from rename (targeting empty directory), it has no name
29222+ * already, so its nlink == 1. */
29223+ assert("nikita-3401",
29224+ object->i_nlink == 2 || object->i_nlink == 1);
29225+#endif
29226+
29227+ /* decrement nlink of directory removed ".." pointed
29228+ to */
29229+ reiser4_del_nlink(parent, NULL, 0);
29230+ }
29231+ return result;
29232+}
29233+
29234+/* this is common implementation of estimate.add_entry method of
29235+ dir plugin
29236+ estimation of adding entry which supposes that entry is inserting a
29237+ unit into item
29238+*/
29239+reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29240+{
29241+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
29242+}
29243+
29244+/* this is common implementation of estimate.rem_entry method of dir
29245+ plugin
29246+*/
29247+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29248+{
29249+ return estimate_one_item_removal(reiser4_tree_by_inode(inode));
29250+}
29251+
29252+/* this is common implementation of estimate.unlink method of dir
29253+ plugin
29254+*/
29255+reiser4_block_nr
29256+dir_estimate_unlink_common(const struct inode * parent,
29257+ const struct inode * object)
29258+{
29259+ reiser4_block_nr res;
29260+
29261+ /* hashed_rem_entry(object) */
29262+ res = inode_dir_plugin(object)->estimate.rem_entry(object);
29263+ /* del_nlink(parent) */
29264+ res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29265+
29266+ return res;
29267+}
29268+
29269+/*
29270+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29271+ * methods: if @inode is a light-weight file, setup its credentials
29272+ * that are not stored in the stat-data in this case
29273+ */
29274+void check_light_weight(struct inode *inode, struct inode *parent)
29275+{
29276+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29277+ inode->i_uid = parent->i_uid;
29278+ inode->i_gid = parent->i_gid;
29279+ /* clear light-weight flag. If inode would be read by any
29280+ other name, [ug]id wouldn't change. */
29281+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29282+ }
29283+}
29284+
29285+/* looks for name specified in @dentry in directory @parent and if name is
29286+ found - key of object found entry points to is stored in @entry->key */
29287+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
29288+ * name in */
29289+ struct dentry *dentry, /* name to look for */
29290+ reiser4_key * key /* place to store key */ )
29291+{
29292+ int result;
29293+ coord_t *coord;
29294+ lock_handle lh;
29295+ const char *name;
29296+ int len;
29297+ reiser4_dir_entry_desc entry;
29298+ struct reiser4_dentry_fsdata *fsdata;
29299+
29300+ assert("nikita-1247", parent != NULL);
29301+ assert("nikita-1248", dentry != NULL);
29302+ assert("nikita-1123", dentry->d_name.name != NULL);
29303+ assert("vs-1486",
29304+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29305+
29306+ name = dentry->d_name.name;
29307+ len = dentry->d_name.len;
29308+
29309+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29310+ /* some arbitrary error code to return */
29311+ return RETERR(-ENAMETOOLONG);
29312+
29313+ fsdata = reiser4_get_dentry_fsdata(dentry);
29314+ if (IS_ERR(fsdata))
29315+ return PTR_ERR(fsdata);
29316+
29317+ coord = &fsdata->dec.entry_coord;
29318+ coord_clear_iplug(coord);
29319+ init_lh(&lh);
29320+
29321+ /* find entry in a directory. This is plugin method. */
29322+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
29323+ &entry);
29324+ if (result == 0) {
29325+ /* entry was found, extract object key from it. */
29326+ result =
29327+ WITH_COORD(coord,
29328+ item_plugin_by_coord(coord)->s.dir.
29329+ extract_key(coord, key));
29330+ }
29331+ done_lh(&lh);
29332+ return result;
29333+
29334+}
29335+
29336+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
29337+static reiser4_block_nr
29338+estimate_init(struct inode *parent, struct inode *object)
29339+{
29340+ reiser4_block_nr res = 0;
29341+
29342+ assert("vpf-321", parent != NULL);
29343+ assert("vpf-322", object != NULL);
29344+
29345+ /* hashed_add_entry(object) */
29346+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29347+ /* reiser4_add_nlink(object) */
29348+ res += inode_file_plugin(object)->estimate.update(object);
29349+ /* hashed_add_entry(object) */
29350+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29351+ /* reiser4_add_nlink(parent) */
29352+ res += inode_file_plugin(parent)->estimate.update(parent);
29353+
29354+	return res;
29355+}
29356+
29357+/* helper function for reiser4_dir_init_common(). Create "." and ".." */
29358+static int create_dot_dotdot(struct inode *object /* object to create dot and
29359+ * dotdot for */ ,
29360+ struct inode *parent /* parent of @object */)
29361+{
29362+ int result;
29363+ struct dentry dots_entry;
29364+ reiser4_dir_entry_desc entry;
29365+
29366+ assert("nikita-688", object != NULL);
29367+ assert("nikita-689", S_ISDIR(object->i_mode));
29368+ assert("nikita-691", parent != NULL);
29369+
29370+	/* We store dot and dotdot as normal directory entries. This is
29371+	   not necessary, because almost all information stored in them
29372+	   is already in the stat-data of the directory; the only thing
29373+	   missing is the objectid of the grand-parent directory, which
29374+	   could easily be added there as an extension.
29375+
29376+	   But it is done the way it is done, because not storing dot
29377+	   and dotdot would lead to the following complications:
29378+
29379+ . special case handling in ->lookup().
29380+ . addition of another extension to the sd.
29381+ . dependency on key allocation policy for stat data.
29382+
29383+ */
29384+
29385+ memset(&entry, 0, sizeof entry);
29386+ memset(&dots_entry, 0, sizeof dots_entry);
29387+ entry.obj = dots_entry.d_inode = object;
29388+ dots_entry.d_name.name = ".";
29389+ dots_entry.d_name.len = 1;
29390+	result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
29391+ reiser4_free_dentry_fsdata(&dots_entry);
29392+
29393+ if (result == 0) {
29394+ result = reiser4_add_nlink(object, object, 0);
29395+ if (result == 0) {
29396+ entry.obj = dots_entry.d_inode = parent;
29397+ dots_entry.d_name.name = "..";
29398+ dots_entry.d_name.len = 2;
29399+			result = reiser4_add_entry_common(object,
29400+ &dots_entry, NULL, &entry);
29401+ reiser4_free_dentry_fsdata(&dots_entry);
29402+ /* if creation of ".." failed, iput() will delete
29403+ object with ".". */
29404+ if (result == 0) {
29405+ result = reiser4_add_nlink(parent, object, 0);
29406+ if (result != 0)
29407+ /*
29408+ * if we failed to bump i_nlink, try
29409+ * to remove ".."
29410+ */
29411+				reiser4_detach_common(object, parent);
29412+ }
29413+ }
29414+ }
29415+
29416+ if (result != 0) {
29417+ /*
29418+		 * in the case of error, at least update the stat-data so
29419+		 * that ->i_nlink updates are not left lingering.
29420+ */
29421+ reiser4_update_sd(object);
29422+ reiser4_update_sd(parent);
29423+ }
29424+
29425+ return result;
29426+}
29427+
29428+/*
29429+ * return 0 iff @coord contains a directory entry for the file with the name
29430+ * @name.
29431+ */
29432+static int
29433+check_item(const struct inode *dir, const coord_t * coord, const char *name)
29434+{
29435+ item_plugin *iplug;
29436+ char buf[DE_NAME_BUF_LEN];
29437+
29438+ iplug = item_plugin_by_coord(coord);
29439+ if (iplug == NULL) {
29440+ warning("nikita-1135", "Cannot get item plugin");
29441+ print_coord("coord", coord, 1);
29442+ return RETERR(-EIO);
29443+ } else if (item_id_by_coord(coord) !=
29444+ item_id_by_plugin(inode_dir_item_plugin(dir))) {
29445+ /* item id of current item does not match to id of items a
29446+ directory is built of */
29447+ warning("nikita-1136", "Wrong item plugin");
29448+ print_coord("coord", coord, 1);
29449+ return RETERR(-EIO);
29450+ }
29451+ assert("nikita-1137", iplug->s.dir.extract_name);
29452+
29453+ /* Compare name stored in this entry with name we are looking for.
29454+
29455+ NOTE-NIKITA Here should go code for support of something like
29456+ unicode, code tables, etc.
29457+ */
29458+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
29459+}
29460+
29461+static int
29462+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
29463+{
29464+ return WITH_COORD(coord, check_item(dir, coord, name->name));
29465+}
29466+
29467+/*
29468+ * argument package used by entry_actor to scan entries with identical keys.
29469+ */
29470+struct entry_actor_args {
29471+ /* name we are looking for */
29472+ const char *name;
29473+ /* key of directory entry. entry_actor() scans through sequence of
29474+ * items/units having the same key */
29475+ reiser4_key *key;
29476+	/* how many entries with a duplicate key were scanned so far. */
29477+ int non_uniq;
29478+#if REISER4_USE_COLLISION_LIMIT
29479+ /* scan limit */
29480+ int max_non_uniq;
29481+#endif
29482+ /* return parameter: set to true, if ->name wasn't found */
29483+ int not_found;
29484+ /* what type of lock to take when moving to the next node during
29485+ * scan */
29486+ znode_lock_mode mode;
29487+
29488+ /* last coord that was visited during scan */
29489+ coord_t last_coord;
29490+ /* last node locked during scan */
29491+ lock_handle last_lh;
29492+ /* inode of directory */
29493+ const struct inode *inode;
29494+};
29495+
29496+/* Function called by reiser4_find_entry() to look for given name
29497+ in the directory. */
44254afd
MT
29498+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
29499+ coord_t * coord /* current coord */ ,
29500+ lock_handle * lh /* current lock handle */ ,
29501+ void *entry_actor_arg /* argument to scan */ )
29502+{
29503+ reiser4_key unit_key;
29504+	struct entry_actor_args *args;
29505+
29506+ assert("nikita-1131", tree != NULL);
29507+ assert("nikita-1132", coord != NULL);
29508+ assert("nikita-1133", entry_actor_arg != NULL);
29509+
29510+ args = entry_actor_arg;
29511+ ++args->non_uniq;
29512+#if REISER4_USE_COLLISION_LIMIT
29513+ if (args->non_uniq > args->max_non_uniq) {
29514+ args->not_found = 1;
29515+ /* hash collision overflow. */
29516+ return RETERR(-EBUSY);
29517+ }
29518+#endif
29519+
29520+ /*
29521+ * did we just reach the end of the sequence of items/units with
29522+ * identical keys?
29523+ */
29524+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
29525+ assert("nikita-1791",
29526+ keylt(args->key, unit_key_by_coord(coord, &unit_key)));
29527+ args->not_found = 1;
29528+ args->last_coord.between = AFTER_UNIT;
29529+ return 0;
29530+ }
29531+
29532+ coord_dup(&args->last_coord, coord);
29533+ /*
29534+	 * did the scan just move to the next node?
29535+ */
29536+ if (args->last_lh.node != lh->node) {
29537+ int lock_result;
29538+
29539+ /*
29540+ * if so, lock new node with the mode requested by the caller
29541+ */
29542+ done_lh(&args->last_lh);
29543+ assert("nikita-1896", znode_is_any_locked(lh->node));
29544+ lock_result = longterm_lock_znode(&args->last_lh, lh->node,
29545+ args->mode, ZNODE_LOCK_HIPRI);
29546+ if (lock_result != 0)
29547+ return lock_result;
29548+ }
29549+ return check_item(args->inode, coord, args->name);
29550+}
29551+
29552+/* Look for given @name within directory @dir.
29553+
29554+ This is called during lookup, creation and removal of directory
29555+   entries, and by reiser4_rename_common().
29556+
29557+   First calculate the key that the directory entry for @name would have.
29558+   Search for this key in the tree. If such a key is found, scan all items
29559+   with the same key, checking the name in each directory entry along the way.
29560+*/
29561+int reiser4_find_entry(struct inode *dir, /* directory to scan */
29562+ struct dentry *de, /* name to search for */
29563+ lock_handle * lh, /* resulting lock handle */
29564+ znode_lock_mode mode, /* required lock mode */
29565+ reiser4_dir_entry_desc * entry /* parameters of found
29566+ directory entry */)
29567+{
29568+ const struct qstr *name;
29569+ seal_t *seal;
29570+ coord_t *coord;
29571+ int result;
29572+ __u32 flags;
29573+ struct de_location *dec;
29574+ struct reiser4_dentry_fsdata *fsdata;
29575+
29576+ assert("nikita-1130", lh != NULL);
29577+ assert("nikita-1128", dir != NULL);
29578+
29579+ name = &de->d_name;
29580+ assert("nikita-1129", name != NULL);
29581+
29582+	/* dentry private data doesn't require a lock, because dentry
29583+	   manipulations are protected by i_mutex on the parent.
29584+
29585+	   This is not so for inodes, because an inode has no single
29586+	   parent.
29587+	 */
29588+ fsdata = reiser4_get_dentry_fsdata(de);
29589+ if (IS_ERR(fsdata))
29590+ return PTR_ERR(fsdata);
29591+ dec = &fsdata->dec;
29592+
29593+ coord = &dec->entry_coord;
29594+ coord_clear_iplug(coord);
29595+ seal = &dec->entry_seal;
29596+ /* compose key of directory entry for @name */
29597+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
29598+
29599+	if (reiser4_seal_is_set(seal)) {
29600+		/* check seal */
29601+ result = reiser4_seal_validate(seal, coord, &entry->key,
29602+ lh, mode, ZNODE_LOCK_LOPRI);
29603+ if (result == 0) {
29604+			/* key was found. Check that it is really the item
29605+			   we are looking for. */
29606+ result = check_entry(dir, coord, name);
29607+ if (result == 0)
29608+ return 0;
29609+ }
29610+ }
29611+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
29612+ /*
29613+ * find place in the tree where directory item should be located.
29614+ */
29615+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
29616+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
29617+ flags, NULL /*ra_info */ );
29618+	if (result == CBK_COORD_FOUND) {
29619+		struct entry_actor_args arg;
29620+
29621+ /* fast path: no hash collisions */
29622+ result = check_entry(dir, coord, name);
29623+ if (result == 0) {
29624+			reiser4_seal_init(seal, coord, &entry->key);
29625+ dec->pos = 0;
29626+ } else if (result > 0) {
29627+ /* Iterate through all units with the same keys. */
29628+ arg.name = name->name;
29629+ arg.key = &entry->key;
29630+ arg.not_found = 0;
29631+ arg.non_uniq = 0;
29632+#if REISER4_USE_COLLISION_LIMIT
29633+ arg.max_non_uniq = max_hash_collisions(dir);
29634+ assert("nikita-2851", arg.max_non_uniq > 1);
29635+#endif
29636+ arg.mode = mode;
29637+ arg.inode = dir;
29638+ coord_init_zero(&arg.last_coord);
29639+ init_lh(&arg.last_lh);
29640+
29641+ result = reiser4_iterate_tree
29642+ (reiser4_tree_by_inode(dir),
29643+ coord, lh,
29644+ entry_actor, &arg, mode, 1);
29645+			/* if the name was not found, or the end of the tree
29646+			   was reached during scanning. */
29647+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
29648+ /* step back */
29649+ done_lh(lh);
29650+
29651+ result = zload(arg.last_coord.node);
29652+ if (result == 0) {
29653+ coord_clear_iplug(&arg.last_coord);
29654+ coord_dup(coord, &arg.last_coord);
29655+ move_lh(lh, &arg.last_lh);
29656+ result = RETERR(-ENOENT);
29657+ zrelse(arg.last_coord.node);
29658+ --arg.non_uniq;
29659+ }
29660+ }
29661+
29662+ done_lh(&arg.last_lh);
29663+ if (result == 0)
29664+				reiser4_seal_init(seal, coord, &entry->key);
29665+
29666+ if (result == 0 || result == -ENOENT) {
29667+ assert("nikita-2580", arg.non_uniq > 0);
29668+ dec->pos = arg.non_uniq - 1;
29669+ }
29670+ }
29671+ } else
29672+ dec->pos = -1;
29673+ return result;
29674+}
29675+
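In outline, reiser4_find_entry() above tries the cached, sealed coord first
and performs the full tree lookup plus collision scan only on a miss. A
simplified sketch of that control flow, reusing the names from the function
(error handling and the duplicate-key scan trimmed):

	/* Sketch of the fast path: revalidate the sealed coord, then check
	 * that it still holds the entry being looked for. */
	if (reiser4_seal_is_set(seal) &&
	    reiser4_seal_validate(seal, coord, &entry->key, lh, mode,
				  ZNODE_LOCK_LOPRI) == 0 &&
	    check_entry(dir, coord, name) == 0)
		return 0;	/* cached position is still valid */

	/* slow path: full lookup, then entry_actor() walks duplicate keys */
	result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
				       FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
				       flags, NULL);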
29676+/*
29677+ Local variables:
29678+ c-indentation-style: "K&R"
29679+ mode-name: "LC"
29680+ c-basic-offset: 8
29681+ tab-width: 8
29682+ fill-column: 120
29683+ scroll-step: 1
29684+ End:
29685+*/
29686diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.c
29687--- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 03:00:00.000000000 +0300
29688+++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.c 2007-07-29 00:25:34.904703724 +0400
29689@@ -0,0 +1,655 @@
29690+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29691+
29692+#include "../../debug.h"
29693+#include "../../dformat.h"
29694+#include "../../key.h"
29695+#include "../node/node.h"
29696+#include "../space/space_allocator.h"
29697+#include "disk_format40.h"
29698+#include "../plugin.h"
29699+#include "../../txnmgr.h"
29700+#include "../../jnode.h"
29701+#include "../../tree.h"
29702+#include "../../super.h"
29703+#include "../../wander.h"
29704+#include "../../inode.h"
29705+#include "../../ktxnmgrd.h"
29706+#include "../../status_flags.h"
29707+
29708+#include <linux/types.h> /* for __u?? */
29709+#include <linux/fs.h> /* for struct super_block */
29710+#include <linux/buffer_head.h>
29711+
29712+/* reiser 4.0 default disk layout */
29713+
29714+/* Number of free blocks needed to perform release_format40 when the fs gets
29715+   mounted RW: 1 for the SB, 1 for non-leaves in the overwrite set, 2 for the
29716+   tx header & tx record. */
29717+#define RELEASE_RESERVED 4
29718+
29719+/* The greatest supported format40 version number */
29720+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
29721+
29722+/* This flag indicates that backup should be updated
29723+ (the update is performed by fsck) */
29724+#define FORMAT40_UPDATE_BACKUP (1 << 31)
29725+
29726+/* functions to access fields of format40_disk_super_block */
29727+static __u64 get_format40_block_count(const format40_disk_super_block * sb)
29728+{
29729+ return le64_to_cpu(get_unaligned(&sb->block_count));
29730+}
29731+
29732+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
29733+{
29734+ return le64_to_cpu(get_unaligned(&sb->free_blocks));
29735+}
29736+
29737+static __u64 get_format40_root_block(const format40_disk_super_block * sb)
29738+{
29739+ return le64_to_cpu(get_unaligned(&sb->root_block));
29740+}
29741+
29742+static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
29743+{
29744+ return le16_to_cpu(get_unaligned(&sb->tree_height));
29745+}
29746+
29747+static __u64 get_format40_file_count(const format40_disk_super_block * sb)
29748+{
29749+ return le64_to_cpu(get_unaligned(&sb->file_count));
29750+}
29751+
29752+static __u64 get_format40_oid(const format40_disk_super_block * sb)
29753+{
29754+ return le64_to_cpu(get_unaligned(&sb->oid));
29755+}
29756+
29757+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
29758+{
29759+ return le32_to_cpu(get_unaligned(&sb->mkfs_id));
29760+}
29761+
29762+static __u64 get_format40_flags(const format40_disk_super_block * sb)
29763+{
29764+ return le64_to_cpu(get_unaligned(&sb->flags));
29765+}
29766+
29767+static __u32 get_format40_version(const format40_disk_super_block * sb)
29768+{
29769+ return le32_to_cpu(get_unaligned(&sb->version)) &
29770+ ~FORMAT40_UPDATE_BACKUP;
29771+}
29772+
29773+static int update_backup_version(const format40_disk_super_block * sb)
29774+{
29775+ return (le32_to_cpu(get_unaligned(&sb->version)) &
29776+ FORMAT40_UPDATE_BACKUP);
29777+}
29778+
29779+static int update_disk_version(const format40_disk_super_block * sb)
29780+{
29781+ return (get_format40_version(sb) < FORMAT40_VERSION);
29782+}
29783+
29784+static int incomplete_compatibility(const format40_disk_super_block * sb)
29785+{
29786+ return (get_format40_version(sb) > FORMAT40_VERSION);
29787+}
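Taken together, these helpers treat the on-disk version word as two packed
fields: bit 31 is the FORMAT40_UPDATE_BACKUP flag and the low 31 bits are the
format version itself. A small decoding sketch (the raw value is illustrative):

	/* Sketch: raw == 0x80000004 would mean format version 4 with the
	 * "backup needs update" bit set. */
	__u32 raw = le32_to_cpu(get_unaligned(&sb->version));
	__u32 version = raw & ~FORMAT40_UPDATE_BACKUP;	/* low 31 bits */
	int backup_stale = (raw & FORMAT40_UPDATE_BACKUP) != 0;	/* bit 31 */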
29788+
29789+static format40_super_info *get_sb_info(struct super_block *super)
29790+{
29791+ return &get_super_private(super)->u.format40;
29792+}
29793+
29794+static int consult_diskmap(struct super_block *s)
29795+{
29796+ format40_super_info *info;
29797+ journal_location *jloc;
29798+
29799+ info = get_sb_info(s);
29800+ jloc = &get_super_private(s)->jloc;
29801+ /* Default format-specific locations, if there is nothing in
29802+ * diskmap */
29803+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
29804+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
29805+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
29806+#ifdef CONFIG_REISER4_BADBLOCKS
29807+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
29808+ &jloc->footer);
29809+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
29810+ &jloc->header);
29811+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
29812+ &info->loc.super);
29813+#endif
29814+ return 0;
29815+}
29816+
29817+/* find any valid super block of disk_format40 (even if the first
29818+   super block is destroyed); will change the block numbers of the actual
29819+   journal header/footer (jh/jf) if needed */
29820+static struct buffer_head *find_a_disk_format40_super_block(struct super_block
29821+ *s)
29822+{
29823+ struct buffer_head *super_bh;
29824+ format40_disk_super_block *disk_sb;
29825+ format40_super_info *info;
29826+
29827+ assert("umka-487", s != NULL);
29828+
29829+ info = get_sb_info(s);
29830+
29831+ super_bh = sb_bread(s, info->loc.super);
29832+ if (super_bh == NULL)
29833+ return ERR_PTR(RETERR(-EIO));
29834+
29835+ disk_sb = (format40_disk_super_block *) super_bh->b_data;
29836+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
29837+ brelse(super_bh);
29838+ return ERR_PTR(RETERR(-EINVAL));
29839+ }
29840+
29841+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
29842+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
29843+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29844+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
29845+
29846+ return super_bh;
29847+}
29848+
29849+/* find the most recent version of the super block. This is called after the
29850+   journal is replayed */
29851+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
29852+{
29853+ /* Here the most recent superblock copy has to be read. However, as
29854+	   journal replay isn't complete yet, we use the
29855+	   find_a_disk_format40_super_block() function. */
29856+ return find_a_disk_format40_super_block(s);
29857+}
29858+
29859+static int get_super_jnode(struct super_block *s)
29860+{
29861+ reiser4_super_info_data *sbinfo = get_super_private(s);
29862+ jnode *sb_jnode;
29863+ int ret;
29864+
29865+	sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
29866+
29867+ ret = jload(sb_jnode);
29868+
29869+ if (ret) {
29870+		reiser4_drop_io_head(sb_jnode);
29871+ return ret;
29872+ }
29873+
29874+ pin_jnode_data(sb_jnode);
29875+ jrelse(sb_jnode);
29876+
29877+ sbinfo->u.format40.sb_jnode = sb_jnode;
29878+
29879+ return 0;
29880+}
29881+
29882+static void done_super_jnode(struct super_block *s)
29883+{
29884+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
29885+
29886+ if (sb_jnode) {
29887+ unpin_jnode_data(sb_jnode);
29888+		reiser4_drop_io_head(sb_jnode);
29889+ }
29890+}
29891+
29892+typedef enum format40_init_stage {
29893+ NONE_DONE = 0,
29894+ CONSULT_DISKMAP,
29895+ FIND_A_SUPER,
29896+ INIT_JOURNAL_INFO,
29897+ INIT_STATUS,
29898+ JOURNAL_REPLAY,
29899+ READ_SUPER,
29900+ KEY_CHECK,
29901+ INIT_OID,
29902+ INIT_TREE,
29903+ JOURNAL_RECOVER,
29904+ INIT_SA,
29905+ INIT_JNODE,
29906+ ALL_DONE
29907+} format40_init_stage;
29908+
29909+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
29910+{
29911+ format40_disk_super_block *sb_copy;
29912+
29913+ sb_copy = kmalloc(sizeof(format40_disk_super_block),
29914+ reiser4_ctx_gfp_mask_get());
29915+ if (sb_copy == NULL)
29916+ return ERR_PTR(RETERR(-ENOMEM));
29917+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
29918+ sizeof(format40_disk_super_block));
29919+ return sb_copy;
29920+}
29921+
29922+static int check_key_format(const format40_disk_super_block *sb_copy)
29923+{
29924+ if (!equi(REISER4_LARGE_KEY,
29925+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
29926+ warning("nikita-3228", "Key format mismatch. "
29927+ "Only %s keys are supported.",
29928+ REISER4_LARGE_KEY ? "large" : "small");
29929+ return RETERR(-EINVAL);
29930+ }
29931+ return 0;
29932+}
29933+
29934+/**
29935+ * try_init_format40
29936+ * @super: super block of the filesystem being mounted
29937+ * @stage: return parameter: the initialization stage reached
29938+ *
29939+ */
29940+static int try_init_format40(struct super_block *super,
29941+ format40_init_stage *stage)
29942+{
29943+ int result;
29944+ struct buffer_head *super_bh;
29945+ reiser4_super_info_data *sbinfo;
29946+ format40_disk_super_block *sb_copy;
29947+ tree_level height;
29948+ reiser4_block_nr root_block;
29949+ node_plugin *nplug;
29950+
29951+ assert("vs-475", super != NULL);
29952+ assert("vs-474", get_super_private(super));
29953+
29954+ *stage = NONE_DONE;
29955+
29956+ result = consult_diskmap(super);
29957+ if (result)
29958+ return result;
29959+ *stage = CONSULT_DISKMAP;
29960+
29961+ super_bh = find_a_disk_format40_super_block(super);
29962+ if (IS_ERR(super_bh))
29963+ return PTR_ERR(super_bh);
29964+ brelse(super_bh);
29965+ *stage = FIND_A_SUPER;
29966+
29967+ /* ok, we are sure that filesystem format is a format40 format */
29968+
29969+	/* map jnodes for journal control blocks (header, footer) to disk */
29970+	result = reiser4_init_journal_info(super);
29971+ if (result)
29972+ return result;
29973+ *stage = INIT_JOURNAL_INFO;
29974+
29975+
29976+	/* Now check its state */
29977+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
29978+ if (result != 0 && result != -EINVAL)
29979+ /* -EINVAL means there is no magic, so probably just old
29980+ * fs. */
29981+ return result;
29982+ *stage = INIT_STATUS;
29983+
29984+ result = reiser4_status_query(NULL, NULL);
29985+ if (result == REISER4_STATUS_MOUNT_WARN)
29986+ notice("vpf-1363", "Warning: mounting %s with errors.",
29987+ super->s_id);
29988+ if (result == REISER4_STATUS_MOUNT_RO)
29989+ notice("vpf-1364", "Warning: mounting %s with fatal errors,"
29990+ " forcing read-only mount.", super->s_id);
29991+ result = reiser4_journal_replay(super);
29992+ if (result)
29993+ return result;
29994+ *stage = JOURNAL_REPLAY;
29995+
29996+ super_bh = read_super_block(super);
29997+ if (IS_ERR(super_bh))
29998+ return PTR_ERR(super_bh);
29999+ *stage = READ_SUPER;
30000+
30001+ /* allocate and make a copy of format40_disk_super_block */
30002+ sb_copy = copy_sb(super_bh);
30003+ brelse(super_bh);
30004+
30005+ if (IS_ERR(sb_copy))
30006+ return PTR_ERR(sb_copy);
30007+ printk("reiser4: %s: found disk format 4.0.%u.\n",
30008+ super->s_id,
30009+ get_format40_version(sb_copy));
30010+ if (incomplete_compatibility(sb_copy))
30011+ printk("reiser4: Warning: The last completely supported "
30012+ "version of disk format40 is %u. Some objects of "
30013+		       "the semantic tree may be inaccessible.\n",
30014+ FORMAT40_VERSION);
30015+ /* make sure that key format of kernel and filesystem match */
30016+ result = check_key_format(sb_copy);
30017+ if (result) {
30018+ kfree(sb_copy);
30019+ return result;
30020+ }
30021+ *stage = KEY_CHECK;
30022+
30023+ result = oid_init_allocator(super, get_format40_file_count(sb_copy),
30024+ get_format40_oid(sb_copy));
30025+ if (result) {
30026+ kfree(sb_copy);
30027+ return result;
30028+ }
30029+ *stage = INIT_OID;
30030+
30031+ /* get things necessary to init reiser4_tree */
30032+ root_block = get_format40_root_block(sb_copy);
30033+ height = get_format40_tree_height(sb_copy);
30034+ nplug = node_plugin_by_id(NODE40_ID);
30035+
30036+ /* initialize reiser4_super_info_data */
30037+ sbinfo = get_super_private(super);
30038+ assert("", sbinfo->tree.super == super);
30039+ /* init reiser4_tree for the filesystem */
30040+	result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
30041+ if (result) {
30042+ kfree(sb_copy);
30043+ return result;
30044+ }
30045+ *stage = INIT_TREE;
30046+
30047+ /*
30048+ * initialize reiser4_super_info_data with data from format40 super
30049+ * block
30050+ */
30051+ sbinfo->default_uid = 0;
30052+ sbinfo->default_gid = 0;
30053+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
30054+ /* number of blocks in filesystem and reserved space */
30055+ reiser4_set_block_count(super, get_format40_block_count(sb_copy));
30056+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
30057+	sbinfo->version = get_format40_version(sb_copy);
30058+
30059+	if (update_backup_version(sb_copy))
30060+		printk("reiser4: Warning: metadata backup is not updated. "
30061+		       "Please run 'fsck.reiser4 --fix' on %s.\n",
30062+		       super->s_id);
30063+	kfree(sb_copy);
30064+
30065+ sbinfo->fsuid = 0;
30066+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
30067+ * are not supported */
30068+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
30069+ * layout 40 are
30070+ * of one
30071+ * plugin */
30072+ /* sbinfo->tmgr is initialized already */
30073+
30074+ /* recover sb data which were logged separately from sb block */
30075+
30076+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
30077+ * oid_init_allocator() and reiser4_set_free_blocks() with new
30078+ * data. What's the reason to call them above? */
30079+ result = reiser4_journal_recover_sb_data(super);
30080+ if (result != 0)
30081+ return result;
30082+ *stage = JOURNAL_RECOVER;
30083+
30084+ /*
30085+	 * Set the number of used blocks. The number of used blocks is stored
30086+	 * neither in the on-disk super block nor in the journal footer blocks. At
30087+ * this moment actual values of total blocks and free block counters
30088+ * are set in the reiser4 super block (in-memory structure) and we can
30089+ * calculate number of used blocks from them.
30090+ */
30091+ reiser4_set_data_blocks(super,
30092+ reiser4_block_count(super) -
30093+ reiser4_free_blocks(super));
30094+
30095+#if REISER4_DEBUG
30096+ sbinfo->min_blocks_used = 16 /* reserved area */ +
30097+ 2 /* super blocks */ +
30098+ 2 /* journal footer and header */ ;
30099+#endif
30100+
30101+ /* init disk space allocator */
30102+ result = sa_init_allocator(reiser4_get_space_allocator(super),
30103+ super, NULL);
30104+ if (result)
30105+ return result;
30106+ *stage = INIT_SA;
30107+
30108+ result = get_super_jnode(super);
30109+ if (result == 0)
30110+ *stage = ALL_DONE;
30111+ return result;
30112+}
30113+
30114+/* plugin->u.format.get_ready */
30115+int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30116+{
30117+ int result;
30118+ format40_init_stage stage;
30119+
30120+ result = try_init_format40(s, &stage);
30121+ switch (stage) {
30122+ case ALL_DONE:
30123+ assert("nikita-3458", result == 0);
30124+ break;
30125+ case INIT_JNODE:
30126+ done_super_jnode(s);
30127+ case INIT_SA:
30128+		sa_destroy_allocator(reiser4_get_space_allocator(s), s);
30129+ case JOURNAL_RECOVER:
30130+ case INIT_TREE:
30131+		reiser4_done_tree(&get_super_private(s)->tree);
30132+ case INIT_OID:
30133+ case KEY_CHECK:
30134+ case READ_SUPER:
30135+ case JOURNAL_REPLAY:
30136+ case INIT_STATUS:
30137+ reiser4_status_finish();
30138+ case INIT_JOURNAL_INFO:
30139+		reiser4_done_journal_info(s);
30140+ case FIND_A_SUPER:
30141+ case CONSULT_DISKMAP:
30142+ case NONE_DONE:
30143+ break;
30144+ default:
30145+ impossible("nikita-3457", "init stage: %i", stage);
30146+ }
30147+
30148+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30149+ return RETERR(-ENOSPC);
30150+
30151+ return result;
30152+}
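The switch above relies on deliberate case fall-through: try_init_format40()
reports the stage it reached, and each label undoes its own stage before
falling through to undo all of the earlier ones. The idiom in miniature (a
generic sketch with made-up stage names, not patch code):

	/* Staged-unwind sketch: later cases fall through so that every
	 * completed earlier stage is torn down too, in reverse order. */
	switch (stage) {
	case STAGE_THREE:
		undo_three();
		/* fall through */
	case STAGE_TWO:
		undo_two();
		/* fall through */
	case STAGE_ONE:
		undo_one();
		/* fall through */
	case NOTHING_DONE:
		break;
	}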
30153+
30154+static void pack_format40_super(const struct super_block *s, char *data)
30155+{
30156+ format40_disk_super_block *super_data =
30157+ (format40_disk_super_block *) data;
30158+
30159+ reiser4_super_info_data *sbinfo = get_super_private(s);
30160+
30161+ assert("zam-591", data != NULL);
30162+
30163+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30164+ &super_data->free_blocks);
30165+
30166+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
30167+ &super_data->root_block);
30168+
30169+ put_unaligned(cpu_to_le64(oid_next(s)),
30170+ &super_data->oid);
30171+
30172+ put_unaligned(cpu_to_le64(oids_used(s)),
30173+ &super_data->file_count);
30174+
30175+ put_unaligned(cpu_to_le16(sbinfo->tree.height),
30176+ &super_data->tree_height);
30177+
30178+ if (update_disk_version(super_data)) {
30179+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
30180+
30181+ put_unaligned(cpu_to_le32(version), &super_data->version);
30182+ }
30183+}
30184+
30185+/* plugin->u.format.log_super
30186+ return a jnode which should be added to transaction when the super block
30187+ gets logged */
30188+jnode *log_super_format40(struct super_block *s)
30189+{
30190+ jnode *sb_jnode;
30191+
30192+ sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30193+
30194+ jload(sb_jnode);
30195+
30196+ pack_format40_super(s, jdata(sb_jnode));
30197+
30198+ jrelse(sb_jnode);
30199+
30200+ return sb_jnode;
30201+}
30202+
30203+/* plugin->u.format.release */
30204+int release_format40(struct super_block *s)
30205+{
30206+ int ret;
30207+ reiser4_super_info_data *sbinfo;
30208+
30209+ sbinfo = get_super_private(s);
30210+ assert("zam-579", sbinfo != NULL);
30211+
30212+ if (!rofs_super(s)) {
30213+		ret = reiser4_capture_super_block(s);
30214+		if (ret != 0)
30215+ warning("vs-898",
30216+ "reiser4_capture_super_block failed: %d",
30217+ ret);
30218+
30219+ ret = txnmgr_force_commit_all(s, 1);
30220+ if (ret != 0)
30221+ warning("jmacd-74438", "txn_force failed: %d", ret);
30222+
30223+ all_grabbed2free();
30224+ }
30225+
30226+ sa_destroy_allocator(&sbinfo->space_allocator, s);
30227+	reiser4_done_journal_info(s);
30228+ done_super_jnode(s);
30229+
30230+ rcu_barrier();
30231+	reiser4_done_tree(&sbinfo->tree);
30232+	/* call finish_rcu(), because some znodes were "released" in
30233+	 * reiser4_done_tree(). */
30234+ rcu_barrier();
30235+
30236+ return 0;
30237+}
30238+
30239+#define FORMAT40_ROOT_LOCALITY 41
30240+#define FORMAT40_ROOT_OBJECTID 42
30241+
30242+/* plugin->u.format.root_dir_key */
30243+const reiser4_key *root_dir_key_format40(const struct super_block *super
30244+ UNUSED_ARG)
30245+{
30246+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30247+ .el = {
30248+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30249+#if REISER4_LARGE_KEY
30250+ ON_LARGE_KEY(0ull,)
30251+#endif
30252+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30253+ 0ull
30254+ }
30255+ };
30256+
30257+ return &FORMAT40_ROOT_DIR_KEY;
30258+}
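Worked out, and assuming KEY_SD_MINOR is 0 (its conventional value), the
constant elements of this key are:

	/* (FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR == (41 << 4) | 0 == 0x290,
	 * plus FORMAT40_ROOT_OBJECTID == 42: the root directory's stat-data key
	 * is therefore identical on every format40 filesystem, which is why it
	 * can be a static constant here. */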
30259+
30260+/* plugin->u.format.check_open.
30261+   Check the opened object for validity. For now it checks the oid &
30262+   locality only; this can be improved later, and its work may depend on the
30263+ options. */
30264+int check_open_format40(const struct inode *object)
30265+{
30266+ oid_t max, oid;
30267+
30268+ max = oid_next(object->i_sb) - 1;
30269+
30270+ /* Check the oid. */
30271+ oid = get_inode_oid(object);
30272+ if (oid > max) {
30273+		warning("vpf-1360", "An object with oid %llu, "
30274+			"greater than the max used oid %llu, was found.",
30275+ (unsigned long long)oid, (unsigned long long)max);
30276+
30277+ return RETERR(-EIO);
30278+ }
30279+
30280+ /* Check the locality. */
30281+ oid = reiser4_inode_data(object)->locality_id;
30282+ if (oid > max) {
30283+		warning("vpf-1361", "An object with locality %llu, "
30284+			"greater than the max used oid %llu, was found.",
30285+ (unsigned long long)oid, (unsigned long long)max);
30286+
30287+ return RETERR(-EIO);
30288+ }
30289+
30290+ return 0;
30291+}
30292+
30293+/* plugin->u.format.version_update.
30294+   Perform all version update operations to bring the on-disk
30295+   format40_disk_super_block.version up to FORMAT40_VERSION.
30296+ */
30297+int version_update_format40(struct super_block *super) {
30298+ txn_handle * trans;
30299+ lock_handle lh;
30300+ txn_atom *atom;
30301+ int ret;
30302+
30303+ /* Nothing to do if RO mount or the on-disk version is not less. */
30304+ if (super->s_flags & MS_RDONLY)
30305+ return 0;
30306+
30307+ if (get_super_private(super)->version >= FORMAT40_VERSION)
30308+ return 0;
30309+
30310+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
30311+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
30312+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
30313+
30314+ /* Mark the uber znode dirty to call log_super on write_logs. */
30315+ init_lh(&lh);
30316+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
30317+ ZNODE_LOCK_HIPRI, &lh);
30318+ if (ret != 0)
30319+ return ret;
30320+
30321+ znode_make_dirty(lh.node);
30322+ done_lh(&lh);
30323+
30324+ /* Update the backup blocks. */
30325+
30326+ /* Force write_logs immediately. */
30327+ trans = get_current_context()->trans;
30328+ atom = get_current_atom_locked();
30329+ assert("vpf-1906", atom != NULL);
30330+
30331+ spin_lock_txnh(trans);
30332+ return force_commit_atom(trans);
30333+}
30334+
30335+/* Make Linus happy.
30336+ Local variables:
30337+ c-indentation-style: "K&R"
30338+ mode-name: "LC"
30339+ c-basic-offset: 8
30340+ tab-width: 8
30341+ fill-column: 120
30342+ scroll-step: 1
30343+ End:
30344+*/
30345diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.h
30346--- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 03:00:00.000000000 +0300
30347+++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format40.h 2007-07-29 00:25:34.908704759 +0400
30348@@ -0,0 +1,109 @@
30349+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30350+
30351+/* this file contains:
30352+   - definition of the on-disk super block of the standard disk layout for
30353+     reiser 4.0 (layout 40)
30354+ - definition of layout 40 specific portion of in-core super block
30355+ - declarations of functions implementing methods of layout plugin
30356+ for layout 40
30357+ - declarations of functions used to get/set fields in layout 40 super block
30358+*/
30359+
30360+#ifndef __DISK_FORMAT40_H__
30361+#define __DISK_FORMAT40_H__
30362+
30363+/* magic for default reiser4 layout */
30364+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30365+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30366+
30367+#include "../../dformat.h"
30368+
30369+#include <linux/fs.h> /* for struct super_block */
30370+
30371+typedef enum {
30372+ FORMAT40_LARGE_KEYS
30373+} format40_flags;
30374+
30375+/* ondisk super block for format 40. It is 512 bytes long */
30376+typedef struct format40_disk_super_block {
30377+ /* 0 */ d64 block_count;
30378+	/* number of blocks in the filesystem */
30379+ /* 8 */ d64 free_blocks;
30380+ /* number of free blocks */
30381+ /* 16 */ d64 root_block;
30382+ /* filesystem tree root block */
30383+ /* 24 */ d64 oid;
30384+ /* smallest free objectid */
30385+ /* 32 */ d64 file_count;
30386+ /* number of files in a filesystem */
30387+ /* 40 */ d64 flushes;
30388+	/* number of times the super block was
30389+	   flushed. Needed if format 40
30390+	   ever has several super blocks */
30391+ /* 48 */ d32 mkfs_id;
30392+ /* unique identifier of fs */
30393+ /* 52 */ char magic[16];
30394+ /* magic string ReIsEr40FoRmAt */
30395+ /* 68 */ d16 tree_height;
30396+ /* height of filesystem tree */
30397+ /* 70 */ d16 formatting_policy;
30398+	/* not used anymore */
30399+	/* 72 */ d64 flags;
30400+ /* 80 */ d32 version;
30401+ /* on-disk format version number
30402+ initially assigned by mkfs as the greatest format40
30403+ version number supported by reiser4progs and updated
30404+	   at mount time in accordance with the greatest format40
30405+	   version number supported by the kernel.
30406+	   It is used by fsck to catch possible corruption and
30407+ for various compatibility issues */
30408+ /* 84 */ char not_used[428];
30409+} format40_disk_super_block;
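The offset comments account for 84 bytes of live fields plus 428 bytes of
padding, exactly the advertised 512 bytes. A build-time check along these
lines would pin the layout (a sketch; the patch itself does not add one):

	/* Sketch: fail the build if the on-disk layout drifts from 512 bytes. */
	BUILD_BUG_ON(sizeof(format40_disk_super_block) != 512);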
30410+
30411+/* format 40 specific part of reiser4_super_info_data */
30412+typedef struct format40_super_info {
30413+/* format40_disk_super_block actual_sb; */
30414+ jnode *sb_jnode;
30415+ struct {
30416+ reiser4_block_nr super;
30417+ } loc;
30418+} format40_super_info;
30419+
30420+/* Defines for journal header and footer respectively. */
30421+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
30422+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
30423+
30424+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
30425+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
30426+
30427+#define FORMAT40_STATUS_BLOCKNR \
30428+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
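For concreteness: assuming 4096-byte pages and the customary
REISER4_MASTER_OFFSET of 65536 bytes (both constants are defined elsewhere),
65536 / 4096 == 16, so the journal header, journal footer and status block
land at blocks 19, 20 and 21, and FORMAT40_OFFSET above puts the format40
super block at block 17.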
30429+
30430+/* Diskmap declarations */
30431+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
30432+#define FORMAT40_SUPER 1
30433+#define FORMAT40_JH 2
30434+#define FORMAT40_JF 3
30435+
30436+/* declarations of functions implementing methods of layout plugin for
30437+   format 40. The functions themselves are in disk_format40.c */
30438+extern int init_format_format40(struct super_block *, void *data);
30439+extern const reiser4_key *root_dir_key_format40(const struct super_block *);
30440+extern int release_format40(struct super_block *s);
30441+extern jnode *log_super_format40(struct super_block *s);
30442+extern int check_open_format40(const struct inode *object);
30443+extern int version_update_format40(struct super_block *super);
30444+
30445+/* __DISK_FORMAT40_H__ */
30446+#endif
30447+
30448+/* Make Linus happy.
30449+ Local variables:
30450+ c-indentation-style: "K&R"
30451+ mode-name: "LC"
30452+ c-basic-offset: 8
30453+ tab-width: 8
30454+ fill-column: 120
30455+ scroll-step: 1
30456+ End:
30457+*/
30458diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.c
30459--- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 03:00:00.000000000 +0300
30460+++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.c 2007-07-29 00:25:34.908704759 +0400
30461@@ -0,0 +1,38 @@
30462+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30463+
30464+#include "../../debug.h"
30465+#include "../plugin_header.h"
30466+#include "disk_format40.h"
30467+#include "disk_format.h"
30468+#include "../plugin.h"
30469+
30470+/* initialization of disk layout plugins */
30471+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30472+ [FORMAT40_ID] = {
30473+ .h = {
30474+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30475+ .id = FORMAT40_ID,
30476+ .pops = NULL,
30477+ .label = "reiser40",
30478+ .desc = "standard disk layout for reiser40",
30479+ .linkage = {NULL, NULL}
30480+ },
30481+ .init_format = init_format_format40,
30482+ .root_dir_key = root_dir_key_format40,
30483+ .release = release_format40,
30484+ .log_super = log_super_format40,
30485+ .check_open = check_open_format40,
30486+ .version_update = version_update_format40
30487+ }
30488+};
30489+
30490+/* Make Linus happy.
30491+ Local variables:
30492+ c-indentation-style: "K&R"
30493+ mode-name: "LC"
30494+ c-basic-offset: 8
30495+ tab-width: 8
30496+ fill-column: 120
30497+ scroll-step: 1
30498+ End:
30499+*/
30500diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.h
30501--- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 03:00:00.000000000 +0300
30502+++ linux-2.6.22/fs/reiser4/plugin/disk_format/disk_format.h 2007-07-29 00:25:34.908704759 +0400
30503@@ -0,0 +1,27 @@
30504+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30505+
30506+/* identifiers for disk layouts, they are also used as indexes in array of disk
30507+ plugins */
30508+
30509+#if !defined( __REISER4_DISK_FORMAT_H__ )
30510+#define __REISER4_DISK_FORMAT_H__
30511+
30512+typedef enum {
30513+ /* standard reiser4 disk layout plugin id */
30514+ FORMAT40_ID,
30515+ LAST_FORMAT_ID
30516+} disk_format_id;
30517+
30518+/* __REISER4_DISK_FORMAT_H__ */
30519+#endif
30520+
30521+/* Make Linus happy.
30522+ Local variables:
30523+ c-indentation-style: "K&R"
30524+ mode-name: "LC"
30525+ c-basic-offset: 8
30526+ tab-width: 8
30527+ fill-column: 120
30528+ scroll-step: 1
30529+ End:
30530+*/
30531diff -urN linux-2.6.22.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.22/fs/reiser4/plugin/disk_format/Makefile
30532--- linux-2.6.22.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 03:00:00.000000000 +0300
30533+++ linux-2.6.22/fs/reiser4/plugin/disk_format/Makefile 2007-07-29 00:25:34.908704759 +0400
30534@@ -0,0 +1,5 @@
30535+obj-$(CONFIG_REISER4_FS) += df_plugins.o
30536+
30537+df_plugins-objs := \
30538+ disk_format40.o \
30539+ disk_format.o
30540diff -urN linux-2.6.22.orig/fs/reiser4/plugin/fibration.c linux-2.6.22/fs/reiser4/plugin/fibration.c
30541--- linux-2.6.22.orig/fs/reiser4/plugin/fibration.c 1970-01-01 03:00:00.000000000 +0300
30542+++ linux-2.6.22/fs/reiser4/plugin/fibration.c 2007-07-29 00:25:34.908704759 +0400
30543@@ -0,0 +1,175 @@
30544+/* Copyright 2004 by Hans Reiser, licensing governed by
30545+ * reiser4/README */
30546+
30547+/* Directory fibrations */
30548+
30549+/*
30550+ * Suppose we have a directory tree with sources of some project. During
30551+ * compilation .o files are created within this tree. This makes access
30552+ * to the original source files less efficient, because source files are
30553+ * now "diluted" by object files: default directory plugin uses prefix
30554+ * of a file name as a part of the key for directory entry (and this
30555+ * part is also inherited by the key of file body). This means that
30556+ * foo.o will be located close to foo.c and foo.h in the tree.
30557+ *
30558+ * To avoid this effect the directory plugin fills the highest 7
30559+ * (originally unused) bits of the second component of the directory entry key
30560+ * with a bit-pattern depending on the file name (see
30561+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
30562+ * "fibre". Fibre of the file name key is inherited by key of stat data
30563+ * and keys of file body (in the case of REISER4_LARGE_KEY).
30564+ *
30565+ * Fibre for a given file is chosen by per-directory fibration
30566+ * plugin. Names within given fibre are ordered lexicographically.
30567+ */
30568+
30569+#include "../debug.h"
30570+#include "plugin_header.h"
30571+#include "plugin.h"
30572+#include "../super.h"
30573+#include "../inode.h"
30574+
30575+#include <linux/types.h>
30576+
30577+static const int fibre_shift = 57;
30578+
30579+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
30580+
30581+/*
30582+ * Trivial fibration: all files of directory are just ordered
30583+ * lexicographically.
30584+ */
30585+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
30586+{
30587+ return FIBRE_NO(0);
30588+}
30589+
30590+/*
30591+ * dot-o fibration: place .o files after all others.
30592+ */
30593+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
30594+{
30595+ /* special treatment for .*\.o */
30596+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
30597+ return FIBRE_NO(1);
30598+ else
30599+ return FIBRE_NO(0);
30600+}
30601+
30602+/*
30603+ * ext.1 fibration: subdivide the directory into 128 fibres, one for each
30604+ * 7-bit extension character (file "foo.h" goes into fibre "h"), plus a
30605+ * default fibre for the rest.
30606+ */
30607+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
30608+{
30609+ if (len > 2 && name[len - 2] == '.')
30610+ return FIBRE_NO(name[len - 1]);
30611+ else
30612+ return FIBRE_NO(0);
30613+}
30614+
30615+/*
30616+ * ext.3 fibration: try to separate files with different 3-character
30617+ * extensions from each other.
30618+ */
30619+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
30620+{
30621+ if (len > 4 && name[len - 4] == '.')
30622+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
30623+ else
30624+ return FIBRE_NO(0);
30625+}
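To make the four policies concrete, here is how they would classify a few
sample names (illustrative calls; dir stands for any directory inode):

	/* Sketch: expected fibre assignments. */
	fibre_trivial(dir, "foo.c", 5);	/* FIBRE_NO(0): one fibre for all */
	fibre_dot_o(dir, "foo.o", 5);	/* FIBRE_NO(1): .o files sort last */
	fibre_dot_o(dir, "foo.c", 5);	/* FIBRE_NO(0): everything else */
	fibre_ext_1(dir, "foo.h", 5);	/* FIBRE_NO('h'): one fibre per extension */
	fibre_ext_3(dir, "foo.txt", 7);	/* FIBRE_NO('t' + 'x' + 't') */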
30626+
30627+static int change_fibration(struct inode *inode,
30628+ reiser4_plugin * plugin,
30629+ pset_member memb)
30630+{
30631+ int result;
30632+
30633+ assert("nikita-3503", inode != NULL);
30634+ assert("nikita-3504", plugin != NULL);
30635+
30636+ assert("nikita-3505", is_reiser4_inode(inode));
30637+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
30638+ assert("nikita-3507",
30639+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
30640+
30641+ result = 0;
30642+ if (inode_fibration_plugin(inode) == NULL ||
30643+ inode_fibration_plugin(inode)->h.id != plugin->h.id) {
30644+ if (is_dir_empty(inode) == 0)
30645+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
30646+ PSET_FIBRATION, plugin);
30647+ else
30648+ result = RETERR(-ENOTEMPTY);
30649+
30650+ }
30651+ return result;
30652+}
30653+
30654+static reiser4_plugin_ops fibration_plugin_ops = {
30655+ .init = NULL,
30656+ .load = NULL,
30657+ .save_len = NULL,
30658+ .save = NULL,
30659+ .change = change_fibration
30660+};
30661+
30662+/* fibration plugins */
30663+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
30664+ [FIBRATION_LEXICOGRAPHIC] = {
30665+ .h = {
30666+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30667+ .id = FIBRATION_LEXICOGRAPHIC,
30668+ .pops = &fibration_plugin_ops,
30669+ .label = "lexicographic",
30670+ .desc = "no fibration",
30671+ .linkage = {NULL, NULL}
30672+ },
30673+ .fibre = fibre_trivial
30674+ },
30675+ [FIBRATION_DOT_O] = {
30676+ .h = {
30677+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30678+ .id = FIBRATION_DOT_O,
30679+ .pops = &fibration_plugin_ops,
30680+ .label = "dot-o",
30681+ .desc = "fibrate .o files separately",
30682+ .linkage = {NULL, NULL}
30683+ },
30684+ .fibre = fibre_dot_o
30685+ },
30686+ [FIBRATION_EXT_1] = {
30687+ .h = {
30688+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30689+ .id = FIBRATION_EXT_1,
30690+ .pops = &fibration_plugin_ops,
30691+ .label = "ext-1",
30692+ .desc = "fibrate file by single character extension",
30693+ .linkage = {NULL, NULL}
30694+ },
30695+ .fibre = fibre_ext_1
30696+ },
30697+ [FIBRATION_EXT_3] = {
30698+ .h = {
30699+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
30700+ .id = FIBRATION_EXT_3,
30701+ .pops = &fibration_plugin_ops,
30702+ .label = "ext-3",
30703+ .desc = "fibrate file by three character extension",
30704+ .linkage = {NULL, NULL}
30705+ },
30706+ .fibre = fibre_ext_3
30707+ }
30708+};
30709+
30710+/*
30711+ * Local variables:
30712+ * c-indentation-style: "K&R"
30713+ * mode-name: "LC"
30714+ * c-basic-offset: 8
30715+ * tab-width: 8
30716+ * fill-column: 79
30717+ * End:
30718+ */
30719diff -urN linux-2.6.22.orig/fs/reiser4/plugin/fibration.h linux-2.6.22/fs/reiser4/plugin/fibration.h
30720--- linux-2.6.22.orig/fs/reiser4/plugin/fibration.h 1970-01-01 03:00:00.000000000 +0300
30721+++ linux-2.6.22/fs/reiser4/plugin/fibration.h 2007-07-29 00:25:34.908704759 +0400
30722@@ -0,0 +1,37 @@
30723+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
30724+
30725+/* Fibration plugin used by hashed directory plugin to segment content
30726+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
30727+
30728+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
30729+#define __FS_REISER4_PLUGIN_FIBRATION_H__
30730+
30731+#include "plugin_header.h"
30732+
30733+typedef struct fibration_plugin {
30734+ /* generic fields */
30735+ plugin_header h;
30736+
30737+ __u64(*fibre) (const struct inode * dir, const char *name, int len);
30738+} fibration_plugin;
30739+
30740+typedef enum {
30741+ FIBRATION_LEXICOGRAPHIC,
30742+ FIBRATION_DOT_O,
30743+ FIBRATION_EXT_1,
30744+ FIBRATION_EXT_3,
30745+ LAST_FIBRATION_ID
30746+} reiser4_fibration_id;
30747+
30748+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
30749+#endif
30750+
30751+/* Make Linus happy.
30752+ Local variables:
30753+ c-indentation-style: "K&R"
30754+ mode-name: "LC"
30755+ c-basic-offset: 8
30756+ tab-width: 8
30757+ fill-column: 120
30758+ End:
30759+*/
30760diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.c
30761--- linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 03:00:00.000000000 +0300
30762+++ linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.c 2007-07-29 00:25:34.916706830 +0400
30763@@ -0,0 +1,3832 @@
30764+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
30765+   reiser4/README */
30766+/*
30767+ * Written by Edward Shishkin.
30768+ *
30769+ * Implementations of inode/file/address_space operations
30770+ * specific to the cryptcompress file plugin, which manages
30771+ * regular files built of compressed and/or encrypted bodies.
30772+ * See http://dev.namesys.com/CryptcompressPlugin for details.
30773+ */
30774+
30775+#include "../../inode.h"
30776+#include "../cluster.h"
30777+#include "../object.h"
30778+#include "../../tree_walk.h"
30779+#include "cryptcompress.h"
30780+
30781+#include <asm/scatterlist.h>
30782+#include <linux/pagevec.h>
30783+#include <asm/uaccess.h>
30784+#include <linux/swap.h>
30785+#include <linux/writeback.h>
30786+#include <linux/random.h>
30787+
30788+/*
30789+ Managing primary and secondary caches by Reiser4
30790+ cryptcompress file plugin. Synchronization scheme.
30791+
30792+
30793+ +------------------+
30794+ +------------------->| tfm stream |
30795+ | | (compressed data)|
30796+ flush | +------------------+
30797+ +-----------------+ |
30798+ |(->)longterm lock| V
30799+--+ writepages() | | +-***-+ reiser4 +---+
30800+ | | +--+ | *** | storage tree | |
30801+ | | | +-***-+ (primary cache)| |
30802+u | write() (secondary| cache) V / | \ | |
30803+s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d |
30804+e | | | |page cluster | | | **disk cluster** | | i |
30805+r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s |
30806+ | read() ^ ^ | | k |
30807+ | | (->)longterm lock| | page_io()| |
30808+ | | +------+ | |
30809+--+ readpages() | | +---+
30810+ | V
30811+ | +------------------+
30812+ +--------------------| tfm stream |
30813+ | (plain text) |
30814+ +------------------+
30815+*/
30816+
30817+/* get cryptcompress specific portion of inode */
30818+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
30819+{
30820+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
30821+}
30822+
30823+/* plugin->u.file.init_inode_data */
30824+void init_inode_data_cryptcompress(struct inode *inode,
30825+ reiser4_object_create_data * crd,
30826+ int create)
30827+{
30828+	struct cryptcompress_info *data;
30829+
30830+ data = cryptcompress_inode_data(inode);
30831+ assert("edward-685", data != NULL);
30832+
30833+ memset(data, 0, sizeof(*data));
30834+
30835+ mutex_init(&data->checkin_mutex);
30836+ data->trunc_index = ULONG_MAX;
30837+ turn_on_compression(data);
30838+ set_lattice_factor(data, MIN_LATTICE_FACTOR);
30839+ init_inode_ordering(inode, crd, create);
30840+}
30841+
30842+/* The following is a part of the reiser4 cipher key manager,
30843+   which is called when opening/creating a cryptcompress file */
30844+
30845+/* get/set cipher key info */
30846+struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
30847+{
30848+ assert("edward-90", inode != NULL);
30849+ assert("edward-91", reiser4_inode_data(inode) != NULL);
30850+ return cryptcompress_inode_data(inode)->crypt;
30851+}
30852+
30853+static void set_inode_crypto_info (struct inode * inode,
30854+ struct reiser4_crypto_info * info)
30855+{
30856+	cryptcompress_inode_data(inode)->crypt = info;
30857+}
30858+
30859+/* allocate a cipher key info */
30860+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
30861+{
30862+	struct reiser4_crypto_info * info;
30863+ int fipsize;
30864+
30865+	info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
30866+ if (!info)
30867+ return ERR_PTR(-ENOMEM);
30868+ memset(info, 0, sizeof (*info));
30869+ fipsize = inode_digest_plugin(inode)->fipsize;
30870+	info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
30871+ if (!info->keyid) {
30872+ kfree(info);
30873+ return ERR_PTR(-ENOMEM);
30874+ }
30875+	info->host = inode;
30876+ return info;
30877+}
30878+
30879+#if 0
30880+/* allocate/free low-level info for cipher and digest
30881+ transforms */
30882+static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
30883+{
30884+ struct crypto_blkcipher * ctfm = NULL;
30885+ struct crypto_hash * dtfm = NULL;
30886+ cipher_plugin * cplug = inode_cipher_plugin(info->host);
30887+ digest_plugin * dplug = inode_digest_plugin(info->host);
30888+
30889+ if (cplug->alloc) {
30890+ ctfm = cplug->alloc();
30891+ if (IS_ERR(ctfm)) {
30892+ warning("edward-1364",
30893+ "Can not allocate info for %s\n",
30894+ cplug->h.desc);
30895+			return RETERR(PTR_ERR(ctfm));
30896+ }
30897+ }
30898+	info_set_cipher(info, ctfm);
30899+	if (dplug->alloc) {
30900+ dtfm = dplug->alloc();
30901+ if (IS_ERR(dtfm)) {
30902+ warning("edward-1365",
30903+ "Can not allocate info for %s\n",
30904+ dplug->h.desc);
30905+			goto unhappy_with_digest;
30906+ }
30907+ }
30908+	info_set_digest(info, dtfm);
30909+	return 0;
30910+ unhappy_with_digest:
30911+	if (cplug->free) {
30912+ cplug->free(ctfm);
30913+ info_set_cipher(info, NULL);
30914+	}
30915+	return RETERR(PTR_ERR(dtfm));
30916+}
30917+#endif
30918+
30919+static void
30920+free_crypto_tfms(struct reiser4_crypto_info * info)
30921+{
30922+ assert("edward-1366", info != NULL);
30923+ if (!info_get_cipher(info)) {
30924+ assert("edward-1601", !info_get_digest(info));
44254afd 30925+ return;
30926+ }
30927+ inode_cipher_plugin(info->host)->free(info_get_cipher(info));
30928+ info_set_cipher(info, NULL);
30929+ inode_digest_plugin(info->host)->free(info_get_digest(info));
30930+ info_set_digest(info, NULL);
30931+ return;
30932+}
30933+
30934+#if 0
30935+/* create a key fingerprint for disk stat-data */
30936+static int create_keyid (struct reiser4_crypto_info * info,
30937+ struct reiser4_crypto_data * data)
30938+{
30939+ int ret = -ENOMEM;
30940+ size_t blk, pad;
30941+ __u8 * dmem;
30942+ __u8 * cmem;
30943+ struct hash_desc ddesc;
30944+ struct blkcipher_desc cdesc;
30945+ struct scatterlist sg;
30946+
30947+ assert("edward-1367", info != NULL);
30948+ assert("edward-1368", info->keyid != NULL);
30949+
30950+ ddesc.tfm = info_get_digest(info);
30951+ ddesc.flags = 0;
30952+ cdesc.tfm = info_get_cipher(info);
30953+ cdesc.flags = 0;
30954+
30955+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
30956+ reiser4_ctx_gfp_mask_get());
30957+ if (!dmem)
30958+ goto exit1;
30959+
30960+	blk = crypto_blkcipher_blocksize(cdesc.tfm);
30961+
30962+ pad = data->keyid_size % blk;
30963+ pad = (pad ? blk - pad : 0);
30964+
30965+ cmem = kmalloc((size_t)data->keyid_size + pad,
30966+ reiser4_ctx_gfp_mask_get());
30967+ if (!cmem)
30968+ goto exit2;
30969+ memcpy(cmem, data->keyid, data->keyid_size);
30970+ memset(cmem + data->keyid_size, 0, pad);
30971+
30972+ sg.page = virt_to_page(cmem);
30973+ sg.offset = offset_in_page(cmem);
30974+ sg.length = data->keyid_size + pad;
30975+
30976+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
30977+ data->keyid_size + pad);
30978+ if (ret) {
30979+ warning("edward-1369",
30980+ "encryption failed flags=%x\n", cdesc.flags);
30981+ goto exit3;
30982+ }
30983+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
30984+ if (ret) {
30985+ warning("edward-1602",
30986+ "digest failed flags=%x\n", ddesc.flags);
30987+ goto exit3;
30988+ }
30989+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
30990+ exit3:
30991+ kfree(cmem);
30992+ exit2:
30993+ kfree(dmem);
30994+ exit1:
30995+ return ret;
30996+}
30997+#endif
30998+
30999+static void destroy_keyid(struct reiser4_crypto_info * info)
31000+{
31001+ assert("edward-1370", info != NULL);
31002+ assert("edward-1371", info->keyid != NULL);
31003+ kfree(info->keyid);
31004+ return;
31005+}
31006+
31007+static void __free_crypto_info (struct inode * inode)
31008+{
31009+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
31010+ assert("edward-1372", info != NULL);
31011+
31012+ free_crypto_tfms(info);
31013+ destroy_keyid(info);
31014+ kfree(info);
31015+}
31016+
31017+#if 0
31018+static void instantiate_crypto_info(struct reiser4_crypto_info * info)
31019+{
31020+ assert("edward-1373", info != NULL);
31021+ assert("edward-1374", info->inst == 0);
31022+ info->inst = 1;
31023+}
31024+#endif
31025+
31026+static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
31027+{
31028+ assert("edward-1375", info != NULL);
31029+ info->inst = 0;
31030+}
31031+
31032+static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
31033+{
31034+ return info->inst;
31035+}
31036+
31037+static int inode_has_cipher_key(struct inode * inode)
31038+{
31039+ assert("edward-1376", inode != NULL);
31040+ return inode_crypto_info(inode) &&
31041+ is_crypto_info_instantiated(inode_crypto_info(inode));
31042+}
31043+
31044+static void free_crypto_info (struct inode * inode)
31045+{
31046+ uninstantiate_crypto_info(inode_crypto_info(inode));
31047+ __free_crypto_info(inode);
31048+}
31049+
31050+static int need_cipher(struct inode * inode)
31051+{
31052+ return inode_cipher_plugin(inode) !=
31053+ cipher_plugin_by_id(NONE_CIPHER_ID);
31054+}
31055+
31056+/* Parse @data which contains a (uninstantiated) cipher key imported
31057+ from user space, create a low-level cipher info and attach it to
31058+ the @object. On success, the info contains an instantiated key */
31059+#if 0
31060+struct reiser4_crypto_info * create_crypto_info(struct inode * object,
31061+ struct reiser4_crypto_data * data)
31062+{
31063+ int ret;
31064+ struct reiser4_crypto_info * info;
31065+
31066+ assert("edward-1377", data != NULL);
31067+ assert("edward-1378", need_cipher(object));
31068+
31069+ if (inode_file_plugin(object) !=
31070+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
31071+ return ERR_PTR(-EINVAL);
31072+
31073+ info = reiser4_alloc_crypto_info(object);
31074+ if (IS_ERR(info))
31075+ return info;
31076+ ret = alloc_crypto_tfms(info);
31077+ if (ret)
31078+ goto err;
31079+ /* instantiating a key */
31080+ ret = crypto_blkcipher_setkey(info_get_cipher(info),
31081+ data->key,
31082+ data->keysize);
31083+ if (ret) {
31084+ warning("edward-1379",
31085+ "setkey failed flags=%x",
31086+ crypto_blkcipher_get_flags(info_get_cipher(info)));
31087+ goto err;
31088+ }
31089+ info->keysize = data->keysize;
31090+ ret = create_keyid(info, data);
31091+ if (ret)
31092+ goto err;
31093+ instantiate_crypto_info(info);
31094+ return info;
31095+ err:
31096+ __free_crypto_info(object);
31097+ return ERR_PTR(ret);
31098+}
31099+#endif
31100+
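/* Editor's note: a minimal usage sketch, not part of the original
 * patch. It shows how the helpers above would fit together when a
 * cipher key arrives from user space; example_set_key() is a
 * hypothetical name. Like create_crypto_info() itself, the sketch
 * is compiled out. */
#if 0
static int example_set_key(struct inode *object,
			   struct reiser4_crypto_data *data)
{
	struct reiser4_crypto_info *info;

	/* parse @data, allocate tfms, set and fingerprint the key */
	info = create_crypto_info(object, data);
	if (IS_ERR(info))
		return PTR_ERR(info);
	/* bind the instantiated info to the inode, bump keyload count */
	reiser4_attach_crypto_info(object, info);
	return 0;
}
#endif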
31101+/* increment/decrement a load counter when
31102+ attaching/detaching the crypto-stat to any object */
31103+static void load_crypto_info(struct reiser4_crypto_info * info)
31104+{
31105+ assert("edward-1380", info != NULL);
31106+ inc_keyload_count(info);
31107+}
31108+
31109+static void unload_crypto_info(struct inode * inode)
31110+{
31111+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
31112+ assert("edward-1381", info->keyload_count > 0);
31113+
31114+ dec_keyload_count(inode_crypto_info(inode));
31115+ if (info->keyload_count == 0)
31116+ /* final release */
31117+ free_crypto_info(inode);
31118+}
31119+
31120+/* attach/detach an existing crypto-stat */
31121+void reiser4_attach_crypto_info(struct inode * inode,
31122+ struct reiser4_crypto_info * info)
31123+{
31124+ assert("edward-1382", inode != NULL);
31125+ assert("edward-1383", info != NULL);
31126+ assert("edward-1384", inode_crypto_info(inode) == NULL);
31127+
31128+ set_inode_crypto_info(inode, info);
31129+ load_crypto_info(info);
31130+}
31131+
31132+/* returns true, if crypto stat can be attached to the @host */
31133+#if REISER4_DEBUG
31134+static int host_allows_crypto_info(struct inode * host)
31135+{
31136+ int ret;
31137+ file_plugin * fplug = inode_file_plugin(host);
31138+
31139+ switch (fplug->h.id) {
31140+ case CRYPTCOMPRESS_FILE_PLUGIN_ID:
31141+ ret = 1;
31142+ break;
31143+ default:
31144+ ret = 0;
31145+ }
31146+ return ret;
31147+}
31148+#endif /* REISER4_DEBUG */
31149+
31150+static void reiser4_detach_crypto_info(struct inode * inode)
31151+{
31152+ assert("edward-1385", inode != NULL);
31153+ assert("edward-1386", host_allows_crypto_info(inode));
31154+
31155+ if (inode_crypto_info(inode))
31156+ unload_crypto_info(inode);
31157+ set_inode_crypto_info(inode, NULL);
31158+}
31159+
31160+#if 0
31161+
31162+/* compare fingerprints of @child and @parent */
31163+static int keyid_eq(struct reiser4_crypto_info * child,
31164+ struct reiser4_crypto_info * parent)
31165+{
31166+ return !memcmp(child->keyid,
31167+ parent->keyid,
31168+ info_digest_plugin(parent)->fipsize);
31169+}
31170+
31171+/* check if a crypto-stat (which is bound to @parent) can be inherited */
31172+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
31173+{
31174+ if (!need_cipher(child))
31175+ return 0;
31176+ /* the child is created */
31177+ if (!inode_crypto_info(child))
31178+ return 1;
31179+ /* the child is looked up */
31180+ if (!inode_crypto_info(parent))
31181+ return 0;
31182+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31183+ inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31184+ inode_crypto_info(child)->keysize ==
31185+ inode_crypto_info(parent)->keysize &&
31186+ keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
31187+}
31188+#endif
31189+
31190+/* helper functions for ->create() method of the cryptcompress plugin */
31191+static int inode_set_crypto(struct inode * object)
31192+{
31193+ reiser4_inode * info;
31194+ if (!inode_crypto_info(object)) {
31195+ if (need_cipher(object))
31196+ return RETERR(-EINVAL);
31197+ /* the file is not to be encrypted */
31198+ return 0;
31199+ }
31200+ info = reiser4_inode_data(object);
31201+ info->extmask |= (1 << CRYPTO_STAT);
31202+ return 0;
31203+}
31204+
31205+static int inode_init_compression(struct inode * object)
31206+{
31207+ int result = 0;
31208+ assert("edward-1461", object != NULL);
31209+ if (inode_compression_plugin(object)->init)
31210+ result = inode_compression_plugin(object)->init();
31211+ return result;
31212+}
31213+
31214+static int inode_check_cluster(struct inode * object)
31215+{
31216+ assert("edward-696", object != NULL);
31217+
31218+ if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
31219+ warning("edward-1320", "Can not support '%s' "
31220+ "logical clusters (less than page size)",
31221+ inode_cluster_plugin(object)->h.label);
31222+ return RETERR(-EINVAL);
31223+ }
31224+ if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE * sizeof(int))) {
31225+ warning("edward-1463", "Can not support '%s' "
31226+ "logical clusters (too big for transform)",
31227+ inode_cluster_plugin(object)->h.label);
31228+ return RETERR(-EINVAL);
31229+ }
31230+ return 0;
31231+}
31232+
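/* Editor's illustration of inode_check_cluster() above (an assumption
 * about a typical configuration, not from the patch): with 4 KiB pages
 * a 1 KiB logical cluster fails the first check (smaller than a page),
 * and a cluster shift of 32 or more fails the second, since the shift
 * must stay below BITS_PER_BYTE * sizeof(int). */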
31233+/* ->destroy_inode() method of the cryptcompress plugin */
31234+void destroy_inode_cryptcompress(struct inode * inode)
31235+{
31236+ assert("edward-1464", INODE_PGCOUNT(inode) == 0);
31237+ reiser4_detach_crypto_info(inode);
31238+ return;
31239+}
31240+
31241+/* ->create() method of the cryptcompress plugin
31242+
31243+. install plugins
31244+. attach crypto info if specified
31245+. attach compression info if specified
31246+. attach cluster info
31247+*/
31248+int create_cryptcompress(struct inode *object, struct inode *parent,
31249+ reiser4_object_create_data * data)
31250+{
31251+ int result;
31252+ reiser4_inode *info;
31253+
31254+ assert("edward-23", object != NULL);
31255+ assert("edward-24", parent != NULL);
31256+ assert("edward-30", data != NULL);
31257+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
31258+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
31259+
31260+ info = reiser4_inode_data(object);
31261+
31262+ assert("edward-29", info != NULL);
31263+
31264+ /* set file bit */
31265+ info->plugin_mask |= (1 << PSET_FILE);
31266+
31267+ /* set crypto */
31268+ result = inode_set_crypto(object);
31269+ if (result)
31270+ goto error;
31271+ /* set compression */
31272+ result = inode_init_compression(object);
31273+ if (result)
31274+ goto error;
31275+ /* set cluster */
31276+ result = inode_check_cluster(object);
31277+ if (result)
31278+ goto error;
31279+
31280+ /* save everything in disk stat-data */
31281+ result = write_sd_by_inode_common(object);
31282+ if (!result)
31283+ return 0;
31284+ error:
31285+ reiser4_detach_crypto_info(object);
31286+ return result;
31287+}
31288+
31289+/* ->open_object() method of the cryptcompress plugin */
31290+int open_object_cryptcompress(struct inode * inode, struct file * file)
31291+{
31292+ int result;
31293+ struct inode * parent;
31294+
31295+ assert("edward-1394", inode != NULL);
31296+ assert("edward-1395", file != NULL);
31297+ assert("edward-1396", file != NULL);
31298+ assert("edward-1397", file->f_dentry->d_inode == inode);
31299+ assert("edward-1398", file->f_dentry->d_parent != NULL);
31300+ assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31301+ assert("edward-698",
31302+ inode_file_plugin(inode) ==
31303+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
31304+ result = inode_check_cluster(inode);
31305+ if (result)
31306+ return result;
31307+ result = inode_init_compression(inode);
31308+ if (result)
31309+ return result;
31310+ if (!need_cipher(inode))
31311+ /* the file is not to be ciphered */
31312+ return 0;
31313+ parent = file->f_dentry->d_parent->d_inode;
31314+ if (!inode_has_cipher_key(inode))
31315+ return RETERR(-EINVAL);
31316+ return 0;
31317+}
31318+
31319+/* returns a blocksize, the attribute of a cipher algorithm */
31320+static unsigned int
31321+cipher_blocksize(struct inode * inode)
31322+{
31323+ assert("edward-758", need_cipher(inode));
31324+ assert("edward-1400", inode_crypto_info(inode) != NULL);
31325+ return crypto_blkcipher_blocksize
31326+ (info_get_cipher(inode_crypto_info(inode)));
31327+}
31328+
31329+/* returns offset translated by scale factor of the crypto-algorithm */
31330+static loff_t inode_scaled_offset (struct inode * inode,
31331+ const loff_t src_off /* input offset */)
31332+{
31333+ assert("edward-97", inode != NULL);
31334+
31335+ if (!need_cipher(inode) ||
31336+ src_off == get_key_offset(reiser4_min_key()) ||
31337+ src_off == get_key_offset(reiser4_max_key()))
31338+ return src_off;
31339+
31340+ return inode_cipher_plugin(inode)->scale(inode,
31341+ cipher_blocksize(inode),
31342+ src_off);
31343+}
31344+
31345+/* returns disk cluster size */
31346+size_t inode_scaled_cluster_size(struct inode * inode)
31347+{
31348+ assert("edward-110", inode != NULL);
31349+
31350+ return inode_scaled_offset(inode, inode_cluster_size(inode));
31351+}
31352+
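/* Editor's example (a sketch, assuming a ->scale() method that pads
 * each cipher block up to the cipher blocksize): with a 16-byte
 * blocksize a 65536-byte logical cluster scales to 65536 bytes
 * unchanged, while a 65000-byte stream would scale up to 65008,
 * the next multiple of 16. */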
31353+/* set number of cluster pages */
31354+static void set_cluster_nrpages(struct cluster_handle * clust,
31355+ struct inode *inode)
31356+{
31357+ struct reiser4_slide * win;
31358+
31359+ assert("edward-180", clust != NULL);
31360+ assert("edward-1040", inode != NULL);
31361+
31362+ clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
31363+ win = clust->win;
31364+ if (!win) {
31365+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
31366+ return;
31367+ }
31368+ assert("edward-1176", clust->op != LC_INVAL);
31369+ assert("edward-1064", win->off + win->count + win->delta != 0);
31370+
31371+ if (win->stat == HOLE_WINDOW &&
31372+ win->off == 0 && win->count == inode_cluster_size(inode)) {
31373+ /* special case: writing a "fake" logical cluster */
31374+ clust->nr_pages = 0;
31375+ return;
31376+ }
31377+ clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
31378+ lbytes(clust->index, inode)));
31379+ return;
31380+}
31381+
31382+/* plugin->key_by_inode()
31383+ build key of a disk cluster */
31384+int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
31385+ reiser4_key * key)
31386+{
31387+ assert("edward-64", inode != 0);
31388+
31389+ if (likely(off != get_key_offset(reiser4_max_key())))
31390+ off = off_to_clust_to_off(off, inode);
31391+ if (inode_crypto_info(inode))
31392+ off = inode_scaled_offset(inode, off);
31393+
31394+ key_by_inode_and_offset_common(inode, 0, key);
31395+ set_key_offset(key, (__u64)off);
31396+ return 0;
31397+}
31398+
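/* Editor's worked example for key_by_inode_cryptcompress(), assuming
 * the common 64 KiB logical cluster and no cipher: byte offset 70000
 * lies in logical cluster 1, so the key offset becomes 65536; with a
 * cipher attached the result would additionally be passed through
 * inode_scaled_offset(). */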
31399+/* plugin->flow_by_inode() */
31400+/* flow is used to read/write disk clusters */
31401+int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
31402+ int user, /* 1: @buf is of user space,
31403+ 0: kernel space */
31404+ loff_t size, /* @buf size */
31405+ loff_t off, /* offset to start io from */
31406+ rw_op op, /* READ or WRITE */
31407+ flow_t * f /* resulting flow */)
31408+{
31409+ assert("edward-436", f != NULL);
31410+ assert("edward-149", inode != NULL);
31411+ assert("edward-150", inode_file_plugin(inode) != NULL);
31412+ assert("edward-1465", user == 0); /* we use flow to read/write
31413+ disk clusters located in
31414+ kernel space */
31415+ f->length = size;
31416+ memcpy(&f->data, &buf, sizeof(buf));
31417+ f->user = user;
31418+ f->op = op;
31419+
31420+ return key_by_inode_cryptcompress(inode, off, &f->key);
31421+}
31422+
31423+static int
31424+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
31425+ znode_lock_mode lock_mode)
31426+{
31427+ coord_t *coord;
31428+
31429+ assert("edward-704", hint != NULL);
31430+ assert("edward-1089", !hint_is_valid(hint));
31431+ assert("edward-706", hint->lh.owner == NULL);
31432+
31433+ coord = &hint->ext_coord.coord;
31434+
31435+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
31436+ /* hint either not set or set by different operation */
31437+ return RETERR(-E_REPEAT);
31438+
31439+ if (get_key_offset(key) != hint->offset)
31440+ /* hint is set for different key */
31441+ return RETERR(-E_REPEAT);
31442+
31443+ assert("edward-707", reiser4_schedulable());
31444+
31445+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
31446+ key, &hint->lh, lock_mode,
31447+ ZNODE_LOCK_LOPRI);
31448+}
31449+
31450+/* reserve disk space when writing a logical cluster */
31451+static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
31452+{
31453+ int result = 0;
31454+
31455+ assert("edward-965", reiser4_schedulable());
31456+ assert("edward-439", inode != NULL);
31457+ assert("edward-440", clust != NULL);
31458+ assert("edward-441", clust->pages != NULL);
31459+
31460+ if (clust->nr_pages == 0) {
31461+ assert("edward-1152", clust->win != NULL);
31462+ assert("edward-1153", clust->win->stat == HOLE_WINDOW);
31463+ /* don't reserve disk space for fake logical cluster */
31464+ return 0;
31465+ }
31466+ assert("edward-442", jprivate(clust->pages[0]) != NULL);
31467+
31468+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
31469+ estimate_update_cluster(inode),
31470+ BA_CAN_COMMIT);
31471+ if (result)
31472+ return result;
31473+ clust->reserved = 1;
31474+ grabbed2cluster_reserved(estimate_insert_cluster(inode) +
31475+ estimate_update_cluster(inode));
31476+#if REISER4_DEBUG
31477+ clust->reserved_prepped = estimate_update_cluster(inode);
31478+ clust->reserved_unprepped = estimate_insert_cluster(inode);
31479+#endif
31480+ /* there can be space grabbed by txnmgr_force_commit_all */
31481+ return 0;
31482+}
31483+
31484+/* free reserved disk space if writing a logical cluster fails */
31485+static void free_reserved4cluster(struct inode *inode,
31486+ struct cluster_handle *ch, int count)
31487+{
31488+ assert("edward-967", ch->reserved == 1);
31489+
31490+ cluster_reserved2free(count);
31491+ ch->reserved = 0;
31492+}
31493+
31494+/* The core search procedure of the cryptcompress plugin.
31495+ If returned value is not cbk_errored, then current znode is locked */
31496+static int find_cluster_item(hint_t * hint,
31497+ const reiser4_key * key, /* key of the item we are
31498+ looking for */
31499+ znode_lock_mode lock_mode /* which lock */ ,
31500+ ra_info_t * ra_info, lookup_bias bias, __u32 flags)
31501+{
31502+ int result;
31503+ reiser4_key ikey;
31504+ int went_right = 0;
31505+ coord_t *coord = &hint->ext_coord.coord;
31506+ coord_t orig = *coord;
31507+
31508+ assert("edward-152", hint != NULL);
31509+
31510+ if (!hint_is_valid(hint)) {
31511+ result = cryptcompress_hint_validate(hint, key, lock_mode);
31512+ if (result == -E_REPEAT)
31513+ goto traverse_tree;
31514+ else if (result) {
31515+ assert("edward-1216", 0);
31516+ return result;
31517+ }
31518+ hint_set_valid(hint);
31519+ }
31520+ assert("edward-709", znode_is_any_locked(coord->node));
31521+
31522+ /* An in-place lookup is going on here; it means we just need to
31523+ check whether the next item of the @coord matches the @key hint */
31524+
31525+ if (equal_to_rdk(coord->node, key)) {
31526+ result = goto_right_neighbor(coord, &hint->lh);
31527+ if (result == -E_NO_NEIGHBOR) {
31528+ assert("edward-1217", 0);
31529+ return RETERR(-EIO);
31530+ }
31531+ if (result)
31532+ return result;
31533+ assert("edward-1218", equal_to_ldk(coord->node, key));
31534+ went_right = 1;
31535+ } else {
31536+ coord->item_pos++;
31537+ coord->unit_pos = 0;
31538+ coord->between = AT_UNIT;
31539+ }
31540+ result = zload(coord->node);
31541+ if (result)
31542+ return result;
31543+ assert("edward-1219", !node_is_empty(coord->node));
31544+
31545+ if (!coord_is_existing_item(coord)) {
31546+ zrelse(coord->node);
31547+ goto not_found;
31548+ }
31549+ item_key_by_coord(coord, &ikey);
31550+ zrelse(coord->node);
31551+ if (!keyeq(key, &ikey))
31552+ goto not_found;
31553+ /* Ok, item is found, update node counts */
31554+ if (went_right)
31555+ dclust_inc_extension_ncount(hint);
31556+ return CBK_COORD_FOUND;
31557+
31558+ not_found:
31559+ assert("edward-1220", coord->item_pos > 0);
31560+ //coord->item_pos--;
31561+ /* roll back */
31562+ *coord = orig;
31563+ ON_DEBUG(coord_update_v(coord));
31564+ return CBK_COORD_NOTFOUND;
31565+
31566+ traverse_tree:
31567+ assert("edward-713", hint->lh.owner == NULL);
31568+ assert("edward-714", reiser4_schedulable());
31569+
31570+ reiser4_unset_hint(hint);
31571+ dclust_init_extension(hint);
44254afd
MT
31572+ coord_init_zero(coord);
31573+ result = coord_by_key(current_tree, key, coord, &hint->lh,
31574+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
31575+ CBK_UNIQUE | flags, ra_info);
31576+ if (cbk_errored(result))
31577+ return result;
31578+ if (result == CBK_COORD_FOUND)
31579+ dclust_inc_extension_ncount(hint);
31580+ hint_set_valid(hint);
31581+ return result;
31582+}
31583+
31584+/* This function is called by deflate[inflate] manager when
31585+ creating a transformed/plain stream to check if we should
31586+ create/cut some overhead. If this returns true, then @oh
31587+ contains the size of this overhead.
31588+ */
31589+static int need_cut_or_align(struct inode * inode,
31590+ struct cluster_handle * ch, rw_op rw, int * oh)
31591+{
31592+ struct tfm_cluster * tc = &ch->tc;
31593+ switch (rw) {
31594+ case WRITE_OP: /* estimate align */
31595+ *oh = tc->len % cipher_blocksize(inode);
31596+ if (*oh != 0)
31597+ return 1;
31598+ break;
31599+ case READ_OP: /* estimate cut */
31600+ *oh = *(tfm_output_data(ch) + tc->len - 1);
31601+ break;
31602+ default:
31603+ impossible("edward-1401", "bad option");
31604+ }
31605+ return (tc->len != tc->lsize);
31606+}
31607+
31608+/* create/cut an overhead of transformed/plain stream */
31609+static void align_or_cut_overhead(struct inode * inode,
31610+ struct cluster_handle * ch, rw_op rw)
31611+{
31612+ int oh;
31613+ cipher_plugin * cplug = inode_cipher_plugin(inode);
31614+
31615+ assert("edward-1402", need_cipher(inode));
31616+
31617+ if (!need_cut_or_align(inode, ch, rw, &oh))
31618+ return;
31619+ switch (rw) {
31620+ case WRITE_OP: /* do align */
31621+ ch->tc.len +=
31622+ cplug->align_stream(tfm_input_data(ch) +
31623+ ch->tc.len, ch->tc.len,
31624+ cipher_blocksize(inode));
31625+ *(tfm_input_data(ch) + ch->tc.len - 1) =
31626+ cipher_blocksize(inode) - oh;
31627+ break;
31628+ case READ_OP: /* do cut */
31629+ assert("edward-1403", oh <= cipher_blocksize(inode));
31630+ ch->tc.len -= oh;
31631+ break;
31632+ default:
31633+ impossible("edward-1404", "bad option");
31634+ }
31635+ return;
31636+}
31637+
31638+static unsigned max_cipher_overhead(struct inode * inode)
31639+{
31640+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
31641+ return 0;
31642+ return cipher_blocksize(inode);
31643+}
31644+
31645+static int deflate_overhead(struct inode *inode)
31646+{
31647+ return (inode_compression_plugin(inode)->
31648+ checksum ? DC_CHECKSUM_SIZE : 0);
31649+}
31650+
31651+static unsigned deflate_overrun(struct inode * inode, int ilen)
31652+{
31653+ return coa_overrun(inode_compression_plugin(inode), ilen);
31654+}
31655+
31656+/* Estimating compressibility of a logical cluster by various
31657+ policies represented by compression mode plugin.
31658+ If this returns false, then compressor won't be called for
31659+ the cluster of index @index.
31660+*/
31661+static int should_compress(struct tfm_cluster * tc, cloff_t index,
31662+ struct inode *inode)
31663+{
31664+ compression_plugin *cplug = inode_compression_plugin(inode);
31665+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
31666+
31667+ assert("edward-1321", tc->len != 0);
31668+ assert("edward-1322", cplug != NULL);
31669+ assert("edward-1323", mplug != NULL);
31670+
31671+ return /* estimate by size */
31672+ (cplug->min_size_deflate ?
31673+ tc->len >= cplug->min_size_deflate() :
31674+ 1) &&
31675+ /* estimate by compression mode plugin */
31676+ (mplug->should_deflate ?
31677+ mplug->should_deflate(inode, index) :
31678+ 1);
31679+}
31680+
31681+/* Evaluating results of compression transform.
31682+ Returns true, if we need to accept this results */
31683+static int save_compressed(int size_before, int size_after, struct inode *inode)
31684+{
31685+ return (size_after + deflate_overhead(inode) +
31686+ max_cipher_overhead(inode) < size_before);
31687+}
31688+
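/* Editor's illustration of save_compressed() above: with the 4-byte
 * checksum overhead and a 16-byte cipher blocksize, a 65536-byte
 * cluster that compresses to 65520 bytes is rejected, since
 * 65520 + 4 + 16 = 65540 >= 65536 and nothing would be saved. */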
31689+/* Guess result of the evaluation above */
31690+static int need_inflate(struct cluster_handle * ch, struct inode * inode,
31691+ int encrypted /* is cluster encrypted */ )
31692+{
31693+ struct tfm_cluster * tc = &ch->tc;
31694+
31695+ assert("edward-142", tc != 0);
31696+ assert("edward-143", inode != NULL);
31697+
31698+ return tc->len <
31699+ (encrypted ?
31700+ inode_scaled_offset(inode, tc->lsize) :
31701+ tc->lsize);
31702+}
31703+
31704+/* If results of compression were accepted, then we add
31705+ a checksum to catch possible disk cluster corruption.
31706+ The following is a format of the data stored in disk clusters:
31707+
31708+ data This is (transformed) logical cluster.
31709+ cipher_overhead This is created by ->align() method
31710+ of cipher plugin. May be absent.
31711+ checksum (4) This is created by ->checksum method
31712+ of compression plugin to check
31713+ integrity. May be absent.
31714+
31715+ Crypto overhead format:
31716+
31717+ data
31718+ control_byte (1) contains aligned overhead size:
31719+ 1 <= overhead <= cipher_blksize
31720+*/
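/* Editor's worked example of the crypto overhead format above: with a
 * 16-byte cipher blocksize and a 1000-byte stream, the WRITE path pads
 * the stream by 8 bytes up to 1008 and stores 8 in the control byte
 * (the last byte); the READ path reads that byte back and cuts exactly
 * 8 bytes off again (see align_or_cut_overhead()). */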
31721+/* Append a checksum at the end of a transformed stream */
31722+static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
31723+{
31724+ __u32 checksum;
31725+
31726+ assert("edward-1309", tc != NULL);
31727+ assert("edward-1310", tc->len > 0);
31728+ assert("edward-1311", cplug->checksum != NULL);
31729+
31730+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
31731+ put_unaligned(cpu_to_le32(checksum),
31732+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
31733+ tc->len += (int)DC_CHECKSUM_SIZE;
31734+}
31735+
31736+/* Check a disk cluster checksum.
31737+ Returns 0 if checksum is correct, otherwise returns 1 */
31738+static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
31739+{
31740+ assert("edward-1312", tc != NULL);
31741+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
31742+ assert("edward-1314", cplug->checksum != NULL);
31743+
31744+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
31745+ tc->len - (int)DC_CHECKSUM_SIZE) !=
31746+ le32_to_cpu(get_unaligned((d32 *)
31747+ (tfm_stream_data(tc, INPUT_STREAM)
31748+ + tc->len - (int)DC_CHECKSUM_SIZE)))) {
31749+ warning("edward-156",
31750+ "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
31751+ (int)le32_to_cpu
31752+ (get_unaligned((d32 *)
31753+ (tfm_stream_data(tc, INPUT_STREAM) +
31754+ tc->len - (int)DC_CHECKSUM_SIZE))),
31755+ (int)cplug->checksum
31756+ (tfm_stream_data(tc, INPUT_STREAM),
31757+ tc->len - (int)DC_CHECKSUM_SIZE));
31758+ return 1;
31759+ }
31760+ tc->len -= (int)DC_CHECKSUM_SIZE;
31761+ return 0;
31762+}
31763+
31764+/* get input/output stream for some transform action */
31765+int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
31766+ tfm_stream_id id)
31767+{
31768+ size_t size = inode_scaled_cluster_size(inode);
31769+
31770+ assert("edward-901", tc != NULL);
31771+ assert("edward-1027", inode_compression_plugin(inode) != NULL);
31772+
31773+ if (cluster_get_tfm_act(tc) == TFMA_WRITE)
31774+ size += deflate_overrun(inode, inode_cluster_size(inode));
31775+
31776+ if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
31777+ alternate_streams(tc);
31778+ if (!get_tfm_stream(tc, id))
31779+ return alloc_tfm_stream(tc, size, id);
31780+
31781+ assert("edward-902", tfm_stream_is_set(tc, id));
31782+
31783+ if (tfm_stream_size(tc, id) < size)
31784+ return realloc_tfm_stream(tc, size, id);
31785+ return 0;
31786+}
31787+
31788+/* Common deflate manager */
31789+int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
31790+{
31791+ int result = 0;
31792+ int compressed = 0;
31793+ int encrypted = 0;
31794+ struct tfm_cluster * tc = &clust->tc;
31795+ compression_plugin * coplug;
31796+
31797+ assert("edward-401", inode != NULL);
31798+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
31799+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
31800+ assert("edward-498", !tfm_cluster_is_uptodate(tc));
31801+
31802+ coplug = inode_compression_plugin(inode);
31803+ if (should_compress(tc, clust->index, inode)) {
31804+ /* try to compress, discard bad results */
31805+ __u32 dst_len;
31806+ compression_mode_plugin * mplug =
31807+ inode_compression_mode_plugin(inode);
31808+ assert("edward-602", coplug != NULL);
31809+ assert("edward-1423", coplug->compress != NULL);
31810+
31811+ result = grab_coa(tc, coplug);
31812+ if (result) {
31813+ warning("edward-1424",
31814+ "alloc_coa failed with ret=%d, skipped compression",
31815+ result);
31816+ goto cipher;
31817+ }
31818+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31819+ if (result) {
31820+ warning("edward-1425",
31821+ "alloc stream failed with ret=%d, skipped compression",
31822+ result);
31823+ goto cipher;
31824+ }
31825+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
31826+ coplug->compress(get_coa(tc, coplug->h.id, tc->act),
31827+ tfm_input_data(clust), tc->len,
31828+ tfm_output_data(clust), &dst_len);
31829+ /* make sure we didn't overwrite extra bytes */
31830+ assert("edward-603",
31831+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
31832+
31833+ /* evaluate results of compression transform */
31834+ if (save_compressed(tc->len, dst_len, inode)) {
31835+ /* good result, accept */
31836+ tc->len = dst_len;
31837+ if (mplug->accept_hook != NULL) {
31838+ result = mplug->accept_hook(inode, clust->index);
31839+ if (result)
31840+ warning("edward-1426",
31841+ "accept_hook failed with ret=%d",
31842+ result);
31843+ }
31844+ compressed = 1;
31845+ }
31846+ else {
31847+ /* bad result, discard */
31848+#if 0
31849+ if (cluster_is_complete(clust, inode))
31850+ warning("edward-1496",
31851+ "incompressible cluster %lu (inode %llu)",
31852+ clust->index,
31853+ (unsigned long long)get_inode_oid(inode));
31854+#endif
31855+ if (mplug->discard_hook != NULL &&
31856+ cluster_is_complete(clust, inode)) {
31857+ result = mplug->discard_hook(inode,
31858+ clust->index);
31859+ if (result)
31860+ warning("edward-1427",
31861+ "discard_hook failed with ret=%d",
31862+ result);
31863+ }
31864+ }
31865+ }
31866+ cipher:
31867+ if (need_cipher(inode)) {
31868+ cipher_plugin * ciplug;
31869+ struct blkcipher_desc desc;
31870+ struct scatterlist src;
31871+ struct scatterlist dst;
31872+
31873+ ciplug = inode_cipher_plugin(inode);
31874+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
31875+ desc.flags = 0;
31876+ if (compressed)
31877+ alternate_streams(tc);
31878+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31879+ if (result)
31880+ return result;
31881+
31882+ align_or_cut_overhead(inode, clust, WRITE_OP);
31883+ src.page = virt_to_page(tfm_input_data(clust));
31884+ src.offset = offset_in_page(tfm_input_data(clust));
31885+ src.length = tc->len;
31886+
31887+ dst.page = virt_to_page(tfm_output_data(clust));
31888+ dst.offset = offset_in_page(tfm_output_data(clust));
31889+ dst.length = tc->len;
31890+
31891+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
31892+ if (result) {
31893+ warning("edward-1405",
31894+ "encryption failed flags=%x\n", desc.flags);
31895+ return result;
31896+ }
31897+ encrypted = 1;
31898+ }
31899+ if (compressed && coplug->checksum != NULL)
31900+ dc_set_checksum(coplug, tc);
31901+ if (!compressed && !encrypted)
31902+ alternate_streams(tc);
31903+ return result;
31904+}
31905+
31906+/* Common inflate manager. */
31907+int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
31908+{
31909+ int result = 0;
31910+ int transformed = 0;
31911+ struct tfm_cluster * tc = &clust->tc;
31912+ compression_plugin * coplug;
31913+
31914+ assert("edward-905", inode != NULL);
31915+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
31916+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
31917+ assert("edward-1349", tc->act == TFMA_READ);
31918+ assert("edward-907", !tfm_cluster_is_uptodate(tc));
31919+
31920+ /* Handle a checksum (if any) */
31921+ coplug = inode_compression_plugin(inode);
31922+ if (need_inflate(clust, inode, need_cipher(inode)) &&
31923+ coplug->checksum != NULL) {
31924+ result = dc_check_checksum(coplug, tc);
31925+ if (unlikely(result)) {
31926+ warning("edward-1460",
31927+ "Inode %llu: disk cluster %lu looks corrupted",
31928+ (unsigned long long)get_inode_oid(inode),
31929+ clust->index);
31930+ return RETERR(-EIO);
31931+ }
31932+ }
31933+ if (need_cipher(inode)) {
31934+ cipher_plugin * ciplug;
31935+ struct blkcipher_desc desc;
31936+ struct scatterlist src;
31937+ struct scatterlist dst;
31938+
31939+ ciplug = inode_cipher_plugin(inode);
31940+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
31941+ desc.flags = 0;
31942+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31943+ if (result)
31944+ return result;
31945+ assert("edward-909", tfm_cluster_is_set(tc));
31946+
31947+ src.page = virt_to_page(tfm_input_data(clust));
31948+ src.offset = offset_in_page(tfm_input_data(clust));
31949+ src.length = tc->len;
31950+
31951+ dst.page = virt_to_page(tfm_output_data(clust));
31952+ dst.offset = offset_in_page(tfm_output_data(clust));
31953+ dst.length = tc->len;
31954+
31955+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
31956+ if (result) {
31957+ warning("edward-1600", "decrypt failed flags=%x\n",
31958+ desc.flags);
31959+ return result;
31960+ }
31961+ align_or_cut_overhead(inode, clust, READ_OP);
31962+ transformed = 1;
31963+ }
31964+ if (need_inflate(clust, inode, 0)) {
31965+ unsigned dst_len = inode_cluster_size(inode);
31966+ if(transformed)
31967+ alternate_streams(tc);
31968+
31969+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31970+ if (result)
31971+ return result;
31972+ assert("edward-1305", coplug->decompress != NULL);
31973+ assert("edward-910", tfm_cluster_is_set(tc));
31974+
31975+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
31976+ tfm_input_data(clust), tc->len,
31977+ tfm_output_data(clust), &dst_len);
31978+ /* check length */
31979+ tc->len = dst_len;
31980+ assert("edward-157", dst_len == tc->lsize);
31981+ transformed = 1;
31982+ }
31983+ if (!transformed)
31984+ alternate_streams(tc);
31985+ return result;
31986+}
31987+
31988+/* This is implementation of readpage method of struct
31989+ address_space_operations for cryptcompress plugin. */
31990+int readpage_cryptcompress(struct file *file, struct page *page)
31991+{
31992+ reiser4_context *ctx;
31993+ struct cluster_handle clust;
31994+ item_plugin *iplug;
31995+ int result;
31996+
31997+ assert("edward-88", PageLocked(page));
31998+ assert("vs-976", !PageUptodate(page));
31999+ assert("edward-89", page->mapping && page->mapping->host);
32000+
32001+ ctx = reiser4_init_context(page->mapping->host->i_sb);
32002+ if (IS_ERR(ctx)) {
32003+ unlock_page(page);
32004+ return PTR_ERR(ctx);
32005+ }
32006+ assert("edward-113",
32007+ ergo(file != NULL,
32008+ page->mapping == file->f_dentry->d_inode->i_mapping));
32009+
32010+ if (PageUptodate(page)) {
32011+ warning("edward-1338", "page is already uptodate\n");
32012+ unlock_page(page);
32013+ reiser4_exit_context(ctx);
32014+ return 0;
32015+ }
32016+ cluster_init_read(&clust, NULL);
32017+ clust.file = file;
32018+ iplug = item_plugin_by_id(CTAIL_ID);
32019+ if (!iplug->s.file.readpage) {
32020+ unlock_page(page);
32021+ put_cluster_handle(&clust);
32022+ reiser4_exit_context(ctx);
32023+ return -EINVAL;
32024+ }
32025+ result = iplug->s.file.readpage(&clust, page);
32026+
32027+ put_cluster_handle(&clust);
32028+ reiser4_txn_restart(ctx);
32029+ reiser4_exit_context(ctx);
32030+ return result;
32031+}
32032+
32033+/* number of pages to check in */
32034+static int get_new_nrpages(struct cluster_handle * clust)
32035+{
32036+ switch (clust->op) {
32037+ case LC_APPOV:
32038+ return clust->nr_pages;
32039+ case LC_TRUNC:
32040+ assert("edward-1179", clust->win != NULL);
32041+ return size_in_pages(clust->win->off + clust->win->count);
32042+ default:
32043+ impossible("edward-1180", "bad page cluster option");
32044+ return 0;
32045+ }
32046+}
32047+
32048+static void set_cluster_pages_dirty(struct cluster_handle * clust,
32049+ struct inode * inode)
32050+{
32051+ int i;
32052+ struct page *pg;
32053+ int nrpages = get_new_nrpages(clust);
32054+
32055+ for (i = 0; i < nrpages; i++) {
32056+
32057+ pg = clust->pages[i];
32058+ assert("edward-968", pg != NULL);
32059+ lock_page(pg);
32060+ assert("edward-1065", PageUptodate(pg));
32061+ reiser4_set_page_dirty_internal(pg);
32062+ unlock_page(pg);
32063+ mark_page_accessed(pg);
32064+ }
32065+}
32066+
32067+/* Grab a page cluster for read/write operations.
32068+ Attach a jnode for write operations (when preparing for modifications, which
32069+ are supposed to be committed).
32070+
32071+ We allocate only one jnode per page cluster; this jnode is bound to the
32072+ first page of this cluster, so we hold an extra reference that is put as
32073+ soon as the jnode is evicted from memory; other references are cleaned
32074+ up at flush time (assuming that checkin of the page cluster was successful).
32075+*/
32076+int grab_page_cluster(struct inode * inode,
32077+ struct cluster_handle * clust, rw_op rw)
32078+{
32079+ int i;
32080+ int result = 0;
32081+ jnode *node = NULL;
32082+
32083+ assert("edward-182", clust != NULL);
32084+ assert("edward-183", clust->pages != NULL);
32085+ assert("edward-1466", clust->node == NULL);
32086+ assert("edward-1428", inode != NULL);
32087+ assert("edward-1429", inode->i_mapping != NULL);
32088+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32089+
32090+ if (clust->nr_pages == 0)
32091+ return 0;
32092+
32093+ for (i = 0; i < clust->nr_pages; i++) {
32094+
32095+ assert("edward-1044", clust->pages[i] == NULL);
32096+
32097+ clust->pages[i] =
32098+ find_or_create_page(inode->i_mapping,
32099+ clust_to_pg(clust->index, inode) + i,
32100+ reiser4_ctx_gfp_mask_get());
32101+ if (!clust->pages[i]) {
32102+ result = RETERR(-ENOMEM);
32103+ break;
32104+ }
32105+ if (i == 0 && rw == WRITE_OP) {
32106+ node = jnode_of_page(clust->pages[i]);
32107+ if (IS_ERR(node)) {
32108+ result = PTR_ERR(node);
32109+ unlock_page(clust->pages[i]);
32110+ break;
32111+ }
32112+ JF_SET(node, JNODE_CLUSTER_PAGE);
32113+ assert("edward-920", jprivate(clust->pages[0]));
32114+ }
32115+ INODE_PGCOUNT_INC(inode);
32116+ unlock_page(clust->pages[i]);
32117+ }
32118+ if (unlikely(result)) {
32119+ while (i) {
32120+ put_cluster_page(clust->pages[--i]);
32121+ INODE_PGCOUNT_DEC(inode);
32122+ }
32123+ if (node && !IS_ERR(node))
32124+ jput(node);
32125+ return result;
32126+ }
32127+ clust->node = node;
32128+ return 0;
32129+}
32130+
32131+static void truncate_page_cluster_range(struct inode * inode,
32132+ struct page ** pages,
32133+ cloff_t index,
32134+ int from, int count,
32135+ int even_cows)
32136+{
32137+ assert("edward-1467", count > 0);
32138+ reiser4_invalidate_pages(inode->i_mapping,
32139+ clust_to_pg(index, inode) + from,
32140+ count, even_cows);
32141+}
32142+
32143+/* Put @count pages starting from @from offset */
32144+void __put_page_cluster(int from, int count,
32145+ struct page ** pages, struct inode * inode)
32146+{
32147+ int i;
32148+ assert("edward-1468", pages != NULL);
32149+ assert("edward-1469", inode != NULL);
32150+ assert("edward-1470", from >= 0 && count >= 0);
32151+
32152+ for (i = 0; i < count; i++) {
32153+ assert("edward-1471", pages[from + i] != NULL);
32154+ assert("edward-1472",
32155+ pages[from + i]->index == pages[from]->index + i);
32156+
32157+ put_cluster_page(pages[from + i]);
32158+ INODE_PGCOUNT_DEC(inode);
32159+ }
32160+}
32161+
32162+/*
32163+ * This is dual to grab_page_cluster,
32164+ * however if @rw == WRITE_OP, then we call this function
32165+ * only if something failed before checkin of the page cluster.
32166+ */
32167+void put_page_cluster(struct cluster_handle * clust,
32168+ struct inode * inode, rw_op rw)
32169+{
32170+ assert("edward-445", clust != NULL);
32171+ assert("edward-922", clust->pages != NULL);
32172+ assert("edward-446",
32173+ ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
32174+
32175+ __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
32176+ if (rw == WRITE_OP) {
32177+ if (unlikely(clust->node)) {
32178+ assert("edward-447",
32179+ clust->node == jprivate(clust->pages[0]));
32180+ jput(clust->node);
32181+ clust->node = NULL;
32182+ }
32183+ }
32184+}
32185+
32186+#if REISER4_DEBUG
32187+int cryptcompress_inode_ok(struct inode *inode)
32188+{
32189+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
32190+ return 0;
32191+ if (!cluster_shift_ok(inode_cluster_shift(inode)))
32192+ return 0;
32193+ return 1;
32194+}
32195+
32196+static int window_ok(struct reiser4_slide * win, struct inode *inode)
32197+{
32198+ assert("edward-1115", win != NULL);
32199+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32200+
32201+ return (win->off != inode_cluster_size(inode)) &&
32202+ (win->off + win->count + win->delta <= inode_cluster_size(inode));
32203+}
32204+
32205+static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
32206+{
32207+ assert("edward-279", clust != NULL);
32208+
32209+ if (!clust->pages)
32210+ return 0;
32211+ return (clust->win ? window_ok(clust->win, inode) : 1);
32212+}
32213+#if 0
32214+static int pages_truncate_ok(struct inode *inode, pgoff_t start)
32215+{
32216+ int found;
32217+ struct page * page;
32218+
32219+ found = find_get_pages(inode->i_mapping, start, 1, &page);
32220+ if (found)
32221+ put_cluster_page(page);
32222+ return !found;
32223+}
32224+#else
32225+#define pages_truncate_ok(inode, start) 1
32226+#endif
32227+
32228+static int jnode_truncate_ok(struct inode *inode, cloff_t index)
32229+{
32230+ jnode *node;
32231+ node = jlookup(current_tree, get_inode_oid(inode),
32232+ clust_to_pg(index, inode));
32233+ if (likely(!node))
32234+ return 1;
32235+ jput(node);
32236+ return 0;
32237+}
32238+
32239+static int find_fake_appended(struct inode *inode, cloff_t * index);
32240+
32241+static int body_truncate_ok(struct inode *inode, cloff_t aidx)
32242+{
32243+ int result;
32244+ cloff_t raidx;
32245+
32246+ result = find_fake_appended(inode, &raidx);
32247+ return !result && (aidx == raidx);
32248+}
32249+#endif
32250+
32251+/* guess next window stat */
32252+static inline window_stat next_window_stat(struct reiser4_slide * win)
32253+{
32254+ assert("edward-1130", win != NULL);
32255+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32256+ HOLE_WINDOW : DATA_WINDOW);
32257+}
32258+
32259+/* guess and set next cluster index and window params */
32260+static void move_update_window(struct inode * inode,
32261+ struct cluster_handle * clust,
32262+ loff_t file_off, loff_t to_file)
32263+{
32264+ struct reiser4_slide * win;
32265+
32266+ assert("edward-185", clust != NULL);
32267+ assert("edward-438", clust->pages != NULL);
32268+ assert("edward-281", cluster_ok(clust, inode));
32269+
32270+ win = clust->win;
32271+ if (!win)
32272+ return;
32273+
32274+ switch (win->stat) {
32275+ case DATA_WINDOW:
32276+ /* increment */
32277+ clust->index++;
32278+ win->stat = DATA_WINDOW;
32279+ win->off = 0;
32280+ win->count = min((loff_t)inode_cluster_size(inode), to_file);
32281+ break;
32282+ case HOLE_WINDOW:
32283+ switch (next_window_stat(win)) {
32284+ case HOLE_WINDOW:
32285+ /* skip */
32286+ clust->index = off_to_clust(file_off, inode);
32287+ win->stat = HOLE_WINDOW;
32288+ win->off = 0;
32289+ win->count = off_to_cloff(file_off, inode);
32290+ win->delta = min((loff_t)(inode_cluster_size(inode) -
32291+ win->count), to_file);
32292+ break;
32293+ case DATA_WINDOW:
32294+ /* stay */
32295+ win->stat = DATA_WINDOW;
32296+ /* off+count+delta=inv */
32297+ win->off = win->off + win->count;
32298+ win->count = win->delta;
32299+ win->delta = 0;
32300+ break;
32301+ default:
32302+ impossible("edward-282", "wrong next window state");
32303+ }
32304+ break;
32305+ default:
32306+ impossible("edward-283", "wrong current window state");
32307+ }
32308+ assert("edward-1068", cluster_ok(clust, inode));
32309+}
32310+
32311+static int update_sd_cryptcompress(struct inode *inode)
32312+{
32313+ int result = 0;
32314+
32315+ assert("edward-978", reiser4_schedulable());
32316+
32317+ result = reiser4_grab_space_force(/* one for stat data update */
32318+ estimate_update_common(inode),
32319+ BA_CAN_COMMIT);
44254afd
MT
32320+ if (result)
32321+ return result;
32322+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
32323+ result = reiser4_update_sd(inode);
32324+
32325+ return result;
32326+}
32327+
32328+static void uncapture_cluster_jnode(jnode * node)
32329+{
32330+ txn_atom *atom;
32331+
32332+ assert_spin_locked(&(node->guard));
32333+
32334+ atom = jnode_get_atom(node);
32335+ if (atom == NULL) {
32336+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
32337+ spin_unlock_jnode(node);
32338+ return;
32339+ }
32340+ reiser4_uncapture_block(node);
32341+ spin_unlock_atom(atom);
32342+ jput(node);
32343+}
32344+
32345+static void put_found_pages(struct page **pages, int nr)
32346+{
32347+ int i;
32348+ for (i = 0; i < nr; i++) {
32349+ assert("edward-1045", pages[i] != NULL);
32350+ put_cluster_page(pages[i]);
32351+ }
32352+}
32353+
32354+/* Lifecycle of a logical cluster in the system.
32355+ *
32356+ *
32357+ * Logical cluster of a cryptcompress file is represented in the system by
32358+ * . page cluster (in memory, primary cache, contains plain text);
32359+ * . disk cluster (in memory, secondary cache, contains transformed text).
32360+ * The primary cache exists to reduce the number of transform operations
32361+ * (compression, encryption), i.e. to implement a transform-caching strategy.
32362+ * The secondary cache exists to reduce the number of I/O operations, i.e.
32363+ * the usual write-caching strategy. A page cluster is a set of pages, i.e. a mapping of
32364+ * a logical cluster to the primary cache. Disk cluster is a set of items
32365+ * of the same type defined by some reiser4 item plugin id.
32366+ *
32367+ * 1. Performing modifications
32368+ *
32369+ * Every modification of a cryptcompress file is considered as a set of
32370+ * operations performed on the file's logical clusters. Every such "atomic"
32371+ * modification truncates, appends and/or overwrites some bytes of a
32372+ * logical cluster in the primary cache, with subsequent
32373+ * synchronization with the secondary cache (at flush time). Disk clusters,
32374+ * which live in the secondary cache, are supposed to be synchronized with
32375+ * disk. The mechanism of synchronization of primary and secondary caches
32376+ * includes so-called checkin/checkout technique described below.
32377+ *
32378+ * 2. Submitting modifications
32379+ *
32380+ * Each page cluster has associated jnode (a special in-memory header to
32381+ * keep a track of transactions in reiser4), which is attached to its first
32382+ * page when grabbing page cluster for modifications (see grab_page_cluster).
32383+ * Submitting modifications (see checkin_logical_cluster) is going per logical
32384+ * cluster and includes:
32385+ * . checkin_cluster_size;
32386+ * . checkin_page_cluster.
32387+ * checkin_cluster_size() resolves to a file size update (which completely
32388+ * defines the new size of the logical cluster, i.e. the number of the
32389+ * file's bytes in that logical cluster).
32390+ * checkin_page_cluster() captures jnode of a page cluster and installs
32391+ * jnode's dirty flag (if needed) to indicate that modifications are
32392+ * successfully checked in.
32393+ *
32394+ * 3. Checking out modifications
32395+ *
32396+ * Happens per logical cluster at flush time (see checkout_logical_cluster).
32397+ * This is the time of synchronizing primary and secondary caches.
32398+ * checkout_logical_cluster() includes:
32399+ * . checkout_page_cluster (retrieving checked in pages).
32400+ * . uncapture jnode (including clear dirty flag and unlock)
32401+ *
32402+ * 4. Committing modifications
32403+ *
32404+ * This completes the synchronization of primary and secondary caches. When
32405+ * checking out a page cluster (the phase above), pages are locked/flushed/
32406+ * unlocked one by one in ascending order of their indexes into a contiguous
32407+ * stream, which is then transformed (compressed, encrypted), chopped up into
32408+ * items and committed to disk as a disk cluster.
32409+ *
32410+ * 5. Managing page references
32411+ *
32412+ * Every checked-in page has a special additional "control" reference,
32413+ * which is dropped at checkout. We need this to avoid pages being
32414+ * unexpectedly evicted from memory before checkout. Control references
32415+ * are managed so that they do not accumulate with every checkin:
32416+ *
32417+ * 0
32418+ * checkin -> 1
32419+ * 0 -> checkout
32420+ * checkin -> 1
32421+ * checkin -> 1
32422+ * checkin -> 1
32423+ * 0 -> checkout
32424+ * ...
32425+ *
32426+ * Every page cluster has its own unique "cluster lock". Update/drop
32427+ * references are serialized via this lock. Number of checked in cluster
32428+ * pages is calculated by i_size under cluster lock. File size is updated
32429+ * at every checkin action also under cluster lock (except cases of
32430+ * appending/truncating fake logical clusters).
32431+ *
32432+ * Proof of correctness:
32433+ *
32434+ * Since we update file size under cluster lock, in the case of non-fake
32435+ * logical cluster with its lock held we do have expected number of checked
32436+ * in pages. On the other hand, append/truncate of fake logical clusters
32437+ * doesn't change number of checked in pages of any cluster.
32438+ *
32439+ * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode.
32440+ * Currently, I don't see any reason to create a special lock for those
32441+ * needs.
32442+ */
32443+
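/* Editor's condensed sketch (for illustration only) of one
 * modification cycle described above, in the order the hooks run:
 *
 *	grab_page_cluster(inode, clust, WRITE_OP);  <- pages + jnode
 *	...modify pages in the primary cache...
 *	checkin_logical_cluster(clust, inode);      <- size + dirty jnode
 *	...
 *	(flush time) checkout_page_cluster(...);    <- pages -> stream
 *
 * The single "control" reference taken at checkin is what keeps the
 * pages in memory until the matching checkout. */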
32444+static inline void lock_cluster(jnode * node)
32445+{
32446+ spin_lock_jnode(node);
32447+}
32448+
32449+static inline void unlock_cluster(jnode * node)
32450+{
32451+ spin_unlock_jnode(node);
32452+}
32453+
32454+static inline void unlock_cluster_uncapture(jnode * node)
32455+{
32456+ uncapture_cluster_jnode(node);
32457+}
32458+
32459+/* Set new file size by window. Cluster lock is required. */
32460+static void checkin_file_size(struct cluster_handle * clust,
32461+ struct inode * inode)
32462+{
32463+ loff_t new_size;
32464+ struct reiser4_slide * win;
32465+
32466+ assert("edward-1181", clust != NULL);
32467+ assert("edward-1182", inode != NULL);
32468+ assert("edward-1473", clust->pages != NULL);
32469+ assert("edward-1474", clust->pages[0] != NULL);
32470+ assert("edward-1475", jprivate(clust->pages[0]) != NULL);
32471+ assert_spin_locked(&(jprivate(clust->pages[0])->guard));
32472+
32473+
32474+ win = clust->win;
32475+ assert("edward-1183", win != NULL);
32476+
32477+ new_size = clust_to_off(clust->index, inode) + win->off;
32478+
32479+ switch (clust->op) {
32480+ case LC_APPOV:
32481+ if (new_size + win->count <= i_size_read(inode))
32482+ /* overwrite only */
32483+ return;
32484+ new_size += win->count;
32485+ break;
32486+ case LC_TRUNC:
32487+ break;
32488+ default:
32489+ impossible("edward-1184", "bad page cluster option");
32490+ break;
32491+ }
32492+ inode_check_scale_nolock(inode, i_size_read(inode), new_size);
32493+ i_size_write(inode, new_size);
32494+ return;
32495+}
32496+
32497+static inline void checkin_cluster_size(struct cluster_handle * clust,
32498+ struct inode * inode)
32499+{
32500+ if (clust->win)
32501+ checkin_file_size(clust, inode);
32502+}
32503+
32504+static int checkin_page_cluster(struct cluster_handle * clust,
32505+ struct inode * inode)
32506+{
32507+ int result;
32508+ jnode * node;
32509+ int old_nrpages = clust->old_nrpages;
32510+ int new_nrpages = get_new_nrpages(clust);
32511+
32512+ node = clust->node;
32513+
32514+ assert("edward-221", node != NULL);
32515+ assert("edward-971", clust->reserved == 1);
32516+ assert("edward-1263",
32517+ clust->reserved_prepped == estimate_update_cluster(inode));
32518+ assert("edward-1264", clust->reserved_unprepped == 0);
32519+
32520+ if (JF_ISSET(node, JNODE_DIRTY)) {
32521+ /*
32522+ * page cluster was checked in, but not yet
32523+ * checked out, so release related resources
32524+ */
32525+ free_reserved4cluster(inode, clust,
32526+ estimate_update_cluster(inode));
32527+ __put_page_cluster(0, clust->old_nrpages,
32528+ clust->pages, inode);
32529+ } else {
32530+ result = capture_cluster_jnode(node);
32531+ if (unlikely(result)) {
32532+ unlock_cluster(node);
32533+ return result;
32534+ }
32535+ jnode_make_dirty_locked(node);
32536+ clust->reserved = 0;
32537+ }
32538+ unlock_cluster(node);
32539+
32540+ if (new_nrpages < old_nrpages) {
32541+ /* truncate >= 1 complete pages */
32542+ __put_page_cluster(new_nrpages,
32543+ old_nrpages - new_nrpages,
32544+ clust->pages, inode);
32545+ truncate_page_cluster_range(inode,
32546+ clust->pages, clust->index,
32547+ new_nrpages,
32548+ old_nrpages - new_nrpages,
32549+ 0);
32550+ }
32551+#if REISER4_DEBUG
32552+ clust->reserved_prepped -= estimate_update_cluster(inode);
32553+#endif
32554+ return 0;
32555+}
32556+
32557+/* Submit modifications of a logical cluster */
32558+static int checkin_logical_cluster(struct cluster_handle * clust,
32559+ struct inode *inode)
32560+{
32561+ int result = 0;
32562+ jnode * node;
32563+
32564+ node = clust->node;
32565+
32566+ assert("edward-1035", node != NULL);
32567+ assert("edward-1029", clust != NULL);
32568+ assert("edward-1030", clust->reserved == 1);
32569+ assert("edward-1031", clust->nr_pages != 0);
32570+ assert("edward-1032", clust->pages != NULL);
32571+ assert("edward-1033", clust->pages[0] != NULL);
32572+ assert("edward-1446", jnode_is_cluster_page(node));
32573+ assert("edward-1476", node == jprivate(clust->pages[0]));
32574+
32575+ lock_cluster(node);
32576+ checkin_cluster_size(clust, inode);
32577+ /* this will unlock cluster */
32578+ result = checkin_page_cluster(clust, inode);
32579+ jput(node);
32580+ clust->node = NULL;
32581+ return result;
32582+}
32583+
32584+/*
32585+ * Retrieve size of logical cluster that was checked in at
32586+ * the latest modifying session (cluster lock is required)
32587+ */
32588+static inline void checkout_cluster_size(struct cluster_handle * clust,
32589+ struct inode * inode)
32590+{
32591+ struct tfm_cluster *tc = &clust->tc;
32592+
32593+ tc->len = lbytes(clust->index, inode);
32594+ assert("edward-1478", tc->len != 0);
32595+}
32596+
32597+/*
32598+ * Retrieve a page cluster with the latest submitted modifications
32599+ * and flush its pages to previously allocated contiguous stream.
32600+ */
32601+static void checkout_page_cluster(struct cluster_handle * clust,
32602+ jnode * node, struct inode * inode)
32603+{
32604+ int i;
32605+ int found;
32606+ int to_put;
32607+ struct tfm_cluster *tc = &clust->tc;
32608+
32609+ /* find and put checked in pages: cluster is locked,
32610+ * so we must get expected number (to_put) of pages
32611+ */
32612+ to_put = size_in_pages(lbytes(clust->index, inode));
32613+ found = find_get_pages(inode->i_mapping,
32614+ clust_to_pg(clust->index, inode),
32615+ to_put, clust->pages);
32616+ BUG_ON(found != to_put);
32617+
32618+ __put_page_cluster(0, to_put, clust->pages, inode);
32619+ unlock_cluster_uncapture(node);
32620+
32621+ /* Flush found pages.
32622+ *
32623+ * Note that we don't disable modifications while flushing;
32624+ * moreover, some of the found pages can be truncated, as we
32625+ * have released the cluster lock.
32626+ */
32627+ for (i = 0; i < found; i++) {
32628+ int in_page;
32629+ char * data;
32630+ assert("edward-1479",
32631+ clust->pages[i]->index == clust->pages[0]->index + i);
32632+
32633+ lock_page(clust->pages[i]);
32634+ if (!PageUptodate(clust->pages[i])) {
32635+ /* page was truncated */
32636+ assert("edward-1480",
32637+ i_size_read(inode) <= page_offset(clust->pages[i]));
32638+ assert("edward-1481",
32639+ clust->pages[i]->mapping != inode->i_mapping);
32640+ unlock_page(clust->pages[i]);
32641+ break;
32642+ }
32643+ /* Update the number of bytes in the logical cluster,
32644+ * as it could be partially truncated. Note that only
32645+ * partial truncate is possible (complete truncate cannot
32646+ * go here, as it is performed via ->kill_hook()
32647+ * called by cut_file_items(), which must wait for the
32648+ * znode locked with the parent coord).
32649+ */
32650+ checkout_cluster_size(clust, inode);
32651+
32652+ /* this can be zero, as new file size is
32653+ checked in before truncating pages */
32654+ in_page = __mbp(tc->len, i);
32655+
32656+ data = kmap(clust->pages[i]);
32657+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
32658+ data, in_page);
32659+ kunmap(clust->pages[i]);
32660+
32661+ if (PageDirty(clust->pages[i]))
32662+ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
32663+
32664+ unlock_page(clust->pages[i]);
32665+
32666+ if (in_page < PAGE_CACHE_SIZE)
32667+ /* end of the file */
32668+ break;
32669+ }
32670+ put_found_pages(clust->pages, found); /* find_get_pages */
32671+ tc->lsize = tc->len;
32672+ return;
32673+}
32674+
32675+/* Check out modifications of a logical cluster */
32676+int checkout_logical_cluster(struct cluster_handle * clust,
32677+ jnode * node, struct inode *inode)
32678+{
32679+ int result;
32680+ struct tfm_cluster *tc = &clust->tc;
32681+
32682+ assert("edward-980", node != NULL);
32683+ assert("edward-236", inode != NULL);
32684+ assert("edward-237", clust != NULL);
32685+ assert("edward-240", !clust->win);
32686+ assert("edward-241", reiser4_schedulable());
32687+ assert("edward-718", cryptcompress_inode_ok(inode));
32688+
32689+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
32690+ if (result) {
32691+ warning("edward-1430", "alloc stream failed with ret=%d",
32692+ result);
32693+ return RETERR(-E_REPEAT);
32694+ }
32695+ lock_cluster(node);
32696+
32697+ if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
32698+ /* race with another flush */
32699+ warning("edward-982",
32700+ "checking out logical cluster %lu of inode %llu: "
32701+ "jnode is not dirty", clust->index,
32702+ (unsigned long long)get_inode_oid(inode));
32703+ unlock_cluster(node);
32704+ return RETERR(-E_REPEAT);
32705+ }
32706+ cluster_reserved2grabbed(estimate_update_cluster(inode));
32707+
32708+ /* this will unlock cluster */
32709+ checkout_page_cluster(clust, node, inode);
32710+ return 0;
32711+}
32712+
32713+/* set hint for the cluster of the index @index */
32714+static void set_hint_cluster(struct inode *inode, hint_t * hint,
32715+ cloff_t index, znode_lock_mode mode)
32716+{
32717+ reiser4_key key;
32718+ assert("edward-722", cryptcompress_inode_ok(inode));
32719+ assert("edward-723",
32720+ inode_file_plugin(inode) ==
32721+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
32722+
32723+ inode_file_plugin(inode)->key_by_inode(inode,
32724+ clust_to_off(index, inode),
32725+ &key);
32726+
32727+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
32728+ hint->offset = get_key_offset(&key);
32729+ hint->mode = mode;
32730+}
32731+
32732+void invalidate_hint_cluster(struct cluster_handle * clust)
32733+{
32734+ assert("edward-1291", clust != NULL);
32735+ assert("edward-1292", clust->hint != NULL);
32736+
32737+ done_lh(&clust->hint->lh);
32738+ hint_clr_valid(clust->hint);
32739+}
32740+
32741+void put_hint_cluster(struct cluster_handle * clust, struct inode *inode,
32742+ znode_lock_mode mode)
32743+{
32744+ assert("edward-1286", clust != NULL);
32745+ assert("edward-1287", clust->hint != NULL);
32746+
32747+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
32748+ invalidate_hint_cluster(clust);
32749+}
32750+
32751+static int balance_dirty_page_cluster(struct cluster_handle * clust,
32752+ struct inode *inode, loff_t off,
32753+ loff_t to_file)
32754+{
32755+ int result;
32756+ struct cryptcompress_info * info;
32757+
32758+ assert("edward-724", inode != NULL);
32759+ assert("edward-725", cryptcompress_inode_ok(inode));
32760+
32761+ /* set next window params */
32762+ move_update_window(inode, clust, off, to_file);
32763+
32764+ result = update_sd_cryptcompress(inode);
32765+ if (result)
32766+ return result;
32767+ assert("edward-726", clust->hint->lh.owner == NULL);
32768+ info = cryptcompress_inode_data(inode);
32769+
32770+ mutex_unlock(&info->checkin_mutex);
32771+ reiser4_throttle_write(inode);
32772+ mutex_lock(&info->checkin_mutex);
32773+ return 0;
32774+}
32775+
32776+/* write zeroes to the page cluster, process it, and maybe
32777+ try to capture its pages */
32778+static int write_hole(struct inode *inode, struct cluster_handle * clust,
32779+ loff_t file_off, loff_t to_file)
32780+{
32781+ int result = 0;
32782+ unsigned cl_off, cl_count = 0;
32783+ unsigned to_pg, pg_off;
32784+ struct reiser4_slide * win;
32785+
32786+ assert("edward-190", clust != NULL);
32787+ assert("edward-1069", clust->win != NULL);
32788+ assert("edward-191", inode != NULL);
32789+ assert("edward-727", cryptcompress_inode_ok(inode));
32790+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
32791+ assert("edward-1154",
32792+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
32793+
32794+ win = clust->win;
32795+
32796+ assert("edward-1070", win != NULL);
32797+ assert("edward-201", win->stat == HOLE_WINDOW);
32798+ assert("edward-192", cluster_ok(clust, inode));
32799+
32800+ if (win->off == 0 && win->count == inode_cluster_size(inode)) {
32801+ /* This part of the hole will be represented by a "fake"
32802+ * logical cluster, i.e. one which doesn't have an appropriate
32803+ * disk cluster until someone modifies this logical cluster
32804+ * and makes it dirty.
32805+ * So just move forward here.
32806+ */
32807+ move_update_window(inode, clust, file_off, to_file);
32808+ return 0;
32809+ }
32810+ cl_count = win->count; /* number of zeroes to write */
32811+ cl_off = win->off;
32812+ pg_off = off_to_pgoff(win->off);
32813+
32814+ while (cl_count) {
32815+ struct page *page;
32816+ page = clust->pages[off_to_pg(cl_off)];
32817+
32818+ assert("edward-284", page != NULL);
32819+
32820+ to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
32821+ lock_page(page);
32822+ zero_user_page(page, pg_off, to_pg, KM_USER0);
32823+ SetPageUptodate(page);
32824+ reiser4_set_page_dirty_internal(page);
32825+ mark_page_accessed(page);
32826+ unlock_page(page);
32827+
32828+ cl_off += to_pg;
32829+ cl_count -= to_pg;
32830+ pg_off = 0;
32831+ }
32832+ if (!win->delta) {
32833+ /* only zeroes in this window, try to capture
32834+ */
32835+ result = checkin_logical_cluster(clust, inode);
32836+ if (result)
32837+ return result;
32838+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
32839+ result =
32840+ balance_dirty_page_cluster(clust, inode, file_off, to_file);
32841+ } else
32842+ move_update_window(inode, clust, file_off, to_file);
32843+ return result;
32844+}
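/*
 * A stand-alone sketch (illustrative, not from the patch) of how the
 * zeroing loop in write_hole() above splits the byte range
 * [cl_off, cl_off + cl_count) of a cluster into per-page chunks;
 * SKETCH_PAGE_SIZE stands in for PAGE_CACHE_SIZE.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096u

static void sketch_zero_chunks(unsigned cl_off, unsigned cl_count)
{
	unsigned pg_off = cl_off % SKETCH_PAGE_SIZE;

	while (cl_count) {
		unsigned to_pg = SKETCH_PAGE_SIZE - pg_off;

		if (to_pg > cl_count)
			to_pg = cl_count;
		printf("zero page %u at offset %u, %u bytes\n",
		       cl_off / SKETCH_PAGE_SIZE, pg_off, to_pg);
		cl_off += to_pg;
		cl_count -= to_pg;
		pg_off = 0;	/* every page after the first starts at 0 */
	}
}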
32845+
32846+/*
32847+ The main disk search procedure for cryptcompress plugin, which
32848+ . scans all items of disk cluster with the lock mode @mode
32849+ . maybe reads each one (if @read)
32850+ . maybe makes its znode dirty (if write lock mode was specified)
32851+
32852+ NOTE-EDWARD: Callers should handle the case when disk cluster
32853+ is incomplete (-EIO)
32854+*/
32855+int find_disk_cluster(struct cluster_handle * clust,
32856+ struct inode *inode, int read, znode_lock_mode mode)
32857+{
32858+ flow_t f;
32859+ hint_t *hint;
32860+ int result = 0;
32861+ int was_grabbed;
32862+ ra_info_t ra_info;
32863+ file_plugin *fplug;
32864+ item_plugin *iplug;
32865+ struct tfm_cluster *tc;
32866+ struct cryptcompress_info * info;
32867+
32868+ assert("edward-138", clust != NULL);
32869+ assert("edward-728", clust->hint != NULL);
32870+ assert("edward-226", reiser4_schedulable());
32871+ assert("edward-137", inode != NULL);
32872+ assert("edward-729", cryptcompress_inode_ok(inode));
32873+
32874+ hint = clust->hint;
32875+ fplug = inode_file_plugin(inode);
32876+ was_grabbed = get_current_context()->grabbed_blocks;
32877+ info = cryptcompress_inode_data(inode);
32878+ tc = &clust->tc;
32879+
32880+ assert("edward-462", !tfm_cluster_is_uptodate(tc));
32881+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
32882+
32883+ dclust_init_extension(hint);
32884+
32885+ /* set key of the first disk cluster item */
32886+ fplug->flow_by_inode(inode,
32887+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
32888+ 0 /* kernel space */ ,
32889+ inode_scaled_cluster_size(inode),
32890+ clust_to_off(clust->index, inode), READ_OP, &f);
32891+ if (mode == ZNODE_WRITE_LOCK) {
32892+ /* reserve for flush to make dirty all the leaf nodes
32893+ which contain disk cluster */
32894+ result =
32895+ reiser4_grab_space_force(estimate_dirty_cluster(inode),
32896+ BA_CAN_COMMIT);
32897+ if (result)
32898+ goto out;
32899+ }
32900+
32901+ ra_info.key_to_stop = f.key;
32902+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32903+
32904+ while (f.length) {
32905+ result = find_cluster_item(hint, &f.key, mode,
32906+ NULL, FIND_EXACT,
32907+ (mode == ZNODE_WRITE_LOCK ?
32908+ CBK_FOR_INSERT : 0));
32909+ switch (result) {
32910+ case CBK_COORD_NOTFOUND:
32911+ result = 0;
32912+ if (inode_scaled_offset
32913+ (inode, clust_to_off(clust->index, inode)) ==
32914+ get_key_offset(&f.key)) {
32915+ /* first item not found, this is treated
32916+ as disk cluster is absent */
32917+ clust->dstat = FAKE_DISK_CLUSTER;
32918+ goto out;
32919+ }
32920+ /* we are outside the cluster, stop search here */
32921+ assert("edward-146",
32922+ f.length != inode_scaled_cluster_size(inode));
32923+ goto ok;
32924+ case CBK_COORD_FOUND:
32925+ assert("edward-148",
32926+ hint->ext_coord.coord.between == AT_UNIT);
32927+ assert("edward-460",
32928+ hint->ext_coord.coord.unit_pos == 0);
32929+
32930+ coord_clear_iplug(&hint->ext_coord.coord);
32931+ result = zload_ra(hint->ext_coord.coord.node, &ra_info);
32932+ if (unlikely(result))
32933+ goto out;
32934+ iplug = item_plugin_by_coord(&hint->ext_coord.coord);
32935+ assert("edward-147",
32936+ item_id_by_coord(&hint->ext_coord.coord) ==
32937+ CTAIL_ID);
32938+
32939+ result = iplug->s.file.read(NULL, &f, hint);
32940+ if (result) {
32941+ zrelse(hint->ext_coord.coord.node);
32942+ goto out;
32943+ }
32944+ if (mode == ZNODE_WRITE_LOCK) {
32945+ /* Don't dirty more nodes than was
32946+ estimated (see comments before
32947+ estimate_dirty_cluster). Missed nodes will be
32948+ read up at flush time if they have been evicted
32949+ from memory */
32950+ if (dclust_get_extension_ncount(hint) <=
32951+ estimate_dirty_cluster(inode))
32952+ znode_make_dirty(hint->ext_coord.coord.node);
32953+
32954+ znode_set_convertible(hint->ext_coord.coord.
32955+ node);
32956+ }
32957+ zrelse(hint->ext_coord.coord.node);
32958+ break;
32959+ default:
32960+ goto out;
32961+ }
32962+ }
32963+ ok:
32964+ /* at least one item was found */
32965+ /* NOTE-EDWARD: Callers should handle the case
32966+ when disk cluster is incomplete (-EIO) */
32967+ tc->len = inode_scaled_cluster_size(inode) - f.length;
32968+ tc->lsize = lbytes(clust->index, inode);
32969+ assert("edward-1196", tc->len > 0);
32970+ assert("edward-1406", tc->lsize > 0);
32971+
32972+ if (hint_is_unprepped_dclust(clust->hint)) {
32973+ clust->dstat = UNPR_DISK_CLUSTER;
32974+ } else if (clust->index == info->trunc_index) {
32975+ clust->dstat = TRNC_DISK_CLUSTER;
32976+ } else {
32977+ clust->dstat = PREP_DISK_CLUSTER;
32978+ dclust_set_extension_dsize(clust->hint, tc->len);
32979+ }
32980+ out:
32981+ assert("edward-1339",
32982+ get_current_context()->grabbed_blocks >= was_grabbed);
32983+ grabbed2free(get_current_context(),
32984+ get_current_super_private(),
32985+ get_current_context()->grabbed_blocks - was_grabbed);
32986+ return result;
32987+}
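/*
 * The disk-cluster states assigned by find_disk_cluster() above,
 * summarized as a stand-alone sketch (the enum mirrors the outcomes of
 * the code; the names here are illustrative only):
 */
enum sketch_dstat {
	SKETCH_FAKE_DC,		/* first item absent: no disk cluster yet */
	SKETCH_UNPR_DC,		/* hint points at an unprepped disk cluster */
	SKETCH_TRNC_DC,		/* cluster index equals info->trunc_index:
				 * truncation in progress */
	SKETCH_PREP_DC		/* normal case: prepped disk cluster whose
				 * on-disk size is cached in the hint */
};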
32988+
32989+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
32990+ znode_lock_mode lock_mode)
32991+{
32992+ reiser4_key key;
32993+ ra_info_t ra_info;
32994+
32995+ assert("edward-730", reiser4_schedulable());
32996+ assert("edward-731", clust != NULL);
32997+ assert("edward-732", inode != NULL);
32998+
32999+ if (hint_is_valid(clust->hint)) {
33000+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
33001+ assert("edward-1294",
33002+ znode_is_write_locked(clust->hint->lh.node));
33003+ /* already have a valid locked position */
33004+ return (clust->dstat ==
33005+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
33006+ CBK_COORD_FOUND);
33007+ }
33008+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
33009+ &key);
33010+ ra_info.key_to_stop = key;
33011+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
33012+
33013+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
33014+ CBK_FOR_INSERT);
33015+}
33016+
33017+/* Read needed cluster pages before modifying.
33018+ If success, @clust->hint contains locked position in the tree.
33019+ Also:
33020+ . find and set disk cluster state
33021+ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
33022+*/
33023+static int read_some_cluster_pages(struct inode * inode,
33024+ struct cluster_handle * clust)
33025+{
33026+ int i;
33027+ int result = 0;
33028+ item_plugin *iplug;
33029+ struct reiser4_slide * win = clust->win;
33030+ znode_lock_mode mode = ZNODE_WRITE_LOCK;
33031+
33032+ iplug = item_plugin_by_id(CTAIL_ID);
33033+
33034+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
33035+
33036+#if REISER4_DEBUG
33037+ if (clust->nr_pages == 0) {
33038+ /* start write hole from fake disk cluster */
33039+ assert("edward-1117", win != NULL);
33040+ assert("edward-1118", win->stat == HOLE_WINDOW);
33041+ assert("edward-1119", new_logical_cluster(clust, inode));
33042+ }
33043+#endif
33044+ if (new_logical_cluster(clust, inode)) {
33045+ /*
33046+ new page cluster is about to be written, nothing to read
33047+ */
33048+ assert("edward-734", reiser4_schedulable());
33049+ assert("edward-735", clust->hint->lh.owner == NULL);
33050+
33051+ if (clust->nr_pages) {
33052+ int off;
33053+ struct page * pg;
33054+ assert("edward-1419", clust->pages != NULL);
33055+ pg = clust->pages[clust->nr_pages - 1];
33056+ assert("edward-1420", pg != NULL);
33057+ off = off_to_pgoff(win->off+win->count+win->delta);
33058+ if (off) {
33059+ lock_page(pg);
33060+ zero_user_page(pg, off, PAGE_CACHE_SIZE - off,
33061+ KM_USER0);
33062+ unlock_page(pg);
33063+ }
33064+ }
33065+ clust->dstat = FAKE_DISK_CLUSTER;
33066+ return 0;
33067+ }
33068+ /*
33069+ Here we should search for disk cluster to figure out its real state.
33070+ Also there is one more important reason to do disk search: we need
33071+ to make disk cluster _dirty_ if it exists
33072+ */
33073+
33074+ /* if a window is specified, read only the pages
33075+ that will be partially modified */
33076+
33077+ for (i = 0; i < clust->nr_pages; i++) {
33078+ struct page *pg = clust->pages[i];
33079+
33080+ lock_page(pg);
33081+ if (PageUptodate(pg)) {
33082+ unlock_page(pg);
33083+ continue;
33084+ }
33085+ unlock_page(pg);
33086+
33087+ if (win &&
33088+ i >= size_in_pages(win->off) &&
33089+ i < off_to_pg(win->off + win->count + win->delta))
33090+ /* page will be completely overwritten */
33091+ continue;
33092+
33093+ if (win && (i == clust->nr_pages - 1) &&
33094+ /* the last page is
33095+ partially modified,
33096+ not uptodate .. */
33097+ (size_in_pages(i_size_read(inode)) <= pg->index)) {
33098+ /* .. and appended,
33099+ so set zeroes to the rest */
33100+ int offset;
33101+ lock_page(pg);
33102+ assert("edward-1260",
33103+ size_in_pages(win->off + win->count +
33104+ win->delta) - 1 == i);
33105+
33106+ offset =
33107+ off_to_pgoff(win->off + win->count + win->delta);
33108+ zero_user_page(pg, offset, PAGE_CACHE_SIZE - offset,
33109+ KM_USER0);
33110+ unlock_page(pg);
33111+ /* still not uptodate */
33112+ break;
33113+ }
33114+ lock_page(pg);
33115+ result = do_readpage_ctail(inode, clust, pg, mode);
33116+
33117+ assert("edward-1526", ergo(!result, PageUptodate(pg)));
33118+ unlock_page(pg);
33119+ if (result) {
33120+ warning("edward-219", "do_readpage_ctail failed");
33121+ goto out;
33122+ }
33123+ }
33124+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
33125+ /* disk cluster unclaimed, but we need to make its znodes dirty
33126+ * to make flush update convert its content
33127+ */
33128+ result = find_disk_cluster(clust, inode,
33129+ 0 /* do not read items */,
33130+ mode);
33131+ }
33132+ out:
33133+ tfm_cluster_clr_uptodate(&clust->tc);
33134+ return result;
33135+}
33136+
33137+static int should_create_unprepped_cluster(struct cluster_handle * clust,
33138+ struct inode * inode)
33139+{
33140+ assert("edward-737", clust != NULL);
33141+
33142+ switch (clust->dstat) {
33143+ case PREP_DISK_CLUSTER:
33144+ case UNPR_DISK_CLUSTER:
33145+ return 0;
33146+ case FAKE_DISK_CLUSTER:
33147+ if (clust->win &&
33148+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
33149+ assert("edward-1172",
33150+ new_logical_cluster(clust, inode));
33151+ return 0;
33152+ }
33153+ return 1;
33154+ default:
33155+ impossible("edward-1173", "bad disk cluster state");
33156+ return 0;
33157+ }
33158+}
33159+
33160+static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
33161+ struct inode *inode)
33162+{
33163+ int result;
33164+
33165+ assert("edward-1123", reiser4_schedulable());
33166+ assert("edward-737", clust != NULL);
33167+ assert("edward-738", inode != NULL);
33168+ assert("edward-739", cryptcompress_inode_ok(inode));
33169+ assert("edward-1053", clust->hint != NULL);
33170+
33171+ if (!should_create_unprepped_cluster(clust, inode)) {
33172+ if (clust->reserved) {
33173+ cluster_reserved2free(estimate_insert_cluster(inode));
33174+#if REISER4_DEBUG
33175+ assert("edward-1267",
33176+ clust->reserved_unprepped ==
33177+ estimate_insert_cluster(inode));
33178+ clust->reserved_unprepped -=
33179+ estimate_insert_cluster(inode);
33180+#endif
33181+ }
33182+ return 0;
33183+ }
33184+ assert("edward-1268", clust->reserved);
33185+ cluster_reserved2grabbed(estimate_insert_cluster(inode));
33186+#if REISER4_DEBUG
33187+ assert("edward-1441",
33188+ clust->reserved_unprepped == estimate_insert_cluster(inode));
33189+ clust->reserved_unprepped -= estimate_insert_cluster(inode);
33190+#endif
33191+ result = ctail_insert_unprepped_cluster(clust, inode);
33192+ if (result)
33193+ return result;
33194+
33195+ inode_add_bytes(inode, inode_cluster_size(inode));
33196+
33197+ assert("edward-743", cryptcompress_inode_ok(inode));
33198+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
33199+
33200+ clust->dstat = UNPR_DISK_CLUSTER;
33201+ return 0;
33202+}
33203+
33204+/* . Grab page cluster for read, write, setattr, etc. operations;
33205+ * . Truncate its complete pages, if needed;
33206+ */
33207+int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
33208+ rw_op rw)
33209+{
33210+ assert("edward-177", inode != NULL);
33211+ assert("edward-741", cryptcompress_inode_ok(inode));
33212+ assert("edward-740", clust->pages != NULL);
33213+
33214+ set_cluster_nrpages(clust, inode);
33215+ reset_cluster_pgset(clust, cluster_nrpages(inode));
33216+ return grab_page_cluster(inode, clust, rw);
33217+}
33218+
33219+/* Truncate complete page cluster of index @index.
33220+ * This is called by ->kill_hook() method of item
33221+ * plugin when deleting a disk cluster of such index.
33222+ */
33223+void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
33224+ int even_cows)
33225+{
33226+ int found;
33227+ int nr_pages;
33228+ jnode *node;
33229+ struct page *pages[MAX_CLUSTER_NRPAGES];
33230+
33231+ node = jlookup(current_tree, get_inode_oid(inode),
33232+ clust_to_pg(index, inode));
33233+ nr_pages = size_in_pages(lbytes(index, inode));
33234+ assert("edward-1483", nr_pages != 0);
33235+ if (!node)
33236+ goto truncate;
33237+ found = find_get_pages(inode->i_mapping,
33238+ clust_to_pg(index, inode),
33239+ cluster_nrpages(inode), pages);
33240+ if (!found) {
33241+ assert("edward-1484", jnode_truncate_ok(inode, index));
33242+ return;
33243+ }
33244+ lock_cluster(node);
33245+
33246+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
33247+ && index == 0)
33248+ /* converting to unix_file is in progress */
33249+ JF_CLR(node, JNODE_CLUSTER_PAGE);
33250+ if (JF_ISSET(node, JNODE_DIRTY)) {
33251+ /*
33252+ * @nr_pages were checked in, but not yet checked out -
33253+ * we need to release them. (also there can be pages
33254+ * attached to page cache by read(), etc. - don't take
33255+ * them into account).
33256+ */
33257+ assert("edward-1198", found >= nr_pages);
33258+
33259+ /* free disk space grabbed for disk cluster converting */
33260+ cluster_reserved2grabbed(estimate_update_cluster(inode));
33261+ grabbed2free(get_current_context(),
33262+ get_current_super_private(),
33263+ estimate_update_cluster(inode));
33264+ __put_page_cluster(0, nr_pages, pages, inode);
33265+
33266+ /* This will clear dirty bit, uncapture and unlock jnode */
33267+ unlock_cluster_uncapture(node);
33268+ } else
33269+ unlock_cluster(node);
33270+ jput(node); /* jlookup */
33271+ put_found_pages(pages, found); /* find_get_pages */
33272+ truncate:
33273+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
33274+ index == 0)
33275+ return;
33276+ truncate_page_cluster_range(inode, pages, index, 0,
33277+ cluster_nrpages(inode),
33278+ even_cows);
33279+ assert("edward-1201",
33280+ ergo(!reiser4_inode_get_flag(inode,
33281+ REISER4_FILE_CONV_IN_PROGRESS),
33282+ jnode_truncate_ok(inode, index)));
33283+ return;
33284+}
33285+
33286+/*
33287+ * Set cluster handle @clust of a logical cluster before
33288+ * modifications which are supposed to be committed.
33289+ *
33290+ * . grab cluster pages;
33291+ * . reserve disk space;
33292+ * . maybe read pages from disk and set the disk cluster dirty;
33293+ * . maybe write hole and check in (partially zeroed) logical cluster;
33294+ * . create 'unprepped' disk cluster for new or fake logical one.
33295+ */
33296+static int prepare_logical_cluster(struct inode *inode,
33297+ loff_t file_off, /* write position
33298+ in the file */
33299+ loff_t to_file, /* bytes of users data
33300+ to write to the file */
33301+ struct cluster_handle * clust,
33302+ logical_cluster_op op)
33303+{
33304+ int result = 0;
33305+ struct reiser4_slide * win = clust->win;
33306+
33307+ reset_cluster_params(clust);
33308+ cluster_set_tfm_act(&clust->tc, TFMA_READ);
33309+#if REISER4_DEBUG
33310+ clust->ctx = get_current_context();
33311+#endif
33312+ assert("edward-1190", op != LC_INVAL);
33313+
33314+ clust->op = op;
33315+
33316+ result = prepare_page_cluster(inode, clust, WRITE_OP);
33317+ if (result)
33318+ return result;
33319+ assert("edward-1447",
33320+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
33321+ assert("edward-1448",
33322+ ergo(clust->nr_pages != 0,
33323+ jnode_is_cluster_page(jprivate(clust->pages[0]))));
33324+
33325+ result = reserve4cluster(inode, clust);
33326+ if (result)
33327+ goto err1;
33328+ result = read_some_cluster_pages(inode, clust);
33329+ if (result) {
33330+ free_reserved4cluster(inode,
33331+ clust,
33332+ estimate_update_cluster(inode) +
33333+ estimate_insert_cluster(inode));
33334+ goto err1;
33335+ }
33336+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33337+
33338+ result = cryptcompress_make_unprepped_cluster(clust, inode);
33339+ if (result)
33340+ goto err2;
33341+ if (win && win->stat == HOLE_WINDOW) {
33342+ result = write_hole(inode, clust, file_off, to_file);
33343+ if (result)
33344+ goto err2;
33345+ }
33346+ return 0;
33347+ err2:
33348+ free_reserved4cluster(inode, clust,
33349+ estimate_update_cluster(inode));
33350+ err1:
33351+ put_page_cluster(clust, inode, WRITE_OP);
33352+ assert("edward-1125", result == -ENOSPC);
33353+ return result;
33354+}
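/*
 * A schematic sketch (hypothetical helper names) of the acquire/release
 * pairing that prepare_logical_cluster() above maintains on its error
 * paths: the page cluster is put whenever any step after grabbing it
 * fails, and the space reservation is freed for failures past it.
 */
static int sketch_grab(void) { return 0; }
static int sketch_reserve(void) { return 0; }
static int sketch_read_or_dirty(void) { return 0; }
static void sketch_free_reservation(void) {}
static void sketch_put_pages(void) {}

static int sketch_prepare(void)
{
	int err = sketch_grab();	/* prepare_page_cluster()    */

	if (err)
		return err;
	err = sketch_reserve();		/* reserve4cluster()         */
	if (err)
		goto put_pages;
	err = sketch_read_or_dirty();	/* read_some_cluster_pages() */
	if (err)
		goto free_space;
	return 0;
free_space:
	sketch_free_reservation();	/* free_reserved4cluster()   */
put_pages:
	sketch_put_pages();		/* put_page_cluster()        */
	return err;
}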
33355+
33356+/* set window by two offsets */
33357+static void set_window(struct cluster_handle * clust,
33358+ struct reiser4_slide * win, struct inode *inode,
33359+ loff_t o1, loff_t o2)
33360+{
33361+ assert("edward-295", clust != NULL);
33362+ assert("edward-296", inode != NULL);
33363+ assert("edward-1071", win != NULL);
33364+ assert("edward-297", o1 <= o2);
33365+
33366+ clust->index = off_to_clust(o1, inode);
33367+
33368+ win->off = off_to_cloff(o1, inode);
33369+ win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
33370+ o2 - o1);
33371+ win->delta = 0;
33372+
33373+ clust->win = win;
33374+}
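/*
 * Worked example for set_window() above (illustrative numbers): with a
 * cluster size of 65536 bytes and offsets o1 = 70000, o2 = 140000:
 *
 *   clust->index = 70000 / 65536 = 1
 *   win->off = 70000 % 65536 = 4464
 *   win->count = min(65536 - 4464, 140000 - 70000) = min(61072, 70000) = 61072
 *
 * so the window covers the tail of logical cluster 1; the rest of
 * [o1, o2) is handled by subsequent windows.
 */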
33375+
33376+static int set_cluster_by_window(struct inode *inode,
33377+ struct cluster_handle * clust,
33378+ struct reiser4_slide * win, size_t length,
33379+ loff_t file_off)
33380+{
33381+ int result;
33382+
33383+ assert("edward-197", clust != NULL);
33384+ assert("edward-1072", win != NULL);
33385+ assert("edward-198", inode != NULL);
33386+
33387+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33388+ if (result)
33389+ return result;
33390+
33391+ if (file_off > i_size_read(inode)) {
33392+ /* Uhmm, hole in cryptcompress file... */
33393+ loff_t hole_size;
33394+ hole_size = file_off - inode->i_size;
33395+
33396+ set_window(clust, win, inode, inode->i_size, file_off);
33397+ win->stat = HOLE_WINDOW;
33398+ if (win->off + hole_size < inode_cluster_size(inode))
33399+ /* there is also user's data to append to the hole */
33400+ win->delta = min(inode_cluster_size(inode) -
33401+ (win->off + win->count), length);
33402+ return 0;
33403+ }
33404+ set_window(clust, win, inode, file_off, file_off + length);
33405+ win->stat = DATA_WINDOW;
33406+ return 0;
33407+}
33408+
33409+int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
33410+ int count)
33411+{
33412+ int result = 0;
33413+ int (*setting_actor)(struct cluster_handle * clust, int count);
33414+
33415+ assert("edward-1358", clust != NULL);
33416+ assert("edward-1359", page != NULL);
33417+ assert("edward-1360", page->mapping != NULL);
33418+ assert("edward-1361", page->mapping->host != NULL);
33419+
33420+ setting_actor =
33421+ (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33422+ result = setting_actor(clust, count);
33423+ clust->index = pg_to_clust(page->index, page->mapping->host);
33424+ return result;
33425+}
33426+
33427+/* reset all the params that don't get updated */
33428+void reset_cluster_params(struct cluster_handle * clust)
33429+{
33430+ assert("edward-197", clust != NULL);
33431+
33432+ clust->dstat = INVAL_DISK_CLUSTER;
33433+ clust->tc.uptodate = 0;
33434+ clust->tc.len = 0;
33435+}
33436+
33437+static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
33438+ const char __user *buf, size_t to_write,
33439+ loff_t pos, int *conv_occured)
33440+{
33441+ int i;
33442+ hint_t *hint;
33443+ int result = 0;
33444+ size_t count;
33445+ struct reiser4_slide win;
33446+ struct cluster_handle clust;
33447+ struct cryptcompress_info * info;
33448+
33449+ assert("edward-161", reiser4_schedulable());
33450+ assert("edward-748", cryptcompress_inode_ok(inode));
33451+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33452+ assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33453+
33454+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33455+ if (hint == NULL)
33456+ return RETERR(-ENOMEM);
33457+
33458+ result = load_file_hint(file, hint);
33459+ if (result) {
33460+ kfree(hint);
33461+ return result;
33462+ }
33463+ count = to_write;
33464+
33465+ reiser4_slide_init(&win);
33466+ cluster_init_read(&clust, &win);
33467+ clust.hint = hint;
33468+ info = cryptcompress_inode_data(inode);
33469+
33470+ mutex_lock(&info->checkin_mutex);
33471+
33472+ result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
33473+ if (result)
33474+ goto out;
33475+
33476+ if (next_window_stat(&win) == HOLE_WINDOW) {
33477+ /* write the hole in this iteration,
33478+ separately from the loop below */
33479+ result = write_conversion_hook(file, inode,
33480+ pos,
33481+ &clust,
33482+ NULL);
33483+ if (result)
33484+ goto out;
33485+ result = prepare_logical_cluster(inode, pos, count, &clust,
33486+ LC_APPOV);
33487+ if (result)
33488+ goto out;
33489+ }
33490+ do {
33491+ const char __user * src;
33492+ unsigned page_off, to_page;
33493+
33494+ assert("edward-750", reiser4_schedulable());
33495+
33496+ result = write_conversion_hook(file, inode,
33497+ pos + to_write - count,
33498+ &clust,
33499+ conv_occured);
33500+ if (result || *conv_occured)
33501+ goto out;
33502+ result = prepare_logical_cluster(inode, pos, count, &clust,
33503+ LC_APPOV);
33504+ if (result)
33505+ goto out;
33506+
33507+ assert("edward-751", cryptcompress_inode_ok(inode));
33508+ assert("edward-204", win.stat == DATA_WINDOW);
33509+ assert("edward-1288", hint_is_valid(clust.hint));
33510+ assert("edward-752",
33511+ znode_is_write_locked(hint->ext_coord.coord.node));
33512+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33513+
33514+ /* set write position in page */
33515+ page_off = off_to_pgoff(win.off);
33516+
33517+ /* copy user's data to cluster pages */
33518+ for (i = off_to_pg(win.off), src = buf;
33519+ i < size_in_pages(win.off + win.count);
33520+ i++, src += to_page) {
33521+ to_page = __mbp(win.off + win.count, i) - page_off;
33522+ assert("edward-1039",
33523+ page_off + to_page <= PAGE_CACHE_SIZE);
33524+ assert("edward-287", clust.pages[i] != NULL);
33525+
33526+ fault_in_pages_readable(src, to_page);
33527+
33528+ lock_page(clust.pages[i]);
33529+ result =
33530+ __copy_from_user((char *)kmap(clust.pages[i]) +
33531+ page_off, src, to_page);
33532+ kunmap(clust.pages[i]);
33533+ if (unlikely(result)) {
33534+ unlock_page(clust.pages[i]);
33535+ result = -EFAULT;
33536+ goto err2;
33537+ }
33538+ SetPageUptodate(clust.pages[i]);
33539+ reiser4_set_page_dirty_internal(clust.pages[i]);
33540+ flush_dcache_page(clust.pages[i]);
33541+ mark_page_accessed(clust.pages[i]);
33542+ unlock_page(clust.pages[i]);
33543+ page_off = 0;
33544+ }
33545+ assert("edward-753", cryptcompress_inode_ok(inode));
33546+
33547+ result = checkin_logical_cluster(&clust, inode);
33548+ if (result)
33549+ goto err2;
33550+
33551+ buf += win.count;
33552+ count -= win.count;
33553+
33554+ result = balance_dirty_page_cluster(&clust, inode, 0, count);
33555+ if (result)
33556+ goto err1;
33557+ assert("edward-755", hint->lh.owner == NULL);
33558+ reset_cluster_params(&clust);
33559+ continue;
33560+ err2:
33561+ put_page_cluster(&clust, inode, WRITE_OP);
33562+ err1:
44254afd
MT
33563+ if (clust.reserved)
33564+ free_reserved4cluster(inode,
33565+ &clust,
33566+ estimate_update_cluster(inode));
33567+ break;
33568+ } while (count);
33569+ out:
33570+ /*
33571+ * NOTE: at this point file may have
33572+ * another (unix-file) plugin installed
33573+ */
33574+ done_lh(&hint->lh);
33575+ if (result == -EEXIST)
33576+ warning("edward-1407", "write returns EEXIST!\n");
33577+
33578+ put_cluster_handle(&clust);
33579+ save_file_hint(file, hint);
33580+ kfree(hint);
33581+ /*
33582+ * don't release cryptcompress-specific
33583+ * checkin_mutex, if conversion occurred
33584+ */
33585+ if (*conv_occured == 0)
33586+ mutex_unlock(&info->checkin_mutex);
33587+ if (buf) {
33588+ /* if nothing was written, there must be an error */
33589+ assert("edward-195", ergo((to_write == count),
33590+ (result < 0 || *conv_occured)));
33591+ return (to_write - count) ? (to_write - count) : result;
33592+ }
33593+ return result;
33594+}
33595+
33596+/**
33597+ * write_cryptcompress - write of struct file_operations
33598+ * @file: file to write to
33599+ * @buf: address of user-space buffer
33600+ * @count: number of bytes to write
33601+ * @off: position in file to write to
33602+ *
33603+ * This is implementation of vfs's write method of struct file_operations for
33604+ * cryptcompress plugin.
33605+ */
33606+ssize_t write_cryptcompress(struct file *file, const char __user *buf,
33607+ size_t count, loff_t *off, int *conv)
33608+{
33609+ ssize_t result;
33610+ struct inode *inode;
33611+ reiser4_context *ctx;
33612+ loff_t pos = *off;
33613+ struct cryptcompress_info *info;
33614+
33615+ assert("edward-1449", *conv == 0);
33616+
33617+ inode = file->f_dentry->d_inode;
33618+ assert("edward-196", cryptcompress_inode_ok(inode));
33619+
33620+ info = cryptcompress_inode_data(inode);
33621+
33622+ ctx = reiser4_init_context(inode->i_sb);
33623+ if (IS_ERR(ctx))
33624+ return PTR_ERR(ctx);
33625+
33626+ mutex_lock(&inode->i_mutex);
33627+
33628+ result = generic_write_checks(file, &pos, &count, 0);
33629+ if (unlikely(result != 0))
33630+ goto out;
33631+ if (unlikely(count == 0))
33632+ goto out;
33633+ result = remove_suid(file->f_dentry);
33634+ if (unlikely(result != 0))
33635+ goto out;
33636+ /* remove_suid might create a transaction */
33637+ reiser4_txn_restart(ctx);
33638+
33639+ result = do_write_cryptcompress(file, inode, buf, count, pos, conv);
33640+
33641+ if (result < 0)
33642+ goto out;
33643+ /* update position in a file */
33644+ *off = pos + result;
33645+ out:
33646+ mutex_unlock(&inode->i_mutex);
33647+
33648+ context_set_commit_async(ctx);
33649+ reiser4_exit_context(ctx);
33650+ return result;
33651+}
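/*
 * Calling-convention sketch for write_cryptcompress() above
 * (hypothetical caller; the real VFS glue may differ): the function
 * returns the number of bytes written or a negative error, and sets
 * *conv when the file was converted to the unix-file plugin mid-write,
 * in which case the caller is expected to re-dispatch through the new
 * plugin rather than retry here.
 */
static ssize_t sketch_write_dispatch(struct file *file,
				     const char __user *buf,
				     size_t count, loff_t *off)
{
	int conv = 0;
	ssize_t ret = write_cryptcompress(file, buf, count, off, &conv);

	if (conv) {
		/* the file is a unix-file now; continue via the
		 * newly installed plugin's ->write() */
	}
	return ret;
}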
33652+
33653+int readpages_cryptcompress(struct file *file, struct address_space *mapping,
33654+ struct list_head *pages, unsigned nr_pages)
33655+{
33656+ reiser4_context * ctx;
33657+ int ret;
33658+
33659+ ctx = reiser4_init_context(mapping->host->i_sb);
33660+ if (IS_ERR(ctx)) {
33661+ ret = PTR_ERR(ctx);
33662+ goto err;
33663+ }
33664+ /* cryptcompress file can be built of ctail items only */
33665+ ret = readpages_ctail(file, mapping, pages);
33666+ reiser4_txn_restart(ctx);
33667+ reiser4_exit_context(ctx);
33668+ if (ret) {
33669+err:
33670+ put_pages_list(pages);
33671+ }
33672+ return ret;
33673+}
33674+
33675+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
33676+{
33677+ /* reserve one block to update stat data item */
33678+ assert("edward-1193",
33679+ inode_file_plugin(inode)->estimate.update ==
33680+ estimate_update_common);
33681+ return estimate_update_common(inode);
33682+}
33683+
33684+/**
33685+ * read_cryptcompress - read of struct file_operations
33686+ * @file: file to read from
33687+ * @buf: address of user-space buffer
33688+ * @size: number of bytes to read
33689+ * @off: position in file to read from
33690+ *
33691+ * This is implementation of vfs's read method of struct file_operations for
33692+ * cryptcompress plugin.
33693+ */
33694+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
33695+ loff_t * off)
33696+{
33697+ ssize_t result;
33698+ struct inode *inode;
33699+ reiser4_context *ctx;
33700+ struct cryptcompress_info *info;
33701+ reiser4_block_nr needed;
33702+
33703+ inode = file->f_dentry->d_inode;
33704+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
33705+
33706+ ctx = reiser4_init_context(inode->i_sb);
33707+ if (IS_ERR(ctx))
33708+ return PTR_ERR(ctx);
33709+
33710+ info = cryptcompress_inode_data(inode);
33711+ needed = cryptcompress_estimate_read(inode);
33712+
33713+ result = reiser4_grab_space(needed, BA_CAN_COMMIT);
33714+ if (result != 0) {
33715+ reiser4_exit_context(ctx);
33716+ return result;
33717+ }
33718+ result = do_sync_read(file, buf, size, off);
33719+
33720+ context_set_commit_async(ctx);
33721+ reiser4_exit_context(ctx);
33722+
33723+ return result;
33724+}
33725+
33726+/* Look for a disk cluster and keep lookup result in @found.
33727+ * If @index > 0, then find disk cluster of the index (@index - 1);
33728+ * If @index == 0, then find the rightmost disk cluster.
33729+ * Keep incremented index of the found disk cluster in @found.
33730+ * @found == 0 means that the disk cluster was not found (when
33731+ * @index == 0 this means the file doesn't have any disk clusters).
33732+ */
33733+static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
33734+ cloff_t index)
33735+{
33736+ int result;
33737+ reiser4_key key;
33738+ loff_t offset;
33739+ hint_t *hint;
33740+ lock_handle *lh;
33741+ lookup_bias bias;
33742+ coord_t *coord;
33743+ item_plugin *iplug;
33744+
33745+ assert("edward-1131", inode != NULL);
33746+ assert("edward-95", cryptcompress_inode_ok(inode));
33747+
33748+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33749+ if (hint == NULL)
33750+ return RETERR(-ENOMEM);
33751+ hint_init_zero(hint);
33752+ lh = &hint->lh;
33753+
33754+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
33755+ offset =
33756+ (index ? clust_to_off(index, inode) -
33757+ 1 : get_key_offset(reiser4_max_key()));
33758+
33759+ key_by_inode_cryptcompress(inode, offset, &key);
33760+
33761+ /* find the last item of this object */
33762+ result =
33763+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
33764+ bias, 0);
33765+ if (cbk_errored(result)) {
33766+ done_lh(lh);
33767+ kfree(hint);
33768+ return result;
33769+ }
33770+ if (result == CBK_COORD_NOTFOUND) {
33771+ /* no real disk clusters */
33772+ done_lh(lh);
33773+ kfree(hint);
33774+ *found = 0;
33775+ return 0;
33776+ }
33777+ /* disk cluster is found */
33778+ coord = &hint->ext_coord.coord;
33779+ coord_clear_iplug(coord);
33780+ result = zload(coord->node);
33781+ if (unlikely(result)) {
33782+ done_lh(lh);
33783+ kfree(hint);
33784+ return result;
33785+ }
33786+ iplug = item_plugin_by_coord(coord);
33787+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
33788+ assert("edward-1202", ctail_ok(coord));
33789+
33790+ item_key_by_coord(coord, &key);
33791+ *found = off_to_clust(get_key_offset(&key), inode) + 1;
33792+
33793+ assert("edward-1132", ergo(index, index == *found));
33794+
33795+ zrelse(coord->node);
33796+ done_lh(lh);
33797+ kfree(hint);
33798+ return 0;
33799+}
33800+
33801+static int find_fake_appended(struct inode *inode, cloff_t * index)
33802+{
33803+ return lookup_disk_cluster(inode, index,
33804+ 0 /* find last real one */ );
33805+}
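/*
 * Usage sketch for lookup_disk_cluster()/find_fake_appended() above
 * (illustrative): if the rightmost real disk cluster of a file has
 * index 7, find_fake_appended() stores 8 in *index (the first "fake
 * appended" position); for a file with no disk clusters at all it
 * stores 0. Passing @index = n > 0 instead asks whether disk cluster
 * (n - 1) exists: *found == n if it does, 0 otherwise.
 */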
33806+
33807+/* Set left coord when unit is not found after node_lookup()
33808+ This takes into account that there can be holes in a sequence
33809+ of disk clusters */
33810+
33811+static void adjust_left_coord(coord_t * left_coord)
33812+{
33813+ switch (left_coord->between) {
33814+ case AFTER_UNIT:
33815+ left_coord->between = AFTER_ITEM;
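 /* fall through */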
33816+ case AFTER_ITEM:
33817+ case BEFORE_UNIT:
33818+ break;
33819+ default:
33820+ impossible("edward-1204", "bad left coord to cut");
33821+ }
33822+ return;
33823+}
33824+
33825+#define CRC_CUT_TREE_MIN_ITERATIONS 64
33826+int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
33827+ const reiser4_key * to_key,
33828+ reiser4_key * smallest_removed,
33829+ struct inode *object, int truncate,
33830+ int *progress)
33831+{
33832+ lock_handle next_node_lock;
33833+ coord_t left_coord;
33834+ int result;
33835+
33836+ assert("edward-1158", tap->coord->node != NULL);
33837+ assert("edward-1159", znode_is_write_locked(tap->coord->node));
33838+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
33839+
33840+ *progress = 0;
33841+ init_lh(&next_node_lock);
33842+
33843+ while (1) {
33844+ znode *node; /* node from which items are cut */
33845+ node_plugin *nplug; /* node plugin for @node */
33846+
33847+ node = tap->coord->node;
33848+
33849+ /* Move next_node_lock to the next node on the left. */
33850+ result =
33851+ reiser4_get_left_neighbor(&next_node_lock, node,
33852+ ZNODE_WRITE_LOCK,
33853+ GN_CAN_USE_UPPER_LEVELS);
33854+ if (result != 0 && result != -E_NO_NEIGHBOR)
33855+ break;
33856+ /* FIXME-EDWARD: Check can we delete the node as a whole. */
33857+ result = reiser4_tap_load(tap);
33858+ if (result)
33859+ return result;
33860+
33861+ /* Prepare the second (right) point for cut_node() */
33862+ if (*progress)
33863+ coord_init_last_unit(tap->coord, node);
33864+
33865+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
33866+ /* set rightmost unit for the items without lookup method */
33867+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
33868+
33869+ nplug = node->nplug;
33870+
33871+ assert("edward-1161", nplug);
33872+ assert("edward-1162", nplug->lookup);
33873+
33874+ /* left_coord is leftmost unit cut from @node */
33875+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
33876+
33877+ if (IS_CBKERR(result))
33878+ break;
33879+
33880+ if (result == CBK_COORD_NOTFOUND)
33881+ adjust_left_coord(&left_coord);
33882+
33883+ /* adjust coordinates so that they are set to existing units */
33884+ if (coord_set_to_right(&left_coord)
33885+ || coord_set_to_left(tap->coord)) {
33886+ result = 0;
33887+ break;
33888+ }
33889+
33890+ if (coord_compare(&left_coord, tap->coord) ==
33891+ COORD_CMP_ON_RIGHT) {
33892+ /* keys from @from_key to @to_key are not in the tree */
33893+ result = 0;
33894+ break;
33895+ }
33896+
33897+ /* cut data from one node */
33898+ *smallest_removed = *reiser4_min_key();
33899+ result = kill_node_content(&left_coord,
33900+ tap->coord,
33901+ from_key,
33902+ to_key,
33903+ smallest_removed,
33904+ next_node_lock.node,
33905+ object, truncate);
33906+ reiser4_tap_relse(tap);
33907+
33908+ if (result)
33909+ break;
33910+
33911+ ++(*progress);
33912+
33913+ /* Check whether all items with keys >= from_key were removed
33914+ * from the tree. */
33915+ if (keyle(smallest_removed, from_key))
33916+ /* result = 0; */
33917+ break;
33918+
33919+ if (next_node_lock.node == NULL)
33920+ break;
33921+
33922+ result = reiser4_tap_move(tap, &next_node_lock);
33923+ done_lh(&next_node_lock);
33924+ if (result)
33925+ break;
33926+
33927+ /* Break long cut_tree operation (deletion of a large file) if
33928+ * atom requires commit. */
33929+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
33930+ && current_atom_should_commit()) {
33931+ result = -E_REPEAT;
33932+ break;
33933+ }
33934+ }
33935+ done_lh(&next_node_lock);
33936+ return result;
33937+}
33938+
33939+/* Append or expand hole in two steps:
33940+ * 1) set zeroes to the rightmost page of the rightmost non-fake
33941+ * logical cluster;
33942+ * 2) expand hole via fake logical clusters (just increase i_size)
33943+ */
33944+static int cryptcompress_append_hole(struct inode *inode /* with old size */,
33945+ loff_t new_size)
33946+{
33947+ int result = 0;
33948+ hint_t *hint;
33949+ lock_handle *lh;
33950+ loff_t hole_size;
33951+ int nr_zeroes;
33952+ struct reiser4_slide win;
33953+ struct cluster_handle clust;
44254afd
MT
33954+
33955+ assert("edward-1133", inode->i_size < new_size);
33956+ assert("edward-1134", reiser4_schedulable());
33957+ assert("edward-1135", cryptcompress_inode_ok(inode));
33958+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
33959+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
33960+
33961+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33962+ if (hint == NULL)
33963+ return RETERR(-ENOMEM);
33964+ hint_init_zero(hint);
33965+ lh = &hint->lh;
33966+
33967+ reiser4_slide_init(&win);
33968+ cluster_init_read(&clust, &win);
33969+ clust.hint = hint;
33970+
33971+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33972+ if (result)
33973+ goto out;
33974+ if (off_to_cloff(inode->i_size, inode) == 0)
33975+ goto append_fake;
33976+ hole_size = new_size - inode->i_size;
33977+ nr_zeroes =
33978+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
33979+ if (hole_size < nr_zeroes)
33980+ nr_zeroes = hole_size;
33981+ set_window(&clust, &win, inode, inode->i_size,
33982+ inode->i_size + nr_zeroes);
33983+ win.stat = HOLE_WINDOW;
33984+
33985+ assert("edward-1137",
33986+ clust.index == off_to_clust(inode->i_size, inode));
33987+
33988+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
33989+
33990+ assert("edward-1271", !result || result == -ENOSPC);
33991+ if (result)
33992+ goto out;
33993+ assert("edward-1139",
33994+ clust.dstat == PREP_DISK_CLUSTER ||
33995+ clust.dstat == UNPR_DISK_CLUSTER);
33996+
33997+ assert("edward-1431", hole_size >= nr_zeroes);
33998+ if (hole_size == nr_zeroes)
33999+ /* nothing to append anymore */
34000+ goto out;
34001+ append_fake:
34002+ INODE_SET_SIZE(inode, new_size);
34003+ out:
34004+ done_lh(lh);
34005+ kfree(hint);
34006+ put_cluster_handle(&clust);
34007+ return result;
34008+}
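/*
 * Worked example for the zero padding above (illustrative numbers):
 * with a 65536-byte cluster, i_size = 70000 and new_size = 300000,
 * off_to_cloff(i_size) = 70000 % 65536 = 4464, hence
 * nr_zeroes = 65536 - 4464 = 61072: only the tail of the rightmost
 * real cluster is zeroed, and the remaining 300000 - 131072 bytes of
 * hole are represented purely by the larger i_size (fake clusters).
 */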
34009+
34010+static int
34011+update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
34012+{
34013+ return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
34014+ ? 0 : reiser4_update_file_size(inode, key, update_sd));
34015+}
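/*
 * update_cryptcompress_size() above relies on inode_cluster_size()
 * being a power of two: (offset & (cluster_size - 1)) == 0 exactly when
 * offset is cluster-aligned. A stand-alone sketch of the same test:
 */
#include <stdbool.h>

static bool sketch_cluster_aligned(unsigned long long off,
				   unsigned long long cluster_size)
{
	/* valid only when cluster_size is a power of two */
	return (off & (cluster_size - 1)) == 0;
}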
34016+
34017+/* Prune cryptcompress file in two steps:
34018+ * 1) cut all nominated logical clusters except the leftmost one which
34019+ * is to be partially truncated. Note, that there can be "holes"
34020+ * represented by fake logical clusters.
34021+ * 2) set zeroes and capture leftmost partially truncated logical
34022+ * cluster, if it is not fake; otherwise prune fake logical cluster
34023+ * (just decrease i_size).
34024+ */
34025+static int prune_cryptcompress(struct inode *inode, loff_t new_size,
34026+ int update_sd, cloff_t aidx)
34027+{
34028+ int result = 0;
34029+ unsigned nr_zeroes;
34030+ loff_t to_prune;
34031+ loff_t old_size;
34032+ cloff_t ridx;
34033+
34034+ hint_t *hint;
34035+ lock_handle *lh;
34036+ struct reiser4_slide win;
34037+ struct cluster_handle clust;
34038+
34039+ assert("edward-1140", inode->i_size >= new_size);
34040+ assert("edward-1141", reiser4_schedulable());
34041+ assert("edward-1142", cryptcompress_inode_ok(inode));
34042+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
34043+
34044+ old_size = inode->i_size;
34045+
34046+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34047+ if (hint == NULL)
34048+ return RETERR(-ENOMEM);
34049+ hint_init_zero(hint);
34050+ lh = &hint->lh;
34051+
34052+ reiser4_slide_init(&win);
34053+ cluster_init_read(&clust, &win);
34054+ clust.hint = hint;
34055+
34056+ /* calculate index of the rightmost logical cluster
34057+ that will be completely truncated */
34058+ ridx = size_in_lc(new_size, inode);
34059+
34060+ /* truncate all disk clusters starting from @ridx */
34061+ assert("edward-1174", ridx <= aidx);
34062+ old_size = inode->i_size;
34063+ if (ridx != aidx) {
34064+ struct cryptcompress_info * info;
34065+ info = cryptcompress_inode_data(inode);
34066+ result = cut_file_items(inode,
34067+ clust_to_off(ridx, inode),
34068+ update_sd,
34069+ clust_to_off(aidx, inode),
34070+ update_cryptcompress_size);
34071+ info->trunc_index = ULONG_MAX;
34072+ if (result)
34073+ goto out;
34074+ }
34075+ /*
34076+ * there can be pages of fake logical clusters, truncate them
34077+ */
34078+ truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
34079+ assert("edward-1524",
34080+ pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
34081+ /*
34082+ * now perform partial truncate of last logical cluster
34083+ */
34084+ if (!off_to_cloff(new_size, inode)) {
34085+ /* no partial truncate is needed */
34086+ assert("edward-1145", inode->i_size == new_size);
34087+ goto truncate_fake;
34088+ }
34089+ assert("edward-1146", new_size < inode->i_size);
34090+
34091+ to_prune = inode->i_size - new_size;
34092+
34093+ /* check if the last logical cluster is fake */
34094+ result = lookup_disk_cluster(inode, &aidx, ridx);
34095+ if (result)
34096+ goto out;
34097+ if (!aidx)
34098+ /* yup, this is fake one */
34099+ goto truncate_fake;
34100+
34101+ assert("edward-1148", aidx == ridx);
34102+
34103+ /* do partial truncate of the last page cluster,
34104+ and try to capture this one */
34105+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34106+ if (result)
34107+ goto out;
34108+ nr_zeroes = (off_to_pgoff(new_size) ?
34109+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
34110+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
34111+ win.stat = HOLE_WINDOW;
34112+
34113+ assert("edward-1149", clust.index == ridx - 1);
34114+
34115+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
34116+ if (result)
34117+ goto out;
34118+ assert("edward-1151",
34119+ clust.dstat == PREP_DISK_CLUSTER ||
34120+ clust.dstat == UNPR_DISK_CLUSTER);
34121+
34122+ assert("edward-1191", inode->i_size == new_size);
34123+ assert("edward-1206", body_truncate_ok(inode, ridx));
34124+ truncate_fake:
34125+ /* drop all the pages that don't have jnodes (i.e. pages
34126+ which can not be truncated by cut_file_items() because
34127+ of holes represented by fake disk clusters) including
34128+ the pages of partially truncated cluster which was
34129+ released by prepare_logical_cluster() */
34130+ INODE_SET_SIZE(inode, new_size);
34131+ truncate_inode_pages(inode->i_mapping, new_size);
34132+ out:
34133+ assert("edward-1334", !result || result == -ENOSPC);
34134+ assert("edward-1497",
34135+ pages_truncate_ok(inode, size_in_pages(new_size)));
34136+
34137+ done_lh(lh);
34138+ kfree(hint);
34139+ put_cluster_handle(&clust);
34140+ return result;
34141+}
34142+
34143+/* Prepare cryptcompress file for truncate:
34144+ * prune or append rightmost fake logical clusters (if any)
34145+ */
34146+static int start_truncate_fake(struct inode *inode, cloff_t aidx,
34147+ loff_t new_size, int update_sd)
34148+{
34149+ int result = 0;
34150+ int bytes;
34151+
34152+ if (new_size > inode->i_size) {
34153+ /* append */
34154+ if (inode->i_size < clust_to_off(aidx, inode))
34155+ /* no fake bytes */
34156+ return 0;
34157+ bytes = new_size - inode->i_size;
34158+ INODE_SET_SIZE(inode, inode->i_size + bytes);
34159+ } else {
34160+ /* prune */
34161+ if (inode->i_size <= clust_to_off(aidx, inode))
34162+ /* no fake bytes */
34163+ return 0;
34164+ bytes = inode->i_size -
34165+ max(new_size, clust_to_off(aidx, inode));
34166+ if (!bytes)
34167+ return 0;
34168+ INODE_SET_SIZE(inode, inode->i_size - bytes);
34169+ /* In the case of fake prune we need to drop the page cluster.
34170+ There are only 2 cases for a partially truncated page:
34171+ 1. If it is dirty, then it is anonymous
34172+ (was dirtied via mmap), and will be captured
34173+ later via ->capture().
34174+ 2. If it is clean, then it is filled with zeroes.
34175+ In both cases we don't need to make it dirty and
34176+ capture here.
34177+ */
34178+ truncate_inode_pages(inode->i_mapping, inode->i_size);
34179+ }
34180+ if (update_sd)
34181+ result = update_sd_cryptcompress(inode);
34182+ return result;
34183+}
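To make the append/prune arithmetic above concrete, here is a minimal standalone sketch (a fixed 64K cluster stands in for clust_to_off(), and the helper name is illustrative):

	/* bytes pruned by the fake-prune branch above: everything between
	 * max(new_size, start of the fake area) and i_size */
	static long fake_bytes_to_prune(long i_size, long new_size, long aidx)
	{
		long fake_start = aidx * 65536L;	/* clust_to_off(aidx) */

		if (i_size <= fake_start)
			return 0;			/* no fake bytes */
		return i_size - (new_size > fake_start ? new_size : fake_start);
	}

For a 200000-byte file whose fake clusters start at cluster 2 (offset 131072), truncating to 150000 prunes 50000 fake bytes here; the rest of the cut is left to the real truncate path.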
34184+
34185+/* This is called in setattr_cryptcompress when it is used to truncate,
34186+ * and in delete_cryptcompress */
34187+static int cryptcompress_truncate(struct inode *inode, /* old size */
34188+ loff_t new_size, /* new size */
34189+ int update_sd)
34190+{
34191+ int result;
34192+ cloff_t aidx;
34193+
34194+ result = find_fake_appended(inode, &aidx);
34195+ if (result)
34196+ return result;
34197+ assert("edward-1208",
34198+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34199+
34200+ result = start_truncate_fake(inode, aidx, new_size, update_sd);
34201+ if (result)
34202+ return result;
34203+ if (inode->i_size == new_size)
34204+ /* nothing to truncate anymore */
34205+ return 0;
34206+ result = (inode->i_size < new_size ?
34207+ cryptcompress_append_hole(inode, new_size) :
34208+ prune_cryptcompress(inode, new_size, update_sd, aidx));
34209+ if (!result && update_sd)
34210+ result = update_sd_cryptcompress(inode);
34211+ return result;
34212+}
34213+
34214+/* Capture an anonymous page cluster. (A page cluster is
34215+ * anonymous if it contains at least one anonymous page.)
34216+ */
34217+static int capture_anon_page_cluster(struct cluster_handle * clust,
34218+ struct inode * inode)
34219+{
34220+ int result;
34221+
34222+ assert("edward-1073", clust != NULL);
34223+ assert("edward-1074", inode != NULL);
34224+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34225+
34226+ result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
34227+ if (result)
34228+ return result;
34229+ set_cluster_pages_dirty(clust, inode);
34230+ result = checkin_logical_cluster(clust, inode);
34231+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34232+ if (unlikely(result))
34233+ put_page_cluster(clust, inode, WRITE_OP);
34234+ return result;
34235+}
34236+
34237+/* Starting from @index find tagged pages of the same page cluster.
34238+ * Clear the tag for each of them. Return number of found pages.
34239+ */
34240+static int find_anon_page_cluster(struct address_space * mapping,
34241+ pgoff_t * index, struct page ** pages)
34242+{
34243+ int i = 0;
34244+ int found;
34245+ write_lock_irq(&mapping->tree_lock);
34246+ do {
34247+ /* looking for one page */
34248+ found = radix_tree_gang_lookup_tag(&mapping->page_tree,
34249+ (void **)&pages[i],
34250+ *index, 1,
34251+ PAGECACHE_TAG_REISER4_MOVED);
34252+ if (!found)
34253+ break;
34254+ if (!same_page_cluster(pages[0], pages[i]))
34255+ break;
34256+
34257+ /* found */
34258+ page_cache_get(pages[i]);
34259+ *index = pages[i]->index + 1;
34260+
34261+ radix_tree_tag_clear(&mapping->page_tree,
34262+ pages[i]->index,
34263+ PAGECACHE_TAG_REISER4_MOVED);
34264+ if (last_page_in_cluster(pages[i++]))
34265+ break;
34266+ } while (1);
34267+ write_unlock_irq(&mapping->tree_lock);
34268+ return i;
34269+}
34270+
34271+#define MAX_PAGES_TO_CAPTURE (1024)
34272+
34273+/* Capture anonymous page clusters */
34274+static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
34275+ int to_capture)
44254afd 34276+{
34277+ int count = 0;
34278+ int found = 0;
34279+ int result = 0;
34280+ hint_t *hint;
34281+ lock_handle *lh;
34282+ struct inode * inode;
34283+ struct cluster_handle clust;
34284+ struct page * pages[MAX_CLUSTER_NRPAGES];
34285+
34286+ assert("edward-1127", mapping != NULL);
34287+ assert("edward-1128", mapping->host != NULL);
34288+ assert("edward-1440", mapping->host->i_mapping == mapping);
34289+
34290+ inode = mapping->host;
34291+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34292+ if (hint == NULL)
34293+ return RETERR(-ENOMEM);
34294+ hint_init_zero(hint);
34295+ lh = &hint->lh;
34296+
34297+ cluster_init_read(&clust, NULL);
34298+ clust.hint = hint;
34299+
34300+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34301+ if (result)
34302+ goto out;
34303+
34304+ while (to_capture > 0) {
34305+ found = find_anon_page_cluster(mapping, index, pages);
34306+ if (!found) {
34307+ *index = (pgoff_t) - 1;
34308+ break;
34309+ }
34310+ move_cluster_forward(&clust, inode, pages[0]->index);
34311+ result = capture_anon_page_cluster(&clust, inode);
34312+
34313+ put_found_pages(pages, found); /* find_anon_page_cluster */
34314+ if (result)
34315+ break;
34316+ to_capture -= clust.nr_pages;
34317+ count += clust.nr_pages;
34318+ }
34319+ if (result) {
34320+ warning("edward-1077",
34321+ "Capture failed (inode %llu, result=%i, captured=%d)\n",
34322+ (unsigned long long)get_inode_oid(inode), result, count);
34323+ } else {
34324+ assert("edward-1078", ergo(found > 0, count > 0));
34325+ if (to_capture <= 0)
34326+ /* there may be more pages left */
34327+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
34328+ result = count;
34329+ }
34330+ out:
34331+ done_lh(lh);
34332+ kfree(hint);
34333+ put_cluster_handle(&clust);
34334+ return result;
34335+}
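As a sizing note: with 4K pages and MAX_PAGES_TO_CAPTURE = 1024, one pass of the loop above captures at most 1024 pages, i.e. 64 logical clusters at the maximum cluster shift of 16, or 1024 clusters when a cluster is a single page.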
34336+
34337+/* Returns true if inode's mapping has dirty pages
34338+ which do not belong to any atom */
34339+static int cryptcompress_inode_has_anon_pages(struct inode *inode)
34340+{
34341+ int result;
34342+ read_lock_irq(&inode->i_mapping->tree_lock);
34343+ result = radix_tree_tagged(&inode->i_mapping->page_tree,
34344+ PAGECACHE_TAG_REISER4_MOVED);
34345+ read_unlock_irq(&inode->i_mapping->tree_lock);
34346+ return result;
34347+}
34348+
34349+/* This is an implementation of the VFS writepages method of struct
34350+ address_space_operations */
34351+int writepages_cryptcompress(struct address_space *mapping,
34352+ struct writeback_control *wbc)
34353+{
34354+ int result = 0;
34355+ long to_capture;
34356+ pgoff_t nrpages;
34357+ pgoff_t index = 0;
34358+ struct inode *inode;
34359+ struct cryptcompress_info *info;
34360+
34361+ inode = mapping->host;
34362+ if (!cryptcompress_inode_has_anon_pages(inode))
34363+ goto end;
34364+ info = cryptcompress_inode_data(inode);
34365+ nrpages = size_in_pages(i_size_read(inode));
34366+
34367+ if (wbc->sync_mode != WB_SYNC_ALL)
34368+ to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
34369+ else
34370+ to_capture = MAX_PAGES_TO_CAPTURE;
34371+ do {
34372+ reiser4_context *ctx;
34373+
34374+ ctx = reiser4_init_context(inode->i_sb);
34375+ if (IS_ERR(ctx)) {
34376+ result = PTR_ERR(ctx);
34377+ break;
34378+ }
34379+ /* avoid recursive calls to ->sync_inodes */
34380+ ctx->nobalance = 1;
34381+
34382+ assert("edward-1079",
34383+ lock_stack_isclean(get_current_lock_stack()));
34384+
34385+ reiser4_txn_restart_current();
34386+
34387+ if (get_current_context()->entd) {
34388+ if (mutex_trylock(&info->checkin_mutex) == 0) {
34389+ /* the mutex might be occupied by
34390+ entd caller */
34391+ result = RETERR(-EBUSY);
34392+ reiser4_exit_context(ctx);
34393+ break;
34394+ }
34395+ } else
34396+ mutex_lock(&info->checkin_mutex);
34397+
34398+ result = capture_anon_pages(inode->i_mapping, &index,
34399+ to_capture);
34400+ mutex_unlock(&info->checkin_mutex);
34401+
34402+ if (result < 0) {
34403+ reiser4_exit_context(ctx);
34404+ break;
34405+ }
34406+ wbc->nr_to_write -= result;
34407+ if (wbc->sync_mode != WB_SYNC_ALL) {
34408+ reiser4_exit_context(ctx);
34409+ break;
34410+ }
34411+ result = txnmgr_force_commit_all(inode->i_sb, 0);
34412+ reiser4_exit_context(ctx);
34413+ } while (result >= 0 && index < nrpages);
34414+
34415+ end:
34416+ if (is_in_reiser4_context()) {
34417+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34418+ /* there are already pages to flush, flush them out,
34419+ do not delay until end of reiser4_sync_inodes */
34420+ reiser4_writeout(inode->i_sb, wbc);
34421+ get_current_context()->nr_captured = 0;
34422+ }
34423+ }
34424+ return result;
34425+}
34426+
34427+/* plugin->u.file.mmap */
34428+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34429+{
34430+ int result;
34431+ struct inode *inode;
34432+ reiser4_context *ctx;
34433+
34434+ inode = file->f_dentry->d_inode;
34435+ ctx = reiser4_init_context(inode->i_sb);
34436+ if (IS_ERR(ctx))
34437+ return PTR_ERR(ctx);
34438+ /*
34439+ * generic_file_mmap will do update_atime. Grab space for stat data
34440+ * update.
34441+ */
34442+ result = reiser4_grab_space_force
34443+ (inode_file_plugin(inode)->estimate.update(inode),
34444+ BA_CAN_COMMIT);
34445+ if (result) {
34446+ reiser4_exit_context(ctx);
34447+ return result;
34448+ }
34449+ result = generic_file_mmap(file, vma);
34450+ reiser4_exit_context(ctx);
34451+ return result;
34452+}
34453+
34454+/* plugin->u.file.release */
34455+/* plugin->u.file.get_block */
34456+
34457+/* this is an implementation of the delete method of the file plugin for
34458+ * cryptcompress objects
34459+ */
34460+int delete_object_cryptcompress(struct inode *inode)
34461+{
34462+ int result;
34463+ struct cryptcompress_info * info;
34464+
34465+ assert("edward-429", inode->i_nlink == 0);
34466+
34467+ reiser4_txn_restart_current();
34468+ info = cryptcompress_inode_data(inode);
34469+
34470+ mutex_lock(&info->checkin_mutex);
34471+ result = cryptcompress_truncate(inode, 0, 0);
34472+ mutex_unlock(&info->checkin_mutex);
34473+
34474+ if (result) {
34475+ warning("edward-430",
34476+ "cannot truncate cryptcompress file %lli: %i",
34477+ (unsigned long long)get_inode_oid(inode),
34478+ result);
34479+ }
34480+ truncate_inode_pages(inode->i_mapping, 0);
34481+ assert("edward-1487", pages_truncate_ok(inode, 0));
34482+ /* and remove stat data */
34483+ return reiser4_delete_object_common(inode);
34484+}
34485+
34486+/* plugin->u.file.setattr method
34487+ This implements actual truncate (see comments in reiser4/page_cache.c) */
34488+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
34489+{
34490+ int result;
34491+ struct inode *inode;
34492+ struct cryptcompress_info * info;
34493+
34494+ inode = dentry->d_inode;
34495+ info = cryptcompress_inode_data(inode);
34496+
34497+ if (attr->ia_valid & ATTR_SIZE) {
34498+ if (i_size_read(inode) != attr->ia_size) {
44254afd
MT
34499+ reiser4_context *ctx;
34500+ loff_t old_size;
34501+
34502+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
34503+ if (IS_ERR(ctx))
34504+ return PTR_ERR(ctx);
34505+
34506+ old_size = i_size_read(inode);
34507+ inode_check_scale(inode, old_size, attr->ia_size);
34508+
34509+ mutex_lock(&info->checkin_mutex);
34510+ result = cryptcompress_truncate(inode,
34511+ attr->ia_size,
34512+ 1/* update sd */);
34513+ mutex_unlock(&info->checkin_mutex);
34514+ if (result) {
34515+ warning("edward-1192",
34516+ "truncate_cryptcompress failed: oid %lli, "
34517+ "old size %lld, new size %lld, retval %d",
34518+ (unsigned long long)
34519+ get_inode_oid(inode), old_size,
34520+ attr->ia_size, result);
34521+ }
34522+ context_set_commit_async(ctx);
34523+ reiser4_exit_context(ctx);
34524+ } else
34525+ result = 0;
34526+ } else
34527+ result = reiser4_setattr_common(dentry, attr);
34528+ return result;
34529+}
34530+
34531+/* sendfile_cryptcompress - sendfile of struct file_operations */
34532+ssize_t
34533+sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
34534+ read_actor_t actor, void *target)
34535+{
34536+ reiser4_context *ctx;
34537+ ssize_t result;
34538+ struct inode *inode;
34539+ struct cryptcompress_info *info;
34540+
34541+ inode = file->f_dentry->d_inode;
34542+ ctx = reiser4_init_context(inode->i_sb);
34543+ if (IS_ERR(ctx))
34544+ return PTR_ERR(ctx);
34545+ /*
34546+ * generic_file_sendfile may want to call update_atime. Grab space for
34547+ * stat data update
34548+ */
34549+ result = reiser4_grab_space(estimate_update_common(inode),
34550+ BA_CAN_COMMIT);
34551+ if (result)
34552+ goto exit;
34553+ info = cryptcompress_inode_data(inode);
34554+
34555+ result = generic_file_sendfile(file, ppos, count, actor, target);
34556+ exit:
34557+ reiser4_exit_context(ctx);
34558+ return result;
34559+}
34560+
34561+/*
34562+ * release_cryptcompress - release of struct file_operations
34563+ * @inode: inode of released file
34564+ * @file: file to release
34565+ */
34566+int release_cryptcompress(struct inode *inode, struct file *file)
34567+{
34568+ reiser4_context *ctx = reiser4_init_context(inode->i_sb);
34569+
34570+ if (IS_ERR(ctx))
34571+ return PTR_ERR(ctx);
34572+ reiser4_free_file_fsdata(file);
34573+ reiser4_exit_context(ctx);
34574+ return 0;
34575+}
34576+
34577+#if 0
34578+int prepare_write_cryptcompress(struct file *file, struct page *page,
34579+ unsigned from, unsigned to)
34580+{
34581+ return prepare_write_common(file, page, from, to);
34582+}
34583+#endif /* 0 */
34584+
34585+
34586+/*
34587+ Local variables:
34588+ c-indentation-style: "K&R"
34589+ mode-name: "LC"
34590+ c-basic-offset: 8
34591+ tab-width: 8
34592+ fill-column: 80
34593+ scroll-step: 1
34594+ End:
34595+*/
34596diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.h
34597--- linux-2.6.22.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 03:00:00.000000000 +0300
34598+++ linux-2.6.22/fs/reiser4/plugin/file/cryptcompress.h 2007-07-29 00:26:21.804839975 +0400
34599@@ -0,0 +1,607 @@
34600+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
34601+/* See http://www.namesys.com/cryptcompress_design.html */
34602+
34603+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
34604+#define __FS_REISER4_CRYPTCOMPRESS_H__
34605+
34606+#include "../../page_cache.h"
34607+#include "../compress/compress.h"
34608+#include "../crypto/cipher.h"
34609+
34610+#include <linux/pagemap.h>
34611+
34612+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
34613+#define MAX_CLUSTER_SHIFT 16
34614+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
34615+#define DC_CHECKSUM_SIZE 4
34616+
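Worked example for the macros above: with 4K pages (PAGE_CACHE_SHIFT = 12), MAX_CLUSTER_NRPAGES expands to 1U << 16 >> 12 = 16, i.e. a maximal 64K logical cluster spans 16 pages, while at MIN_CLUSTER_SHIFT a cluster is exactly one page.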
34617+#define MIN_LATTICE_FACTOR 1
34618+#define MAX_LATTICE_FACTOR 32
34619+
34620+/* this mask contains all non-standard plugins that might
34621+ be present in reiser4-specific part of inode managed by
34622+ cryptcompress file plugin */
34623+#define cryptcompress_mask \
34624+ ((1 << PSET_FILE) | \
34625+ (1 << PSET_CLUSTER) | \
34626+ (1 << PSET_CIPHER) | \
34627+ (1 << PSET_DIGEST) | \
34628+ (1 << PSET_COMPRESSION) | \
34629+ (1 << PSET_COMPRESSION_MODE))
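A hedged sketch of how such a mask is typically consulted (the helper below is illustrative and not part of the patch):

	/* nonzero iff plugin-set member `member' is one of those managed
	 * by the cryptcompress file plugin, per cryptcompress_mask above */
	static inline int in_cryptcompress_mask(int member)
	{
		return (cryptcompress_mask & (1 << member)) != 0;
	}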
34630+
34631+#if REISER4_DEBUG
34632+static inline int cluster_shift_ok(int shift)
34633+{
34634+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
34635+}
34636+#endif
34637+
34638+#if REISER4_DEBUG
34639+#define INODE_PGCOUNT(inode) \
34640+({ \
34641+ assert("edward-1530", inode_file_plugin(inode) == \
34642+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
34643+ atomic_read(&cryptcompress_inode_data(inode)->pgcount); \
34644+ })
34645+#define INODE_PGCOUNT_INC(inode) \
34646+do { \
34647+ assert("edward-1531", inode_file_plugin(inode) == \
34648+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
34649+ atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \
34650+} while (0)
34651+#define INODE_PGCOUNT_DEC(inode) \
34652+do { \
34653+ if (inode_file_plugin(inode) == \
34654+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \
34655+ atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \
34656+} while (0)
34657+#else
34658+#define INODE_PGCOUNT(inode) (0)
34659+#define INODE_PGCOUNT_INC(inode)
34660+#define INODE_PGCOUNT_DEC(inode)
34661+#endif /* REISER4_DEBUG */
34662+
34663+struct tfm_stream {
34664+ __u8 *data;
34665+ size_t size;
34666+};
34667+
34668+typedef enum {
34669+ INPUT_STREAM,
34670+ OUTPUT_STREAM,
34671+ LAST_STREAM
34672+} tfm_stream_id;
34673+
34674+typedef struct tfm_stream * tfm_unit[LAST_STREAM];
34675+
34676+static inline __u8 *ts_data(struct tfm_stream * stm)
34677+{
34678+ assert("edward-928", stm != NULL);
34679+ return stm->data;
34680+}
34681+
34682+static inline size_t ts_size(struct tfm_stream * stm)
34683+{
34684+ assert("edward-929", stm != NULL);
34685+ return stm->size;
34686+}
34687+
34688+static inline void set_ts_size(struct tfm_stream * stm, size_t size)
34689+{
34690+ assert("edward-930", stm != NULL);
34691+
34692+ stm->size = size;
34693+}
34694+
34695+static inline int alloc_ts(struct tfm_stream ** stm)
34696+{
34697+ assert("edward-931", stm);
34698+ assert("edward-932", *stm == NULL);
34699+
34700+ *stm = kmalloc(sizeof **stm, reiser4_ctx_gfp_mask_get());
34701+ if (*stm == NULL)
34702+ return -ENOMEM;
34703+ memset(*stm, 0, sizeof **stm);
34704+ return 0;
34705+}
34706+
34707+static inline void free_ts(struct tfm_stream * stm)
34708+{
34709+ assert("edward-933", !ts_data(stm));
34710+ assert("edward-934", !ts_size(stm));
34711+
34712+ kfree(stm);
34713+}
34714+
34715+static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
34716+{
34717+ assert("edward-935", !ts_data(stm));
34718+ assert("edward-936", !ts_size(stm));
34719+ assert("edward-937", size != 0);
34720+
34721+ stm->data = reiser4_vmalloc(size);
34722+ if (!stm->data)
34723+ return -ENOMEM;
34724+ set_ts_size(stm, size);
34725+ return 0;
34726+}
34727+
34728+static inline void free_ts_data(struct tfm_stream * stm)
34729+{
34730+ assert("edward-938", equi(ts_data(stm), ts_size(stm)));
34731+
34732+ if (ts_data(stm))
34733+ vfree(ts_data(stm));
34734+ memset(stm, 0, sizeof *stm);
34735+}
34736+
34737+/* Write modes for item conversion in flush convert phase */
34738+typedef enum {
34739+ CRC_APPEND_ITEM = 1,
34740+ CRC_OVERWRITE_ITEM = 2,
34741+ CRC_CUT_ITEM = 3
34742+} cryptcompress_write_mode_t;
34743+
34744+typedef enum {
34745+ LC_INVAL = 0, /* invalid value */
34746+ LC_APPOV = 1, /* append and/or overwrite */
34747+ LC_TRUNC = 2 /* truncate */
34748+} logical_cluster_op;
34749+
34750+/* Transform cluster.
34751+ * Intermediate state between page cluster and disk cluster
34752+ * Is used for data transform (compression/encryption)
34753+ */
34754+struct tfm_cluster {
34755+ coa_set coa; /* compression algorithms info */
34756+ tfm_unit tun; /* plain and transformed streams */
34757+ tfm_action act;
34758+ int uptodate;
34759+ int lsize; /* number of bytes in logical cluster */
34760+ int len; /* length of the transform stream */
34761+};
34762+
34763+static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
34764+ tfm_action act)
44254afd
MT
34765+{
34766+ return tc->coa[id][act];
34767+}
34768+
34769+static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
34770+ tfm_action act, coa_t coa)
34771+{
34772+ tc->coa[id][act] = coa;
34773+}
34774+
34775+static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
34776+{
34777+ coa_t coa;
34778+
34779+ coa = cplug->alloc(tc->act);
34780+ if (IS_ERR(coa))
34781+ return PTR_ERR(coa);
34782+ set_coa(tc, cplug->h.id, tc->act, coa);
34783+ return 0;
34784+}
34785+
34786+static inline int
34787+grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
34788+{
34789+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
34790+ alloc_coa(tc, cplug) : 0);
34791+}
34792+
34793+static inline void free_coa_set(struct tfm_cluster * tc)
34794+{
34795+ tfm_action j;
34796+ reiser4_compression_id i;
34797+ compression_plugin *cplug;
34798+
34799+ assert("edward-810", tc != NULL);
34800+
34801+ for (j = 0; j < TFMA_LAST; j++)
34802+ for (i = 0; i < LAST_COMPRESSION_ID; i++) {
34803+ if (!get_coa(tc, i, j))
34804+ continue;
34805+ cplug = compression_plugin_by_id(i);
34806+ assert("edward-812", cplug->free != NULL);
34807+ cplug->free(get_coa(tc, i, j), j);
34808+ set_coa(tc, i, j, 0);
34809+ }
34810+ return;
34811+}
34812+
34813+static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
34814+ tfm_stream_id id)
34815+{
34816+ return tc->tun[id];
34817+}
34818+
34819+static inline void set_tfm_stream(struct tfm_cluster * tc,
34820+ tfm_stream_id id, struct tfm_stream * ts)
34821+{
34822+ tc->tun[id] = ts;
34823+}
34824+
34825+static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
34826+{
34827+ return ts_data(get_tfm_stream(tc, id));
34828+}
34829+
34830+static inline void set_tfm_stream_data(struct tfm_cluster * tc,
34831+ tfm_stream_id id, __u8 * data)
34832+{
34833+ get_tfm_stream(tc, id)->data = data;
34834+}
34835+
34836+static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
34837+{
34838+ return ts_size(get_tfm_stream(tc, id));
34839+}
34840+
34841+static inline void
34842+set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
34843+{
34844+ get_tfm_stream(tc, id)->size = size;
34845+}
34846+
34847+static inline int
34848+alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
34849+{
34850+ assert("edward-939", tc != NULL);
34851+ assert("edward-940", !get_tfm_stream(tc, id));
34852+
34853+ tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
34854+ reiser4_ctx_gfp_mask_get());
34855+ if (!tc->tun[id])
34856+ return -ENOMEM;
34857+ return alloc_ts_data(get_tfm_stream(tc, id), size);
34858+}
34859+
34860+static inline int
34861+realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
34862+{
34863+ assert("edward-941", tfm_stream_size(tc, id) < size);
34864+ free_ts_data(get_tfm_stream(tc, id));
34865+ return alloc_ts_data(get_tfm_stream(tc, id), size);
34866+}
34867+
34868+static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
34869+{
34870+ free_ts_data(get_tfm_stream(tc, id));
34871+ free_ts(get_tfm_stream(tc, id));
34872+ set_tfm_stream(tc, id, 0);
34873+}
34874+
34875+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
34876+{
34877+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
34878+}
34879+
34880+static inline void free_tfm_unit(struct tfm_cluster * tc)
34881+{
34882+ tfm_stream_id id;
34883+ for (id = 0; id < LAST_STREAM; id++) {
34884+ if (!get_tfm_stream(tc, id))
34885+ continue;
34886+ free_tfm_stream(tc, id);
34887+ }
34888+}
34889+
34890+static inline void put_tfm_cluster(struct tfm_cluster * tc)
34891+{
34892+ assert("edward-942", tc != NULL);
34893+ free_coa_set(tc);
34894+ free_tfm_unit(tc);
34895+}
34896+
34897+static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
34898+{
34899+ assert("edward-943", tc != NULL);
34900+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
34901+ return (tc->uptodate == 1);
34902+}
34903+
34904+static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
34905+{
34906+ assert("edward-945", tc != NULL);
34907+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
34908+ tc->uptodate = 1;
34909+ return;
34910+}
34911+
34912+static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
34913+{
34914+ assert("edward-947", tc != NULL);
34915+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
34916+ tc->uptodate = 0;
34917+ return;
34918+}
34919+
34920+static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
34921+{
34922+ return (get_tfm_stream(tc, id) &&
34923+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
34924+}
34925+
34926+static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
34927+{
34928+ int i;
34929+ for (i = 0; i < LAST_STREAM; i++)
34930+ if (!tfm_stream_is_set(tc, i))
34931+ return 0;
34932+ return 1;
34933+}
34934+
34935+static inline void alternate_streams(struct tfm_cluster * tc)
34936+{
34937+ struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
34938+
34939+ set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
34940+ set_tfm_stream(tc, OUTPUT_STREAM, tmp);
34941+}
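alternate_streams() is what makes it possible to chain several transforms (e.g. compression followed by encryption): each stage reads INPUT_STREAM and writes OUTPUT_STREAM, then the two are swapped. A generic standalone sketch of that double-buffer pattern (all names illustrative):

	struct buf { unsigned char *data; unsigned long len; };
	typedef void (*stage_fn)(const struct buf *in, struct buf *out);

	static void run_stages(stage_fn *stage, int nr, struct buf *a, struct buf *b)
	{
		struct buf *in = a, *out = b, *t;
		int i;

		for (i = 0; i < nr; i++) {
			stage[i](in, out);	/* e.g. compress, then encrypt */
			t = in; in = out; out = t;	/* output feeds next stage */
		}
	}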
34942+
34943+/* Set of states to indicate a kind of data
34944+ * that will be written to the window */
34945+typedef enum {
34946+ DATA_WINDOW, /* user's data */
34947+ HOLE_WINDOW /* zeroes (such kind of data can be written
34948+ * if we start to write from offset > i_size) */
34949+} window_stat;
34950+
34951+/* Window (of logical cluster size) discretely sliding along a file.
34952+ * Is used to locate hole region in a logical cluster to be properly
34953+ * represented on disk.
34954+ * We split a write to cryptcompress file into writes to its logical
34955+ * clusters. Before writing to a logical cluster we set a window, i.e.
34956+ * calculate values of the following fields:
34957+ */
34958+struct reiser4_slide {
34959+ unsigned off; /* offset to write from */
34960+ unsigned count; /* number of bytes to write */
34961+ unsigned delta; /* number of bytes to append to the hole */
34962+ window_stat stat; /* what kind of data will be written starting
34963+ from @off */
34964+};
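As a hedged illustration (not the patch's own helper) of how these fields relate for an ordinary write of len bytes at file offset pos, assuming the write fits inside one logical cluster of size 1 << shift:

	static void fill_slide_example(struct reiser4_slide *win,
				       unsigned long long pos, unsigned len,
				       int shift)
	{
		win->off = pos & ((1ULL << shift) - 1);	/* offset inside cluster */
		win->count = len;			/* user bytes to write */
		win->delta = 0;				/* no hole appended */
		win->stat = DATA_WINDOW;
	}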
34965+
34966+/* Possible states of a disk cluster */
34967+typedef enum {
34968+ INVAL_DISK_CLUSTER, /* unknown state */
34969+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush
34970+ * at least 1 time */
34971+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be
34972+ * converted by flush */
34973+ FAKE_DISK_CLUSTER, /* disk cluster doesn't exist neither in memory
34974+ * nor on disk */
34975+ TRNC_DISK_CLUSTER /* disk cluster is partially truncated */
34976+} disk_cluster_stat;
34977+
34978+/* The following structure represents various stages of the same logical
34979+ * cluster of index @index:
34980+ * . fixed slide
34981+ * . page cluster (stage in primary cache)
34982+ * . transform cluster (transition stage)
34983+ * . disk cluster (stage in secondary cache)
34984+ * This structure is used in transition and synchronizing operations, e.g.
34985+ * transform cluster is a transition state when synchronizing page cluster
34986+ * and disk cluster.
34987+ * FIXME: Encapsulate page cluster, disk cluster.
34988+ */
34989+struct cluster_handle {
34990+ cloff_t index; /* offset in a file (unit is a cluster size) */
34991+ int index_valid; /* for validating the index above, if needed */
34992+ struct file *file; /* host file */
34993+
34994+ /* logical cluster */
34995+ struct reiser4_slide *win; /* sliding window to locate holes */
34996+ logical_cluster_op op; /* logical cluster operation (truncate or
34997+ append/overwrite) */
34998+ /* transform cluster */
34999+ struct tfm_cluster tc; /* contains all needed info to synchronize
35000+ page cluster and disk cluster */
35001+ /* page cluster */
35002+ int nr_pages; /* number of pages of current checkin action */
35003+ int old_nrpages; /* number of pages of last checkin action */
35004+ struct page **pages; /* attached pages */
35005+ jnode * node; /* jnode for capture */
35006+
35007+ /* disk cluster */
35008+ hint_t *hint; /* current position in the tree */
35009+ disk_cluster_stat dstat; /* state of the current disk cluster */
35010+ int reserved; /* is space for disk cluster reserved */
35011+#if REISER4_DEBUG
35012+ reiser4_context *ctx;
35013+ int reserved_prepped;
35014+ int reserved_unprepped;
35015+#endif
35016+
35017+};
35018+
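The handle's typical lifecycle, as exercised by capture_anon_pages() in cryptcompress.c above, is roughly the following (a hedged outline with error handling trimmed):

	struct cluster_handle clust;

	cluster_init_read(&clust, NULL);	/* no sliding window */
	clust.hint = hint;			/* cached tree position */
	alloc_cluster_pgset(&clust, cluster_nrpages(inode));
	/* ... move_cluster_forward() / prepare_logical_cluster() ... */
	put_cluster_handle(&clust);		/* frees page array and streams */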
35019+static inline __u8 * tfm_input_data (struct cluster_handle * clust)
35020+{
35021+ return tfm_stream_data(&clust->tc, INPUT_STREAM);
35022+}
35023+
35024+static inline __u8 * tfm_output_data (struct cluster_handle * clust)
35025+{
35026+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
35027+}
35028+
35029+static inline int reset_cluster_pgset(struct cluster_handle * clust,
35030+ int nrpages)
35031+{
35032+ assert("edward-1057", clust->pages != NULL);
35033+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
35034+ return 0;
35035+}
35036+
35037+static inline int alloc_cluster_pgset(struct cluster_handle * clust,
35038+ int nrpages)
35039+{
35040+ assert("edward-949", clust != NULL);
35041+ assert("edward-1362", clust->pages == NULL);
35042+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
35043+
35044+ clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
35045+ reiser4_ctx_gfp_mask_get());
35046+ if (!clust->pages)
35047+ return RETERR(-ENOMEM);
35048+ return 0;
35049+}
35050+
35051+static inline void free_cluster_pgset(struct cluster_handle * clust)
35052+{
35053+ assert("edward-951", clust->pages != NULL);
35054+ kfree(clust->pages);
35055+ clust->pages = NULL;
35056+}
35057+
35058+static inline void put_cluster_handle(struct cluster_handle * clust)
35059+{
35060+ assert("edward-435", clust != NULL);
35061+
35062+ put_tfm_cluster(&clust->tc);
35063+ if (clust->pages)
35064+ free_cluster_pgset(clust);
35065+ memset(clust, 0, sizeof *clust);
35066+}
35067+
35068+static inline void inc_keyload_count(struct reiser4_crypto_info * data)
35069+{
35070+ assert("edward-1410", data != NULL);
35071+ data->keyload_count++;
35072+}
35073+
35074+static inline void dec_keyload_count(struct reiser4_crypto_info * data)
35075+{
35076+ assert("edward-1411", data != NULL);
35077+ assert("edward-1412", data->keyload_count > 0);
35078+ data->keyload_count--;
35079+}
35080+
35081+static inline int capture_cluster_jnode(jnode * node)
35082+{
35083+ return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
35084+}
35085+
35086+/* cryptcompress specific part of reiser4_inode */
35087+struct cryptcompress_info {
35088+ struct mutex checkin_mutex; /* This is to serialize
35089+ * checkin_logical_cluster operations */
35090+ cloff_t trunc_index; /* Index of the leftmost truncated disk
35091+ * cluster (to resolve races with read) */
35092+ struct reiser4_crypto_info *crypt;
35093+ /*
35094+ * the following 2 fields are controlled by compression mode plugin
35095+ */
35096+ int compress_toggle; /* Current status of compressibility */
35097+ int lattice_factor; /* Factor of dynamic lattice. FIXME: Have
35098+ * a compression_toggle to keep the factor
35099+ */
35100+#if REISER4_DEBUG
35101+ atomic_t pgcount; /* number of grabbed pages */
35102+#endif
35103+};
35104+
35105+static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
35106+{
35107+ info->compress_toggle = val;
35108+}
35109+
35110+static inline int get_compression_toggle (struct cryptcompress_info * info)
35111+{
35112+ return info->compress_toggle;
35113+}
35114+
35115+static inline int compression_is_on(struct cryptcompress_info * info)
35116+{
35117+ return get_compression_toggle(info) == 1;
35118+}
35119+
35120+static inline void turn_on_compression(struct cryptcompress_info * info)
35121+{
35122+ set_compression_toggle(info, 1);
35123+}
35124+
35125+static inline void turn_off_compression(struct cryptcompress_info * info)
35126+{
35127+ set_compression_toggle(info, 0);
35128+}
35129+
35130+static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
35131+{
35132+ info->lattice_factor = val;
35133+}
35134+
35135+static inline int get_lattice_factor(struct cryptcompress_info * info)
35136+{
35137+ return info->lattice_factor;
35138+}
35139+
35140+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
35141+int equal_to_rdk(znode *, const reiser4_key *);
35142+int goto_right_neighbor(coord_t *, lock_handle *);
35143+int cryptcompress_inode_ok(struct inode *inode);
35144+int coord_is_unprepped_ctail(const coord_t * coord);
35145+extern int ctail_read_disk_cluster (struct cluster_handle *, struct inode *,
35146+ struct page *, znode_lock_mode mode);
35147+extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
35148+ struct page * page, znode_lock_mode mode);
35149+extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
35150+ struct inode * inode);
35151+extern int readpages_cryptcompress(struct file*, struct address_space*,
35152+ struct list_head*, unsigned);
35153+int bind_cryptcompress(struct inode *child, struct inode *parent);
35154+void destroy_inode_cryptcompress(struct inode * inode);
35155+int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
35156+ rw_op rw);
35157+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
35158+ struct cluster_handle * clust, int * progress);
35159+struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
35160+void inherit_crypto_info_common(struct inode * parent, struct inode * object,
35161+ int (*can_inherit)(struct inode * child,
35162+ struct inode * parent));
35163+void reiser4_attach_crypto_info(struct inode * inode,
35164+ struct reiser4_crypto_info * info);
35165+void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
35166+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
35167+
35168+static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
35169+{
35170+ return info->cipher;
35171+}
35172+
35173+static inline void info_set_cipher(struct reiser4_crypto_info * info,
35174+ struct crypto_blkcipher * tfm)
44254afd 35175+{
35176+ info->cipher = tfm;
35177+}
35178+
35179+static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
35180+{
35181+ return info->digest;
35182+}
35183+
35184+static inline void info_set_digest(struct reiser4_crypto_info * info,
35185+ struct crypto_hash * tfm)
44254afd 35186+{
35187+ info->digest = tfm;
35188+}
35189+
35190+static inline void put_cluster_page(struct page * page)
35191+{
35192+ page_cache_release(page);
35193+}
35194+
35195+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
35196+
35197+/* Make Linus happy.
35198+ Local variables:
35199+ c-indentation-style: "K&R"
35200+ mode-name: "LC"
35201+ c-basic-offset: 8
35202+ tab-width: 8
35203+ fill-column: 120
35204+ scroll-step: 1
35205+ End:
35206+*/
35207diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/file.c linux-2.6.22/fs/reiser4/plugin/file/file.c
35208--- linux-2.6.22.orig/fs/reiser4/plugin/file/file.c 1970-01-01 03:00:00.000000000 +0300
35209+++ linux-2.6.22/fs/reiser4/plugin/file/file.c 2007-07-29 00:25:34.924708901 +0400
35210@@ -0,0 +1,2817 @@
35211+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
35212+ * reiser4/README */
35213+
35214+/*
35215+ * this file contains implementations of inode/file/address_space/file plugin
35216+ * operations specific for "unix file plugin" (plugin id is
35217+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
35218+ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
35219+ * no items but stat data)
35220+ */
35221+
35222+#include "../../inode.h"
35223+#include "../../super.h"
35224+#include "../../tree_walk.h"
35225+#include "../../carry.h"
35226+#include "../../page_cache.h"
35227+#include "../../ioctl.h"
35228+#include "../object.h"
35229+#include "../cluster.h"
35230+#include "../../safe_link.h"
35231+
35232+#include <linux/writeback.h>
35233+#include <linux/pagevec.h>
35234+#include <linux/syscalls.h>
35235+
35236+
35237+static int unpack(struct file *file, struct inode *inode, int forever);
35238+static void drop_access(struct unix_file_info *);
35239+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35240+ znode_lock_mode lock_mode);
35241+
35242+/* Get exclusive access and make sure that file is not partially
35243+ * converted (It may happen that another process is doing tail
35244+ * conversion. If so, wait until it completes)
35245+ */
35246+static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
35247+ struct inode *inode)
35248+{
35249+ do {
35250+ get_exclusive_access(uf_info);
35251+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
35252+ break;
35253+ drop_exclusive_access(uf_info);
35254+ schedule();
35255+ } while (1);
35256+}
35257+
35258+/* get unix file plugin specific portion of inode */
35259+struct unix_file_info *unix_file_inode_data(const struct inode *inode)
35260+{
35261+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35262+}
35263+
35264+/**
35265+ * equal_to_rdk - compare key and znode's right delimiting key
35266+ * @node: node whose right delimiting key to compare with @key
35267+ * @key: key to compare with @node's right delimiting key
35268+ *
35269+ * Returns true if @key is equal to right delimiting key of @node.
35270+ */
35271+int equal_to_rdk(znode *node, const reiser4_key *key)
35272+{
35273+ int result;
35274+
35275+ read_lock_dk(znode_get_tree(node));
35276+ result = keyeq(key, znode_get_rd_key(node));
35277+ read_unlock_dk(znode_get_tree(node));
35278+ return result;
35279+}
35280+
35281+#if REISER4_DEBUG
35282+
35283+/**
35284+ * equal_to_ldk - compare key and znode's left delimiting key
35285+ * @node: node whose left delimiting key to compare with @key
35286+ * @key: key to compare with @node's left delimiting key
35287+ *
35288+ * Returns true if @key is equal to left delimiting key of @node.
35289+ */
35290+int equal_to_ldk(znode *node, const reiser4_key *key)
35291+{
35292+ int result;
35293+
35294+ read_lock_dk(znode_get_tree(node));
35295+ result = keyeq(key, znode_get_ld_key(node));
35296+ read_unlock_dk(znode_get_tree(node));
35297+ return result;
35298+}
35299+
35300+/**
35301+ * check_coord - check whether coord corresponds to key
35302+ * @coord: coord to check
35303+ * @key: key @coord has to correspond to
35304+ *
35305+ * Returns true if @coord is set as if it was set as result of lookup with @key
35306+ * in coord->node.
35307+ */
35308+static int check_coord(const coord_t *coord, const reiser4_key *key)
35309+{
35310+ coord_t twin;
35311+
35312+ node_plugin_by_node(coord->node)->lookup(coord->node, key,
35313+ FIND_MAX_NOT_MORE_THAN, &twin);
35314+ return coords_equal(coord, &twin);
35315+}
35316+
35317+#endif /* REISER4_DEBUG */
35318+
35319+/**
35320+ * init_uf_coord - initialize extended coord
35321+ * @uf_coord:
35322+ * @lh:
35323+ *
35324+ *
35325+ */
35326+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35327+{
35328+ coord_init_zero(&uf_coord->coord);
35329+ coord_clear_iplug(&uf_coord->coord);
35330+ uf_coord->lh = lh;
35331+ init_lh(lh);
35332+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35333+ uf_coord->valid = 0;
35334+}
35335+
35336+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35337+{
35338+ assert("vs-1333", uf_coord->valid == 0);
35339+
35340+ if (coord_is_between_items(&uf_coord->coord))
35341+ return;
35342+
35343+ assert("vs-1348",
35344+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35345+ init_coord_extension);
35346+
35347+ item_body_by_coord(&uf_coord->coord);
35348+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35349+ init_coord_extension(uf_coord, offset);
35350+}
35351+
35352+/**
35353+ * goto_right_neighbor - lock right neighbor, drop current node lock
35354+ * @coord:
35355+ * @lh:
35356+ *
35357+ * Obtain lock on right neighbor and drop lock on current node.
35358+ */
35359+int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35360+{
35361+ int result;
35362+ lock_handle lh_right;
35363+
35364+ assert("vs-1100", znode_is_locked(coord->node));
35365+
35366+ init_lh(&lh_right);
35367+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
35368+ znode_is_wlocked(coord->node) ?
35369+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35370+ GN_CAN_USE_UPPER_LEVELS);
35371+ if (result) {
35372+ done_lh(&lh_right);
35373+ return result;
35374+ }
35375+
35376+ /*
35377+ * we hold two longterm locks on neighboring nodes. Unlock left of
35378+ * them
35379+ */
35380+ done_lh(lh);
35381+
35382+ coord_init_first_unit_nocheck(coord, lh_right.node);
35383+ move_lh(lh, &lh_right);
35384+
35385+ return 0;
35386+
35387+}
35388+
35389+/**
35390+ * set_file_state
35391+ * @uf_info:
35392+ * @cbk_result:
35393+ * @level:
35394+ *
35395+ * This is to be used by find_file_item and in find_file_state to
35396+ * determine real state of file
35397+ */
35398+static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
35399+ tree_level level)
35400+{
35401+ if (cbk_errored(cbk_result))
35402+ /* error happened in find_file_item */
35403+ return;
35404+
35405+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35406+
35407+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35408+ if (cbk_result == CBK_COORD_NOTFOUND)
35409+ uf_info->container = UF_CONTAINER_EMPTY;
35410+ else if (level == LEAF_LEVEL)
35411+ uf_info->container = UF_CONTAINER_TAILS;
35412+ else
35413+ uf_info->container = UF_CONTAINER_EXTENTS;
35414+ } else {
35415+ /*
35416+ * file state is known, check whether it is set correctly if
35417+ * file is not being tail converted
35418+ */
35419+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35420+ REISER4_PART_IN_CONV)) {
35421+ assert("vs-1162",
35422+ ergo(level == LEAF_LEVEL &&
35423+ cbk_result == CBK_COORD_FOUND,
35424+ uf_info->container == UF_CONTAINER_TAILS));
35425+ assert("vs-1165",
35426+ ergo(level == TWIG_LEVEL &&
35427+ cbk_result == CBK_COORD_FOUND,
35428+ uf_info->container == UF_CONTAINER_EXTENTS));
35429+ }
35430+ }
35431+}
35432+
35433+int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35434+ const reiser4_key *key, znode_lock_mode lock_mode,
35435+ struct inode *inode)
35436+{
35437+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
35438+ FIND_MAX_NOT_MORE_THAN,
35439+ TWIG_LEVEL, LEAF_LEVEL,
35440+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35441+ (CBK_UNIQUE | CBK_FOR_INSERT),
35442+ NULL /* ra_info */ );
35443+}
35444+
35445+/**
35446+ * find_file_item - look for file item in the tree
35447+ * @hint: provides coordinate, lock handle, seal
35448+ * @key: key for search
35449+ * @mode: mode of lock to put on returned node
35450+ * @ra_info:
35451+ * @inode:
35452+ *
35453+ * This finds position in the tree corresponding to @key. It first tries to use
35454+ * @hint's seal if it is set.
35455+ */
35456+int find_file_item(hint_t *hint, const reiser4_key *key,
35457+ znode_lock_mode lock_mode,
35458+ struct inode *inode)
35459+{
35460+ int result;
35461+ coord_t *coord;
35462+ lock_handle *lh;
35463+
35464+ assert("nikita-3030", reiser4_schedulable());
35465+ assert("vs-1707", hint != NULL);
35466+ assert("vs-47", inode != NULL);
35467+
35468+ coord = &hint->ext_coord.coord;
35469+ lh = hint->ext_coord.lh;
35470+ init_lh(lh);
35471+
35472+ result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35473+ if (!result) {
35474+ if (coord->between == AFTER_UNIT &&
35475+ equal_to_rdk(coord->node, key)) {
35476+ result = goto_right_neighbor(coord, lh);
35477+ if (result == -E_NO_NEIGHBOR)
35478+ return RETERR(-EIO);
35479+ if (result)
35480+ return result;
35481+ assert("vs-1152", equal_to_ldk(coord->node, key));
35482+ /*
35483+ * we moved to different node. Invalidate coord
35484+ * extension, zload is necessary to init it again
35485+ */
35486+ hint->ext_coord.valid = 0;
35487+ }
35488+
35489+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35490+ znode_get_level(coord->node));
35491+
35492+ return CBK_COORD_FOUND;
35493+ }
35494+
35495+ coord_init_zero(coord);
35496+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35497+ set_file_state(unix_file_inode_data(inode), result,
35498+ znode_get_level(coord->node));
35499+
35500+ /* FIXME: we might already have coord extension initialized */
35501+ hint->ext_coord.valid = 0;
35502+ return result;
35503+}
35504+
35505+/* plugin->u.file.write_flow = NULL
35506+ plugin->u.file.read_flow = NULL */
35507+
35508+void hint_init_zero(hint_t * hint)
35509+{
35510+ memset(hint, 0, sizeof(*hint));
35511+ init_lh(&hint->lh);
35512+ hint->ext_coord.lh = &hint->lh;
35513+}
35514+
35515+static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
35516+{
35517+ int result;
35518+ reiser4_key key;
35519+ coord_t coord;
35520+ lock_handle lh;
35521+
35522+ assert("vs-1628", ea_obtained(uf_info));
35523+
35524+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35525+ key_by_inode_and_offset_common(inode, 0, &key);
35526+ init_lh(&lh);
35527+ result = find_file_item_nohint(&coord, &lh, &key,
35528+ ZNODE_READ_LOCK, inode);
35529+ set_file_state(uf_info, result, znode_get_level(coord.node));
35530+ done_lh(&lh);
35531+ if (!cbk_errored(result))
35532+ result = 0;
35533+ } else
35534+ result = 0;
35535+ assert("vs-1074",
35536+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35537+ reiser4_txn_restart_current();
35538+ return result;
35539+}
35540+
35541+/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat
35542+ data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen
35543+ if page corresponds to hole extent and unallocated one will have to be created */
35544+static int reserve_partial_page(reiser4_tree * tree)
35545+{
35546+ grab_space_enable();
35547+ return reiser4_grab_reserved(reiser4_get_current_sb(),
35548+ 1 +
35549+ 2 * estimate_one_insert_into_item(tree),
35550+ BA_CAN_COMMIT);
35551+}
35552+
35553+/* estimate and reserve space needed to cut one item and update one stat data */
35554+static int reserve_cut_iteration(reiser4_tree * tree)
35555+{
35556+ __u64 estimate = estimate_one_item_removal(tree)
35557+ + estimate_one_insert_into_item(tree);
35558+
35559+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
35560+
35561+ grab_space_enable();
35562+ /* We need to double our estimate now that we can delete more than one
35563+ node. */
35564+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
35565+ BA_CAN_COMMIT);
35566+}
35567+
35568+int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
35569+ int update_sd)
35570+{
35571+ int result = 0;
35572+
35573+ INODE_SET_SIZE(inode, get_key_offset(key));
35574+ if (update_sd) {
35575+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
35576+ result = reiser4_update_sd(inode);
35577+ }
35578+ return result;
35579+}
35580+
35581+/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
35582+ and update file stat data on every single cut from the tree */
35583+int
35584+cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
35585+ loff_t cur_size, int (*update_actor) (struct inode *,
35586+ reiser4_key *, int))
35587+{
35588+ reiser4_key from_key, to_key;
35589+ reiser4_key smallest_removed;
35590+ file_plugin *fplug = inode_file_plugin(inode);
35591+ int result;
35592+ int progress = 0;
35593+
35594+ assert("vs-1248",
35595+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
35596+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
35597+
35598+ fplug->key_by_inode(inode, new_size, &from_key);
35599+ to_key = from_key;
35600+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
35601+ /* this loop normally runs just once */
35602+ while (1) {
35603+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
35604+ if (result)
35605+ break;
35606+
35607+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
35608+ &smallest_removed, inode, 1,
35609+ &progress);
35610+ if (result == -E_REPEAT) {
35611+ /* -E_REPEAT is a signal to interrupt a long file truncation process */
35612+ if (progress) {
35613+ result =
35614+ update_actor(inode, &smallest_removed,
35615+ update_sd);
35616+ if (result)
35617+ break;
35618+ }
35619+
35620+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35621+ reiser4_release_reserved(inode->i_sb);
35622+
35623+ /* reiser4_cut_tree_object() was interrupted probably because
35624+ * current atom requires commit, we have to release
35625+ * transaction handle to allow atom commit. */
35626+ reiser4_txn_restart_current();
35627+ continue;
35628+ }
35629+ if (result
35630+ && !(result == CBK_COORD_NOTFOUND && new_size == 0
35631+ && inode->i_size == 0))
35632+ break;
35633+
35634+ set_key_offset(&smallest_removed, new_size);
35635+ /* Final sd update after the file gets its correct size */
35636+ result = update_actor(inode, &smallest_removed, update_sd);
35637+ break;
35638+ }
35639+
35640+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35641+ reiser4_release_reserved(inode->i_sb);
35642+
35643+ return result;
35644+}
35645+
35646+int find_or_create_extent(struct page *page);
35647+
35648+/* part of truncate_file_body: it is called when truncate is used to make file
35649+ shorter */
35650+static int shorten_file(struct inode *inode, loff_t new_size)
35651+{
35652+ int result;
35653+ struct page *page;
35654+ int padd_from;
35655+ unsigned long index;
35656+ struct unix_file_info *uf_info;
35657+
35658+ /*
35659+ * all items of ordinary reiser4 file are grouped together. That is why
35660+ * we can use reiser4_cut_tree. Plan B files (for instance) can not be
35661+ * truncated that simply
35662+ */
35663+ result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
35664+ get_key_offset(reiser4_max_key()),
35665+ reiser4_update_file_size);
35666+ if (result)
35667+ return result;
35668+
35669+ uf_info = unix_file_inode_data(inode);
35670+ assert("vs-1105", new_size == inode->i_size);
35671+ if (new_size == 0) {
35672+ uf_info->container = UF_CONTAINER_EMPTY;
35673+ return 0;
35674+ }
35675+
35676+ result = find_file_state(inode, uf_info);
35677+ if (result)
35678+ return result;
35679+ if (uf_info->container == UF_CONTAINER_TAILS)
35680+ /*
35681+ * No need to worry about zeroing last page after new file
35682+ * end
35683+ */
35684+ return 0;
35685+
35686+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
35687+ if (!padd_from)
35688+ /* file is truncated to page boundary */
35689+ return 0;
35690+
35691+ result = reserve_partial_page(reiser4_tree_by_inode(inode));
35692+ if (result) {
35693+ reiser4_release_reserved(inode->i_sb);
35694+ return result;
35695+ }
35696+
35697+ /* last page is partially truncated - zero its content */
35698+ index = (inode->i_size >> PAGE_CACHE_SHIFT);
35699+ page = read_mapping_page(inode->i_mapping, index, NULL);
35700+ if (IS_ERR(page)) {
35701+ /*
35702+ * the below does up(sbinfo->delete_mutex). Do not get
35703+ * confused
35704+ */
35705+ reiser4_release_reserved(inode->i_sb);
35706+ if (likely(PTR_ERR(page) == -EINVAL)) {
35707+ /* looks like file is built of tail items */
35708+ return 0;
35709+ }
35710+ return PTR_ERR(page);
35711+ }
35712+ wait_on_page_locked(page);
35713+ if (!PageUptodate(page)) {
35714+ page_cache_release(page);
35715+ /*
35716+ * the below does up(sbinfo->delete_mutex). Do not get
35717+ * confused
35718+ */
35719+ reiser4_release_reserved(inode->i_sb);
35720+ return RETERR(-EIO);
35721+ }
35722+
35723+ /*
35724+ * if page corresponds to hole extent unit - unallocated one will be
35725+ * created here. This is not necessary
35726+ */
35727+ result = find_or_create_extent(page);
35728+
35729+ /*
35730+ * FIXME: cut_file_items has already updated inode. Probably it would
35731+ * be better to update it here when file is really truncated
35732+ */
35733+ if (result) {
35734+ page_cache_release(page);
35735+ /*
35736+ * the below does up(sbinfo->delete_mutex). Do not get
35737+ * confused
35738+ */
35739+ reiser4_release_reserved(inode->i_sb);
35740+ return result;
35741+ }
35742+
35743+ lock_page(page);
35744+ assert("vs-1066", PageLocked(page));
35745+ zero_user_page(page, padd_from, PAGE_CACHE_SIZE - padd_from, KM_USER0);
35746+ unlock_page(page);
35747+ page_cache_release(page);
35748+ /* the below does up(sbinfo->delete_mutex). Do not get confused */
35749+ reiser4_release_reserved(inode->i_sb);
35750+ return 0;
35751+}
35752+
35753+/**
35754+ * should_have_notail
35755+ * @uf_info:
35756+ * @new_size:
35757+ *
35758+ * Calls formatting plugin to see whether file of size @new_size has to be
35759+ * stored in unformatted nodes or in tail items. 0 is returned for later case.
35760+ */
35761+static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
35762+{
35763+ if (!uf_info->tplug)
35764+ return 1;
35765+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
35766+ new_size);
35767+
35768+}
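For reference, a formatting policy of the kind consulted here can be as simple as a size threshold; a hedged sketch (the cutoff is hypothetical, real policies are formatting plugins):

	/* nonzero when a file of size new_size should be stored in
	 * unformatted (extent) nodes rather than in tail items */
	static int have_notail_example(unsigned long long new_size)
	{
		return new_size > 4 * 4096ULL;	/* hypothetical cutoff */
	}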
35769+
35770+/**
35771+ * truncate_file_body - change length of file
35772+ * @inode: inode of file
35773+ * @new_size: new file length
35774+ *
35775+ * Adjusts items file @inode is built of to match @new_size. It may either cut
35776+ * items or add them to represent a hole at the end of file. The caller has to
35777+ * obtain exclusive access to the file.
35778+ */
35779+static int truncate_file_body(struct inode *inode, loff_t new_size)
35780+{
35781+ int result;
35782+
35783+ if (inode->i_size < new_size) {
35784+ /* expanding truncate */
35785+ struct dentry dentry;
35786+ struct file file;
35787+ struct unix_file_info *uf_info;
35788+
35789+ dentry.d_inode = inode;
35790+ file.f_dentry = &dentry;
35791+ file.private_data = NULL;
35792+ file.f_pos = new_size;
35794+ uf_info = unix_file_inode_data(inode);
35795+ result = find_file_state(inode, uf_info);
35796+ if (result)
35797+ return result;
71430cf6 35798+
44254afd
MT
35799+ if (should_have_notail(uf_info, new_size)) {
35800+ /*
35801+ * file of size @new_size has to be built of
35802+ * extents. If it is built of tails - convert to
35803+ * extents
35804+ */
35805+ if (uf_info->container == UF_CONTAINER_TAILS) {
35806+ /*
35807+ * if the file is being converted by another process
35808+ * - wait until it completes
35809+ */
35810+ while (1) {
71430cf6
MT
35811+ if (reiser4_inode_get_flag(inode,
35812+ REISER4_PART_IN_CONV)) {
44254afd
MT
35813+ drop_exclusive_access(uf_info);
35814+ schedule();
35815+ get_exclusive_access(uf_info);
35816+ continue;
35817+ }
35818+ break;
35819+ }
71430cf6 35820+
44254afd
MT
35821+ if (uf_info->container == UF_CONTAINER_TAILS) {
35822+ result = tail2extent(uf_info);
35823+ if (result)
35824+ return result;
35825+ }
35826+ }
71430cf6
MT
35827+ result = reiser4_write_extent(&file, NULL, 0,
35828+ &new_size);
44254afd
MT
35829+ if (result)
35830+ return result;
35831+ uf_info->container = UF_CONTAINER_EXTENTS;
35832+ } else {
35833+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
71430cf6
MT
35834+ result = reiser4_write_extent(&file, NULL, 0,
35835+ &new_size);
44254afd
MT
35836+ if (result)
35837+ return result;
35838+ } else {
71430cf6
MT
35839+ result = reiser4_write_tail(&file, NULL, 0,
35840+ &new_size);
44254afd
MT
35841+ if (result)
35842+ return result;
35843+ uf_info->container = UF_CONTAINER_TAILS;
35844+ }
35845+ }
35846+ BUG_ON(result > 0);
35847+ INODE_SET_FIELD(inode, i_size, new_size);
35848+ file_update_time(&file);
35849+ result = reiser4_update_sd(inode);
35850+ BUG_ON(result != 0);
35851+ reiser4_free_file_fsdata(&file);
35852+ } else
35853+ result = shorten_file(inode, new_size);
35854+ return result;
35855+}
35856+
35857+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
35858+
35859+/**
35860+ * load_file_hint - copy hint from struct file to local variable
35861+ * @file: file to get hint from
35862+ * @hint: structure to fill
35863+ *
35864+ * Reiser4 specific portion of struct file may contain information (hint)
35865+ * stored on exiting from previous read or write. That information includes
35866+ * seal of znode and coord within that znode where previous read or write
35867+ * stopped. This function copies that information to @hint if it was stored or
35868+ * initializes @hint with zeros otherwise.
35869+ */
35870+int load_file_hint(struct file *file, hint_t *hint)
35871+{
35872+ reiser4_file_fsdata *fsdata;
35873+
35874+ if (file) {
35875+ fsdata = reiser4_get_file_fsdata(file);
35876+ if (IS_ERR(fsdata))
35877+ return PTR_ERR(fsdata);
35878+
35879+ spin_lock_inode(file->f_dentry->d_inode);
71430cf6 35880+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
44254afd
MT
35881+ *hint = fsdata->reg.hint;
35882+ init_lh(&hint->lh);
35883+ hint->ext_coord.lh = &hint->lh;
35884+ spin_unlock_inode(file->f_dentry->d_inode);
35885+ /*
35886+ * force re-validation of the coord on the first
35887+ * iteration of the read/write loop.
35888+ */
35889+ hint->ext_coord.valid = 0;
35890+ assert("nikita-19892", coords_equal(&hint->seal.coord1,
35891+ &hint->ext_coord.
35892+ coord));
35893+ return 0;
35894+ }
35895+ memset(&fsdata->reg.hint, 0, sizeof(hint_t));
35896+ spin_unlock_inode(file->f_dentry->d_inode);
35897+ }
35898+ hint_init_zero(hint);
35899+ return 0;
35900+}
35901+
35902+/**
35903+ * save_file_hint - copy hint to reiser4 private struct file's part
35904+ * @file: file to save hint in
35905+ * @hint: hint to save
35906+ *
35907+ * This copies @hint to reiser4 private part of struct file. It can help
35908+ * speed up future accesses to the file.
35909+ */
35910+void save_file_hint(struct file *file, const hint_t *hint)
35911+{
35912+ reiser4_file_fsdata *fsdata;
35913+
35914+ assert("edward-1337", hint != NULL);
35915+
71430cf6 35916+ if (!file || !reiser4_seal_is_set(&hint->seal))
44254afd
MT
35917+ return;
35918+ fsdata = reiser4_get_file_fsdata(file);
35919+ assert("vs-965", !IS_ERR(fsdata));
35920+ assert("nikita-19891",
35921+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
35922+ assert("vs-30", hint->lh.owner == NULL);
35923+ spin_lock_inode(file->f_dentry->d_inode);
35924+ fsdata->reg.hint = *hint;
35925+ spin_unlock_inode(file->f_dentry->d_inode);
35926+ return;
35927+}
35928+
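+/*
+ * A minimal usage sketch of the hint machinery (the same sequence is
+ * used by readpage_unix_file() and read_unix_file_container_tails()
+ * below):
+ *
+ *	hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ *	result = load_file_hint(file, hint);
+ *	... find_file_item(hint, &key, lock_mode, inode), read or write ...
+ *	done_lh(&hint->lh);
+ *	save_file_hint(file, hint);
+ *	kfree(hint);
+ */
+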
71430cf6 35929+void reiser4_unset_hint(hint_t * hint)
44254afd
MT
35930+{
35931+ assert("vs-1315", hint);
35932+ hint->ext_coord.valid = 0;
71430cf6 35933+ reiser4_seal_done(&hint->seal);
44254afd
MT
35934+ done_lh(&hint->lh);
35935+}
35936+
71430cf6
MT
35937+/* the coord must already be set properly, so that reiser4_set_hint()
35938+ has nothing to do but record it */
35939+void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
35940+ znode_lock_mode mode)
44254afd
MT
35941+{
35942+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
35943+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
35944+
71430cf6 35945+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
44254afd
MT
35946+ hint->offset = get_key_offset(key);
35947+ hint->mode = mode;
35948+ done_lh(&hint->lh);
35949+}
35950+
35951+int hint_is_set(const hint_t * hint)
35952+{
71430cf6 35953+ return reiser4_seal_is_set(&hint->seal);
44254afd
MT
35954+}
35955+
35956+#if REISER4_DEBUG
35957+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
35958+{
35959+ return (get_key_locality(k1) == get_key_locality(k2) &&
35960+ get_key_type(k1) == get_key_type(k2) &&
35961+ get_key_band(k1) == get_key_band(k2) &&
35962+ get_key_ordering(k1) == get_key_ordering(k2) &&
35963+ get_key_objectid(k1) == get_key_objectid(k2));
35964+}
35965+#endif
35966+
71430cf6 35967+static int
44254afd
MT
35968+hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35969+ znode_lock_mode lock_mode)
35970+{
35971+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
35972+ /* hint either not set or set by different operation */
35973+ return RETERR(-E_REPEAT);
35974+
35975+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
35976+
35977+ if (check_key && get_key_offset(key) != hint->offset)
35978+ /* hint is set for different key */
35979+ return RETERR(-E_REPEAT);
35980+
35981+ assert("vs-31", hint->ext_coord.lh == &hint->lh);
71430cf6
MT
35982+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
35983+ hint->ext_coord.lh, lock_mode,
35984+ ZNODE_LOCK_LOPRI);
44254afd
MT
35985+}
35986+
44254afd 35987+/**
71430cf6
MT
35988+ * find_or_create_extent -
35989+ * @page: page to find or create an extent for
44254afd
MT
35990+ * Looks for the place at the twig level for the extent corresponding to
35991+ * @page, creates an unallocated extent if it does not exist yet,
35992+ * initializes the jnode and captures the page.
35993+ */
35995+int find_or_create_extent(struct page *page)
35996+{
35997+ int result;
35998+ struct inode *inode;
35999+ int plugged_hole;
36000+
36001+ jnode *node;
36002+
36003+ assert("vs-1065", page->mapping && page->mapping->host);
36004+ inode = page->mapping->host;
36005+
36006+ lock_page(page);
36007+ node = jnode_of_page(page);
71430cf6
MT
36008+ if (IS_ERR(node)) {
36009+ unlock_page(page);
44254afd 36010+ return PTR_ERR(node);
71430cf6
MT
36011+ }
36012+ JF_SET(node, JNODE_WRITE_PREPARED);
36013+ unlock_page(page);
44254afd
MT
36014+
36015+ if (node->blocknr == 0) {
36016+ plugged_hole = 0;
71430cf6
MT
36017+ result = reiser4_update_extent(inode, node, page_offset(page),
36018+ &plugged_hole);
44254afd 36019+ if (result) {
71430cf6 36020+ JF_CLR(node, JNODE_WRITE_PREPARED);
44254afd 36021+ jput(node);
71430cf6 36022+ warning("", "reiser4_update_extent failed: %d", result);
44254afd
MT
36023+ return result;
36024+ }
36025+ if (plugged_hole)
36026+ reiser4_update_sd(inode);
36027+ } else {
36028+ spin_lock_jnode(node);
71430cf6 36029+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
44254afd
MT
36030+ BUG_ON(result != 0);
36031+ jnode_make_dirty_locked(node);
36032+ spin_unlock_jnode(node);
36033+ }
36034+
36035+ BUG_ON(node->atom == NULL);
71430cf6 36036+ JF_CLR(node, JNODE_WRITE_PREPARED);
44254afd
MT
36037+ jput(node);
36038+
36039+ if (get_current_context()->entd) {
36040+ entd_context *ent = get_entd_context(node->tree->super);
36041+
36042+ if (ent->cur_request->page == page)
36043+ ent->cur_request->node = node;
36044+ }
36045+ return 0;
36046+}
36047+
36048+/**
36049+ * has_anonymous_pages - check whether inode has pages dirtied via mmap
36050+ * @inode: inode to check
36051+ *
36052+ * Returns true if inode's mapping has dirty pages which do not belong to any
36053+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
36054+ * tree or were eflushed and can be found via jnodes tagged
36055+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
36056+ */
36057+static int has_anonymous_pages(struct inode *inode)
36058+{
36059+ int result;
36060+
36061+ read_lock_irq(&inode->i_mapping->tree_lock);
36062+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
36063+ read_unlock_irq(&inode->i_mapping->tree_lock);
36064+ return result;
36065+}
36066+
36067+/**
36068+ * capture_page_and_create_extent -
36069+ * @page: page to be captured
36070+ *
36071+ * Grabs space for extent creation and stat data update and calls function to
36072+ * do actual work.
36073+ */
36074+static int capture_page_and_create_extent(struct page *page)
36075+{
36076+ int result;
36077+ struct inode *inode;
36078+
36079+ assert("vs-1084", page->mapping && page->mapping->host);
36080+ inode = page->mapping->host;
36081+ assert("vs-1139",
36082+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
36083+ /* page belongs to file */
36084+ assert("vs-1393",
71430cf6 36085+ inode->i_size > page_offset(page));
44254afd
MT
36086+
36087+ /* page capture may require extent creation (if it does not exist yet)
36088+ and stat data's update (number of blocks changes on extent
36089+ creation) */
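+ /* hence the factor of 2 in the grab below: one insertion for the
+ extent item, one for the stat data update */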
36090+ grab_space_enable();
71430cf6
MT
36091+ result = reiser4_grab_space(2 * estimate_one_insert_into_item
36092+ (reiser4_tree_by_inode(inode)),
36093+ BA_CAN_COMMIT);
44254afd
MT
36094+ if (likely(!result))
36095+ result = find_or_create_extent(page);
36096+
36097+ if (result != 0)
36098+ SetPageError(page);
36099+ return result;
36100+}
36101+
36102+/* this is implementation of method commit_write of struct
36103+ address_space_operations for unix file plugin */
36104+int
36105+commit_write_unix_file(struct file *file, struct page *page,
36106+ unsigned from, unsigned to)
36107+{
36108+ reiser4_context *ctx;
36109+ struct inode *inode;
36110+ int result;
36111+
36112+ assert("umka-3101", file != NULL);
36113+ assert("umka-3102", page != NULL);
36114+ assert("umka-3093", PageLocked(page));
36115+
36116+ SetPageUptodate(page);
36117+
36118+ inode = page->mapping->host;
71430cf6 36119+ ctx = reiser4_init_context(page->mapping->host->i_sb);
44254afd
MT
36120+ if (IS_ERR(ctx))
36121+ return PTR_ERR(ctx);
36122+ page_cache_get(page);
36123+ unlock_page(page);
36124+ result = capture_page_and_create_extent(page);
36125+ lock_page(page);
36126+ page_cache_release(page);
36127+
36128+ /* don't commit transaction under inode semaphore */
36129+ context_set_commit_async(ctx);
36130+ reiser4_exit_context(ctx);
36131+ return result;
36132+}
36133+
36134+/*
36135+ * Support for "anonymous" pages and jnodes.
36136+ *
36137+ * When a file is write-accessed through mmap, pages can be dirtied from user
36138+ * level. In this case the kernel is not notified until one of the following happens:
36139+ *
36140+ * (1) msync()
36141+ *
36142+ * (2) truncate() (either explicit or through unlink)
36143+ *
36144+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before
36145+ * starting write-back.
36146+ *
36147+ * As a result of (3) ->writepage may be called on a dirty page without
36148+ * jnode. Such a page is called "anonymous" in reiser4. Certain workloads
36149+ * (iozone) generate a huge number of anonymous pages. Emergency flush handles
36150+ * this situation by creating jnode for anonymous page, starting IO on the
36151+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
36152+ * memory. Such jnode is also called anonymous.
36153+ *
36154+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
36155+ * tree. This is done by capture_anonymous_*() functions below.
36156+ */
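+
+/*
+ * In outline (a sketch of the code below): a page dirtied through mmap
+ * is tagged PAGECACHE_TAG_REISER4_MOVED in the mapping's page tree;
+ * writepages_unix_file() finds such pages via capture_anonymous_pages(),
+ * which clears the tag and calls capture_page_and_create_extent() to
+ * create the missing extent item and capture the page into an atom.
+ */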
36157+
36158+/**
36159+ * capture_anonymous_page - involve page into transaction
36160+ * @pg: page to deal with
36161+ *
36162+ * Takes care that @page has corresponding metadata in the tree, creates jnode
36163+ * for @page and captures it. On success 1 is returned.
36164+ */
36165+static int capture_anonymous_page(struct page *page)
36166+{
36167+ int result;
36168+
36169+ if (PageWriteback(page))
36170+ /* FIXME: do nothing? */
36171+ return 0;
36172+
36173+ result = capture_page_and_create_extent(page);
36174+ if (result == 0) {
36175+ result = 1;
36176+ } else
36177+ warning("nikita-3329",
36178+ "Cannot capture anon page: %i", result);
36179+
36180+ return result;
36181+}
36182+
36183+/**
36184+ * capture_anonymous_pages - find and capture pages dirtied via mmap
36185+ * @mapping: address space where to look for pages
36186+ * @index: start index
36187+ * @to_capture: maximum number of pages to capture
36188+ *
36189+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
36190+ * captures (involves into atom) them, returns number of captured pages,
36191+ * updates @index to next page after the last captured one.
36192+ */
36193+static int
36194+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
36195+ unsigned int to_capture)
36196+{
36197+ int result;
36198+ struct pagevec pvec;
36199+ unsigned int i, count;
36200+ int nr;
36201+
36202+ pagevec_init(&pvec, 0);
36203+ count = min(pagevec_space(&pvec), to_capture);
36204+ nr = 0;
36205+
36206+ /* find pages tagged MOVED */
36207+ write_lock_irq(&mapping->tree_lock);
36208+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
36209+ (void **)pvec.pages, *index, count,
36210+ PAGECACHE_TAG_REISER4_MOVED);
36211+ if (pagevec_count(&pvec) == 0) {
36212+ /*
36213+ * there are no pages tagged MOVED in mapping->page_tree
36214+ * starting from *index
36215+ */
36216+ write_unlock_irq(&mapping->tree_lock);
36217+ *index = (pgoff_t)-1;
36218+ return 0;
36219+ }
36220+
36221+ /* clear MOVED tag for all found pages */
36222+ for (i = 0; i < pagevec_count(&pvec); i++) {
36223+ void *p;
36224+
36225+ page_cache_get(pvec.pages[i]);
36226+ p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36227+ PAGECACHE_TAG_REISER4_MOVED);
36228+ assert("vs-49", p == pvec.pages[i]);
36229+ }
36230+ write_unlock_irq(&mapping->tree_lock);
36231+
36233+ *index = pvec.pages[i - 1]->index + 1;
36234+
36235+ for (i = 0; i < pagevec_count(&pvec); i++) {
36236+ /*
36237+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
71430cf6 36238+ * reiser4_set_page_dirty_internal which is called when jnode is
44254afd
MT
36239+ * captured
36240+ */
36241+ result = capture_anonymous_page(pvec.pages[i]);
36242+ if (result == 1)
36243+ nr++;
36244+ else {
36245+ if (result < 0) {
36246+ warning("vs-1454",
36247+ "failed to capture page: "
36248+ "result=%d, captured=%d)\n",
36249+ result, i);
36250+
36251+ /*
36252+ * set MOVED tag to all pages which left not
36253+ * captured
36254+ */
36255+ write_lock_irq(&mapping->tree_lock);
36256+ for (; i < pagevec_count(&pvec); i ++) {
36257+ radix_tree_tag_set(&mapping->page_tree,
36258+ pvec.pages[i]->index,
36259+ PAGECACHE_TAG_REISER4_MOVED);
36260+ }
36261+ write_unlock_irq(&mapping->tree_lock);
36262+
36263+ pagevec_release(&pvec);
36264+ return result;
36265+ } else {
36266+ /*
36267+ * result == 0. capture_anonymous_page returns
36268+ * 0 for Writeback-ed page. Set MOVED tag on
36269+ * that page
36270+ */
36271+ write_lock_irq(&mapping->tree_lock);
36272+ radix_tree_tag_set(&mapping->page_tree,
36273+ pvec.pages[i]->index,
36274+ PAGECACHE_TAG_REISER4_MOVED);
36275+ write_unlock_irq(&mapping->tree_lock);
36276+ if (i == 0)
36277+ *index = pvec.pages[0]->index;
36278+ else
36279+ *index = pvec.pages[i - 1]->index + 1;
36280+ }
36281+ }
36282+ }
36283+ pagevec_release(&pvec);
36284+ return nr;
36285+}
36286+
36287+/**
36288+ * capture_anonymous_jnodes - find and capture anonymous jnodes
36289+ * @mapping: address space where to look for jnodes
36290+ * @from: start index
36291+ * @to: end index
36292+ * @to_capture: maximum number of jnodes to capture
36293+ *
36294+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36295+ * the range of indexes @from-@to and captures them, returns number of captured
36296+ * jnodes, updates @from to next jnode after the last captured one.
36297+ */
36298+static int
36299+capture_anonymous_jnodes(struct address_space *mapping,
36300+ pgoff_t *from, pgoff_t to, int to_capture)
36301+{
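+ /*
+ * this is currently a stub: anonymous jnodes are not tracked here,
+ * so report the whole range as scanned and nothing captured
+ */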
36302+ *from = to;
36303+ return 0;
36304+}
36305+
36306+/*
36307+ * Commit atom of the jnode of a page.
36308+ */
36309+static int sync_page(struct page *page)
36310+{
36311+ int result;
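+
+ /*
+ * loop for as long as reiser4_sync_atom() asks for a retry
+ * (-E_REPEAT): the page's atom may change between passes, for
+ * example by fusing with another atom, so it is re-read under the
+ * jnode spinlock each time
+ */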
36312+ do {
36313+ jnode *node;
36314+ txn_atom *atom;
36315+
36316+ lock_page(page);
36317+ node = jprivate(page);
36318+ if (node != NULL) {
36319+ spin_lock_jnode(node);
36320+ atom = jnode_get_atom(node);
36321+ spin_unlock_jnode(node);
36322+ } else
36323+ atom = NULL;
36324+ unlock_page(page);
71430cf6 36325+ result = reiser4_sync_atom(atom);
44254afd
MT
36326+ } while (result == -E_REPEAT);
36327+ /*
36328+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36329+ * handle the case where more pages get added to the atom while we are
36330+ * syncing it?
36331+ */
36332+ assert("nikita-3485", ergo(result == 0,
36333+ get_current_context()->trans->atom == NULL));
36334+ return result;
36335+}
36336+
36337+/*
36338+ * Commit atoms of pages on @pages list.
36339+ * call sync_page for each page from mapping's page tree
36340+ */
36341+static int sync_page_list(struct inode *inode)
36342+{
36343+ int result;
36344+ struct address_space *mapping;
36345+ unsigned long from; /* start index for radix_tree_gang_lookup */
36346+ unsigned int found; /* return value for radix_tree_gang_lookup */
36347+
36348+ mapping = inode->i_mapping;
36349+ from = 0;
36350+ result = 0;
36351+ read_lock_irq(&mapping->tree_lock);
36352+ while (result == 0) {
36353+ struct page *page;
36354+
36355+ found =
36356+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36357+ from, 1);
36358+ assert("", found < 2);
36359+ if (found == 0)
36360+ break;
36361+
36362+ /* the page cannot leave the radix tree because it is protected from
36363+ truncation by inode->i_mutex, which is held by sys_fsync */
36364+ page_cache_get(page);
36365+ read_unlock_irq(&mapping->tree_lock);
36366+
36367+ from = page->index + 1;
36368+
36369+ result = sync_page(page);
36370+
36371+ page_cache_release(page);
36372+ read_lock_irq(&mapping->tree_lock);
36373+ }
36374+
36375+ read_unlock_irq(&mapping->tree_lock);
36376+ return result;
36377+}
36378+
36379+static int commit_file_atoms(struct inode *inode)
36380+{
36381+ int result;
71430cf6 36382+ struct unix_file_info *uf_info;
44254afd
MT
36383+
36384+ uf_info = unix_file_inode_data(inode);
36385+
36386+ get_exclusive_access(uf_info);
36387+ /*
36388+ * find what items file is made from
36389+ */
36390+ result = find_file_state(inode, uf_info);
36391+ drop_exclusive_access(uf_info);
36392+ if (result != 0)
36393+ return result;
36394+
36395+ /*
36396+ * file state cannot change because we are under ->i_mutex
36397+ */
36398+ switch (uf_info->container) {
36399+ case UF_CONTAINER_EXTENTS:
36400+ /* find_file_state might open or join an atom */
71430cf6 36401+ reiser4_txn_restart_current();
44254afd
MT
36402+ result =
36403+ /*
36404+ * when we are called by
36405+ * filemap_fdatawrite->
36406+ * do_writepages()->
36407+ * reiser4_writepages()
36408+ *
36409+ * inode->i_mapping->dirty_pages are spliced into
36410+ * ->io_pages, leaving ->dirty_pages dirty.
36411+ *
36412+ * When we are called from
36413+ * reiser4_fsync()->sync_unix_file(), we have to
36414+ * commit atoms of all pages on the ->dirty_list.
36415+ *
36416+ * So for simplicity we just commit ->io_pages and
36417+ * ->dirty_pages.
36418+ */
36419+ sync_page_list(inode);
36420+ break;
36421+ case UF_CONTAINER_TAILS:
36422+ /*
36423+ * NOTE-NIKITA probably we can be smarter for tails. For now
36424+ * just commit all existing atoms.
36425+ */
36426+ result = txnmgr_force_commit_all(inode->i_sb, 0);
36427+ break;
36428+ case UF_CONTAINER_EMPTY:
36429+ result = 0;
36430+ break;
36431+ case UF_CONTAINER_UNKNOWN:
36432+ default:
36433+ result = -EIO;
36434+ break;
36435+ }
36436+
36437+ /*
36438+ * commit current transaction: there can be captured nodes from
36439+ * find_file_state() and finish_conversion().
36440+ */
71430cf6 36441+ reiser4_txn_restart_current();
44254afd
MT
36442+ return result;
36443+}
36444+
36445+/**
36446+ * writepages_unix_file - writepages of struct address_space_operations
36447+ * @mapping: address space to capture pages of
36448+ * @wbc: writeback control
36449+ *
36450+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36451+ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
36452+ * created by reiser4_writepage.
36453+ */
36454+int writepages_unix_file(struct address_space *mapping,
36455+ struct writeback_control *wbc)
36456+{
36457+ int result;
71430cf6 36458+ struct unix_file_info *uf_info;
44254afd
MT
36459+ pgoff_t pindex, jindex, nr_pages;
36460+ long to_capture;
36461+ struct inode *inode;
36462+
36463+ inode = mapping->host;
36464+ if (!has_anonymous_pages(inode)) {
36465+ result = 0;
36466+ goto end;
36467+ }
71430cf6 36468+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
44254afd 36469+ result = 0;
71430cf6
MT
36470+ nr_pages = size_in_pages(i_size_read(inode));
36471+
44254afd
MT
36472+ uf_info = unix_file_inode_data(inode);
36473+
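+ /*
+ * pindex scans for anonymous pages, jindex for anonymous jnodes;
+ * jindex trails pindex, and the window between them is handed to
+ * capture_anonymous_jnodes() below
+ */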
36474+ do {
36475+ reiser4_context *ctx;
36476+
36477+ if (wbc->sync_mode != WB_SYNC_ALL)
36478+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36479+ else
36480+ to_capture = CAPTURE_APAGE_BURST;
36481+
71430cf6 36482+ ctx = reiser4_init_context(inode->i_sb);
44254afd
MT
36483+ if (IS_ERR(ctx)) {
36484+ result = PTR_ERR(ctx);
36485+ break;
36486+ }
36487+ /* avoid recursive calls to ->sync_inodes */
36488+ ctx->nobalance = 1;
36489+ assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36490+ assert("", LOCK_CNT_NIL(inode_sem_w));
36491+ assert("", LOCK_CNT_NIL(inode_sem_r));
36492+
71430cf6 36493+ reiser4_txn_restart_current();
44254afd
MT
36494+
36495+ /* we have to get nonexclusive access to the file */
36496+ if (get_current_context()->entd) {
36497+ /*
36498+ * use nonblocking version of nonexclusive_access to
36499+ * avoid deadlock which might look like the following:
36500+ * process P1 holds NEA on file F1 and called entd to
36501+ * reclaim some memory. Entd works for P1 and is going
36502+ * to capture pages of file F2. To do that entd has to
36503+ * get NEA to F2. F2 is held by process P2 which also
36504+ * called entd. But entd is serving P1 at the moment
36505+ * and P2 has to wait. Process P3 is trying to get EA on
36506+ * file F2. The pending EA request on file F2
36507+ * makes it impossible for entd to get NEA on file
36508+ * F2. None of these processes can continue. Using the
36509+ * nonblocking version of getting NEA is supposed to
36510+ * avoid this deadlock.
36511+ */
36512+ if (try_to_get_nonexclusive_access(uf_info) == 0) {
36513+ result = RETERR(-EBUSY);
36514+ reiser4_exit_context(ctx);
36515+ break;
36516+ }
36517+ } else
36518+ get_nonexclusive_access(uf_info);
36519+
36520+ while (to_capture > 0) {
36521+ pgoff_t start;
36522+
36523+ assert("vs-1727", jindex <= pindex);
36524+ if (pindex == jindex) {
36525+ start = pindex;
36526+ result =
36527+ capture_anonymous_pages(inode->i_mapping,
36528+ &pindex,
36529+ to_capture);
36530+ if (result <= 0)
36531+ break;
36532+ to_capture -= result;
36533+ wbc->nr_to_write -= result;
36534+ if (start + result == pindex) {
36535+ jindex = pindex;
36536+ continue;
36537+ }
36538+ if (to_capture <= 0)
36539+ break;
36540+ }
36541+ /* deal with anonymous jnodes between jindex and pindex */
36542+ result =
36543+ capture_anonymous_jnodes(inode->i_mapping, &jindex,
36544+ pindex, to_capture);
36545+ if (result < 0)
36546+ break;
36547+ to_capture -= result;
36548+ get_current_context()->nr_captured += result;
36549+
36550+ if (jindex == (pgoff_t) - 1) {
36551+ assert("vs-1728", pindex == (pgoff_t) - 1);
36552+ break;
36553+ }
36554+ }
36555+ if (to_capture <= 0)
36556+ /* there may be more pages left */
36557+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
36558+
36559+ drop_nonexclusive_access(uf_info);
36560+ if (result < 0) {
36561+ /* error happened */
36562+ reiser4_exit_context(ctx);
36563+ return result;
36564+ }
36565+ if (wbc->sync_mode != WB_SYNC_ALL) {
36566+ reiser4_exit_context(ctx);
36567+ return 0;
36568+ }
36569+ result = commit_file_atoms(inode);
36570+ reiser4_exit_context(ctx);
36571+ if (pindex >= nr_pages && jindex == pindex)
36572+ break;
36573+ } while (1);
36574+
36575+ end:
36576+ if (is_in_reiser4_context()) {
36577+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
36578+ /*
36579+ * there are already pages to flush, flush them out, do
36580+ * not delay until end of reiser4_sync_inodes
36581+ */
71430cf6 36582+ reiser4_writeout(inode->i_sb, wbc);
44254afd
MT
36583+ get_current_context()->nr_captured = 0;
36584+ }
36585+ }
36586+ return result;
36587+}
36588+
36589+/*
36590+ * ->sync() method for unix file.
36591+ *
36592+ * We are trying to be smart here. Instead of committing all atoms (original
36593+ * solution), we scan dirty pages of this file and commit all atoms they are
36594+ * part of.
36595+ *
36596+ * Situation is complicated by anonymous pages: i.e., extent-less pages
36597+ * dirtied through mmap. Fortunately sys_fsync() first calls
36598+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
36599+ * all missing extents and capture anonymous pages.
36600+ */
36601+int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
36602+{
36603+ reiser4_context *ctx;
36604+ txn_atom *atom;
36605+ reiser4_block_nr reserve;
36606+
71430cf6 36607+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
44254afd
MT
36608+ if (IS_ERR(ctx))
36609+ return PTR_ERR(ctx);
36610+
36611+ reserve = estimate_update_common(dentry->d_inode);
36612+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
36613+ reiser4_exit_context(ctx);
36614+ return RETERR(-ENOSPC);
36615+ }
36616+ write_sd_by_inode_common(dentry->d_inode);
36617+
36618+ atom = get_current_atom_locked();
36619+ spin_lock_txnh(ctx->trans);
36620+ force_commit_atom(ctx->trans);
36621+ reiser4_exit_context(ctx);
36622+ return 0;
36623+}
36624+
36625+/**
36626+ * readpage_unix_file - readpage of struct address_space_operations
36627+ * @file: file the page belongs to
36628+ * @page: page to read
36629+ *
36630+ * Compose a key and search for item containing information about @page
36631+ * data. If item is found - its readpage method is called.
36632+ */
71430cf6 36633+int readpage_unix_file(struct file *file, struct page *page)
44254afd
MT
36634+{
36635+ reiser4_context *ctx;
36636+ int result;
36637+ struct inode *inode;
36638+ reiser4_key key;
36639+ item_plugin *iplug;
36640+ hint_t *hint;
36641+ lock_handle *lh;
36642+ coord_t *coord;
36643+
36644+ assert("vs-1062", PageLocked(page));
36645+ assert("vs-976", !PageUptodate(page));
36646+ assert("vs-1061", page->mapping && page->mapping->host);
36647+
71430cf6 36648+ if (page->mapping->host->i_size <= page_offset(page)) {
44254afd
MT
36649+ /* page is out of file already */
36650+ unlock_page(page);
36651+ return -EINVAL;
36652+ }
36653+
36654+ inode = page->mapping->host;
71430cf6 36655+ ctx = reiser4_init_context(inode->i_sb);
44254afd
MT
36656+ if (IS_ERR(ctx)) {
36657+ unlock_page(page);
36658+ return PTR_ERR(ctx);
36659+ }
36660+
71430cf6 36661+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
44254afd
MT
36662+ if (hint == NULL) {
36663+ unlock_page(page);
36664+ reiser4_exit_context(ctx);
36665+ return RETERR(-ENOMEM);
36666+ }
36667+
36668+ result = load_file_hint(file, hint);
36669+ if (result) {
36670+ kfree(hint);
36671+ unlock_page(page);
36672+ reiser4_exit_context(ctx);
36673+ return result;
36674+ }
36675+ lh = &hint->lh;
36676+
36677+ /* get key of first byte of the page */
71430cf6 36678+ key_by_inode_and_offset_common(inode, page_offset(page), &key);
44254afd
MT
36679+
36680+ /* look for file metadata corresponding to first byte of page */
36681+ page_cache_get(page);
36682+ unlock_page(page);
36683+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
36684+ lock_page(page);
36685+ page_cache_release(page);
36686+
36687+ if (page->mapping == NULL) {
36688+ /*
36689+ * readpage allows truncate to run concurrently. Page was
36690+ * truncated while it was not locked
36691+ */
36692+ done_lh(lh);
36693+ kfree(hint);
36694+ unlock_page(page);
71430cf6 36695+ reiser4_txn_restart(ctx);
44254afd
MT
36696+ reiser4_exit_context(ctx);
36697+ return -EINVAL;
36698+ }
36699+
36700+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
36701+ if (result == CBK_COORD_FOUND &&
36702+ hint->ext_coord.coord.between != AT_UNIT)
36703+ /* file is truncated */
36704+ result = -EINVAL;
36705+ done_lh(lh);
36706+ kfree(hint);
36707+ unlock_page(page);
71430cf6 36708+ reiser4_txn_restart(ctx);
44254afd
MT
36709+ reiser4_exit_context(ctx);
36710+ return result;
36711+ }
36712+
36713+ /*
36714+ * item corresponding to page is found. It can not be removed because
36715+ * znode lock is held
36716+ */
36717+ if (PageUptodate(page)) {
36718+ done_lh(lh);
36719+ kfree(hint);
36720+ unlock_page(page);
71430cf6 36721+ reiser4_txn_restart(ctx);
44254afd
MT
36722+ reiser4_exit_context(ctx);
36723+ return 0;
36724+ }
36725+
36726+ coord = &hint->ext_coord.coord;
36727+ result = zload(coord->node);
36728+ if (result) {
36729+ done_lh(lh);
36730+ kfree(hint);
36731+ unlock_page(page);
71430cf6 36732+ reiser4_txn_restart(ctx);
44254afd
MT
36733+ reiser4_exit_context(ctx);
36734+ return result;
36735+ }
36736+
71430cf6 36737+ validate_extended_coord(&hint->ext_coord, page_offset(page));
44254afd
MT
36738+
36739+ if (!coord_is_existing_unit(coord)) {
36740+ /* this indicates corruption */
36741+ warning("vs-280",
36742+ "Looking for page %lu of file %llu (size %lli). "
36743+ "No file items found (%d). File is corrupted?\n",
36744+ page->index, (unsigned long long)get_inode_oid(inode),
36745+ inode->i_size, result);
36746+ zrelse(coord->node);
36747+ done_lh(lh);
36748+ kfree(hint);
36749+ unlock_page(page);
71430cf6 36750+ reiser4_txn_restart(ctx);
44254afd
MT
36751+ reiser4_exit_context(ctx);
36752+ return RETERR(-EIO);
36753+ }
36754+
36755+ /*
36756+ * get the plugin of the found item and use its readpage method,
36757+ * if it has one
36758+ */
36759+ iplug = item_plugin_by_coord(coord);
36760+ if (iplug->s.file.readpage)
36761+ result = iplug->s.file.readpage(coord, page);
36762+ else
36763+ result = RETERR(-EINVAL);
36764+
36765+ if (!result) {
36766+ set_key_offset(&key,
36767+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
71430cf6
MT
36768+ /* FIXME should call reiser4_set_hint() */
36769+ reiser4_unset_hint(hint);
44254afd
MT
36770+ } else {
36771+ unlock_page(page);
71430cf6 36772+ reiser4_unset_hint(hint);
44254afd
MT
36773+ }
36774+ assert("vs-979",
36775+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
36776+ assert("vs-9791", ergo(result != 0, !PageLocked(page)));
36777+
36778+ zrelse(coord->node);
36779+ done_lh(lh);
36780+
36781+ save_file_hint(file, hint);
36782+ kfree(hint);
36783+
36784+ /*
36785+ * FIXME: explain why this is needed. HINT: page allocation in write
36786+ * cannot be done when the atom is not NULL, because reiser4_writepage
36787+ * cannot kick entd and has to eflush
36788+ */
71430cf6 36789+ reiser4_txn_restart(ctx);
44254afd
MT
36790+ reiser4_exit_context(ctx);
36791+ return result;
36792+}
36793+
71430cf6
MT
36794+struct uf_readpages_context {
36795+ lock_handle lh;
36796+ coord_t coord;
36797+};
36798+
36799+/* A callback function for readpages_unix_file/read_cache_pages.
36800+ * If the file is built of tails, then return an error (-ENOENT).
44254afd 36801+ *
71430cf6
MT
36802+ * @data -- a pointer to the uf_readpages_context object,
36803+ * to save the twig lock and the coord between
36804+ * read_cache_pages iterations.
36805+ * @page -- page to start reading at.
44254afd 36806+ */
71430cf6 36807+static int uf_readpages_filler(void *data, struct page *page)
44254afd 36808+{
71430cf6
MT
36809+ struct uf_readpages_context *rc = data;
36810+ jnode * node;
36811+ int ret = 0;
36812+ reiser4_extent *ext;
36813+ __u64 ext_index;
36814+ int cbk_done = 0;
36815+ struct address_space * mapping = page->mapping;
36816+
36817+ if (PageUptodate(page)) {
36818+ unlock_page(page);
36819+ return 0;
36820+ }
36821+ page_cache_get(page);
36822+
36823+ if (rc->lh.node == 0) {
36824+ /* no twig lock - have to do tree search. */
36825+ reiser4_key key;
36826+ repeat:
36827+ unlock_page(page);
36828+ key_by_inode_and_offset_common(
36829+ mapping->host, page_offset(page), &key);
36830+ ret = coord_by_key(
36831+ &get_super_private(mapping->host->i_sb)->tree,
36832+ &key, &rc->coord, &rc->lh,
36833+ ZNODE_READ_LOCK, FIND_EXACT,
36834+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
36835+ if (unlikely(ret))
36836+ goto exit;
36837+ lock_page(page);
36838+ cbk_done = 1;
36839+ }
36840+ ret = zload(rc->coord.node);
36841+ if (unlikely(ret))
36842+ goto unlock;
36843+ if (!coord_is_existing_item(&rc->coord) ||
36844+ !item_is_extent(&rc->coord)) {
36845+ zrelse(rc->coord.node);
36846+ ret = RETERR(-EIO);
36847+ goto unlock;
36848+ }
36849+ ext = extent_by_coord(&rc->coord);
36850+ ext_index = extent_unit_index(&rc->coord);
36851+ if (page->index < ext_index ||
36852+ page->index >= ext_index + extent_get_width(ext)) {
36853+ /* the page index doesn't belong to the extent unit
36854+ which the coord points to - release the lock and
36855+ repeat with tree search. */
36856+ zrelse(rc->coord.node);
36857+ done_lh(&rc->lh);
36858+ /* we can be here after a CBK call only in case of
36859+ tree corruption or a bug in the tree lookup algorithm. */
36860+ if (unlikely(cbk_done)) {
36861+ ret = RETERR(-EIO);
36862+ goto unlock;
36863+ }
36864+ goto repeat;
36865+ }
36866+ node = jnode_of_page(page);
36867+ if (unlikely(IS_ERR(node))) {
36868+ zrelse(rc->coord.node);
36869+ ret = PTR_ERR(node);
36870+ goto unlock;
36871+ }
36872+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
36873+ jput(node);
36874+ zrelse(rc->coord.node);
36875+ if (likely(!ret))
36876+ goto exit;
36877+ unlock:
36878+ unlock_page(page);
36879+ exit:
36880+ page_cache_release(page);
36881+ return ret;
36882+}
36883+
36884+/**
36885+ * readpages_unix_file - called by the readahead code, starts reading for each
36886+ * page of the given list of pages
36887+ */
36888+int readpages_unix_file(
36889+ struct file *file, struct address_space *mapping,
36890+ struct list_head *pages, unsigned nr_pages)
36891+{
36892+ reiser4_context *ctx;
36893+ struct uf_readpages_context rc;
36894+ int ret;
36895+
36896+ ctx = reiser4_init_context(mapping->host->i_sb);
36897+ if (IS_ERR(ctx)) {
36898+ put_pages_list(pages);
36899+ return PTR_ERR(ctx);
36900+ }
36901+ init_lh(&rc.lh);
36902+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
36903+ done_lh(&rc.lh);
36904+ context_set_commit_async(ctx);
36905+ /* close the transaction to protect further page allocation from deadlocks */
36906+ reiser4_txn_restart(ctx);
36907+ reiser4_exit_context(ctx);
36908+ return ret;
44254afd
MT
36909+}
36910+
36911+static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
36912+ loff_t count UNUSED_ARG)
36913+{
36914+ /* we should reserve one block for the update of the stat data
36915+ item */
36916+ assert("vs-1249",
36917+ inode_file_plugin(inode)->estimate.update ==
36918+ estimate_update_common);
36919+ return estimate_update_common(inode);
36920+}
36921+
36922+/* this is called with nonexclusive access obtained, file's container can not change */
71430cf6
MT
36923+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */
36924+ char __user *buf, /* address of user-space buffer */
36925+ size_t count, /* number of bytes to read */
36926+ loff_t *off)
44254afd
MT
36927+{
36928+ int result;
36929+ struct inode *inode;
36930+ flow_t flow;
36931+ int (*read_f) (struct file *, flow_t *, hint_t *);
36932+ coord_t *coord;
36933+ znode *loaded;
36934+
36935+ inode = file->f_dentry->d_inode;
36936+
36937+ /* build flow */
36938+ assert("vs-1250",
36939+ inode_file_plugin(inode)->flow_by_inode ==
36940+ flow_by_inode_unix_file);
36941+ result =
36942+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
36943+ *off, READ_OP, &flow);
36944+ if (unlikely(result))
36945+ return result;
36946+
36947+ /* get seal and coord sealed with it from reiser4 private data
36948+ of struct file. The coord will tell us where our last read
36949+ of this file finished, and the seal will help to determine
36950+ if that location is still valid.
36951+ */
36952+ coord = &hint->ext_coord.coord;
36953+ while (flow.length && result == 0) {
36954+ result =
36955+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
36956+ if (cbk_errored(result))
36957+ /* error happened */
36958+ break;
36959+
36960+ if (coord->between != AT_UNIT) {
36961+ /* there were no items corresponding to given offset */
36962+ done_lh(hint->ext_coord.lh);
36963+ break;
36964+ }
36965+
36966+ loaded = coord->node;
36967+ result = zload(loaded);
36968+ if (unlikely(result)) {
36969+ done_lh(hint->ext_coord.lh);
36970+ break;
36971+ }
36972+
36973+ if (hint->ext_coord.valid == 0)
36974+ validate_extended_coord(&hint->ext_coord,
36975+ get_key_offset(&flow.key));
36976+
36977+ assert("vs-4", hint->ext_coord.valid == 1);
36978+ assert("vs-33", hint->ext_coord.lh == &hint->lh);
36979+ /* call item's read method */
36980+ read_f = item_plugin_by_coord(coord)->s.file.read;
36981+ result = read_f(file, &flow, hint);
36982+ zrelse(loaded);
36983+ done_lh(hint->ext_coord.lh);
36984+ }
36985+
36986+ return (count - flow.length) ? (count - flow.length) : result;
36987+}
36988+
71430cf6
MT
36989+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
36990+
44254afd
MT
36991+/**
36992+ * read_unix_file - read of struct file_operations
36993+ * @file: file to read from
36994+ * @buf: address of user-space buffer
36995+ * @read_amount: number of bytes to read
36996+ * @off: position in file to read from
36997+ *
36998+ * This is implementation of vfs's read method of struct file_operations for
36999+ * unix file plugin.
37000+ */
37001+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
37002+ loff_t *off)
37003+{
37004+ reiser4_context *ctx;
71430cf6 37005+ ssize_t result;
44254afd 37006+ struct inode *inode;
71430cf6 37007+ struct unix_file_info *uf_info;
44254afd
MT
37008+
37009+ if (unlikely(read_amount == 0))
37010+ return 0;
37011+
37012+ assert("umka-072", file != NULL);
37013+ assert("umka-074", off != NULL);
37014+ inode = file->f_dentry->d_inode;
71430cf6 37015+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
44254afd 37016+
71430cf6 37017+ ctx = reiser4_init_context(inode->i_sb);
44254afd
MT
37018+ if (IS_ERR(ctx))
37019+ return PTR_ERR(ctx);
71430cf6
MT
37020+ uf_info = unix_file_inode_data(inode);
37021+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37022+ get_exclusive_access(uf_info);
37023+ result = find_file_state(inode, uf_info);
37024+ if (unlikely(result != 0))
37025+ goto out;
37026+ } else
37027+ get_nonexclusive_access(uf_info);
37028+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
37029+ BA_CAN_COMMIT);
37030+ if (unlikely(result != 0))
37031+ goto out;
37032+ if (uf_info->container == UF_CONTAINER_EXTENTS){
37033+ result = do_sync_read(file, buf, read_amount, off);
37034+ } else if (uf_info->container == UF_CONTAINER_TAILS ||
37035+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
37036+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37037+ result = read_unix_file_container_tails(file, buf, read_amount, off);
37038+ } else {
37039+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
37040+ result = 0;
37041+ }
37042+out:
37043+ drop_access(uf_info);
37044+ context_set_commit_async(ctx);
37045+ reiser4_exit_context(ctx);
37046+ return result;
37047+}
44254afd 37048+
71430cf6
MT
37049+static ssize_t read_unix_file_container_tails(
37050+ struct file *file, char __user *buf, size_t read_amount, loff_t *off)
37051+{
37052+ int result;
37053+ struct inode *inode;
37054+ hint_t *hint;
37055+ struct unix_file_info *uf_info;
37056+ size_t count, read, left;
37057+ loff_t size;
37058+
37059+ assert("umka-072", file != NULL);
37060+ assert("umka-074", off != NULL);
37061+ inode = file->f_dentry->d_inode;
37062+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37063+
37064+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
37065+ if (hint == NULL)
44254afd 37066+ return RETERR(-ENOMEM);
44254afd
MT
37067+
37068+ result = load_file_hint(file, hint);
37069+ if (result) {
37070+ kfree(hint);
44254afd
MT
37071+ return result;
37072+ }
37073+
37074+ left = read_amount;
37075+ count = 0;
37076+ uf_info = unix_file_inode_data(inode);
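+ /*
+ * the loop below reads at most PAGE_CACHE_SIZE bytes per pass and
+ * restarts the transaction before each pass
+ */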
37077+ while (left > 0) {
71430cf6 37078+ reiser4_txn_restart_current();
44254afd 37079+ size = i_size_read(inode);
71430cf6 37080+ if (*off >= size)
44254afd 37081+ /* position to read from is past the end of file */
44254afd 37082+ break;
44254afd
MT
37083+ if (*off + left > size)
37084+ left = size - *off;
44254afd 37085+ /* faultin user page */
71430cf6
MT
37086+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
37087+ if (result)
37088+ return RETERR(-EFAULT);
44254afd
MT
37089+
37090+ read = read_file(hint, file, buf,
37091+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
37092+ off);
44254afd
MT
37093+ if (read < 0) {
37094+ result = read;
37095+ break;
37096+ }
37097+ left -= read;
37098+ buf += read;
37099+
37100+ /* update position in a file */
37101+ *off += read;
37102+ /* total number of read bytes */
37103+ count += read;
37104+ }
44254afd 37105+ done_lh(&hint->lh);
71430cf6 37106+ save_file_hint(file, hint);
44254afd 37107+ kfree(hint);
71430cf6
MT
37108+ if (count)
37109+ file_accessed(file);
44254afd
MT
37110+ /* return number of read bytes or error code if nothing is read */
37111+ return count ? count : result;
37112+}
37113+
37114+/* This function takes care of @file's pages. First of all it checks if
37115+ the filesystem is readonly and if so gets out. Otherwise, it throws out
37116+ all pages of the file if it was mapped for read, is going to be mapped
37117+ for write, and consists of tails. This is done in order not to manage
37118+ two copies of the data (one in the page cache and one in the tails
37119+ themselves) when mapping files consisting of tails.
37120+
37121+ Tail2extent conversion is also performed here if it is allowed and the
37122+ file is going to be written to or mapped for write. This function may be
37123+ called from write_unix_file() or mmap_unix_file(). */
37124+static int check_pages_unix_file(struct file *file, struct inode *inode)
37125+{
37126+ reiser4_invalidate_pages(inode->i_mapping, 0,
37127+ (inode->i_size + PAGE_CACHE_SIZE -
37128+ 1) >> PAGE_CACHE_SHIFT, 0);
37129+ return unpack(file, inode, 0 /* not forever */ );
37130+}
37131+
37132+/**
37133+ * mmap_unix_file - mmap of struct file_operations
37134+ * @file: file to mmap
37135+ * @vma:
37136+ *
37137+ * This is implementation of vfs's mmap method of struct file_operations for
37138+ * unix file plugin. It converts file to extent if necessary. Sets
37139+ * reiser4_inode's flag - REISER4_HAS_MMAP.
37140+ */
37141+int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
37142+{
37143+ reiser4_context *ctx;
37144+ int result;
37145+ struct inode *inode;
71430cf6 37146+ struct unix_file_info *uf_info;
44254afd
MT
37147+ reiser4_block_nr needed;
37148+
37149+ inode = file->f_dentry->d_inode;
71430cf6 37150+ ctx = reiser4_init_context(inode->i_sb);
44254afd
MT
37151+ if (IS_ERR(ctx))
37152+ return PTR_ERR(ctx);
37153+
37154+ uf_info = unix_file_inode_data(inode);
37155+
71430cf6 37156+ get_exclusive_access_careful(uf_info, inode);
44254afd
MT
37157+
37158+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
37159+ /*
37160+ * we need file built of extent items. If it is still built of
37161+ * tail items we have to convert it. Find what items the file
37162+ * is built of
37163+ */
37164+ result = find_file_state(inode, uf_info);
37165+ if (result != 0) {
37166+ drop_exclusive_access(uf_info);
44254afd
MT
37167+ reiser4_exit_context(ctx);
37168+ return result;
37169+ }
37170+
37171+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
37172+ uf_info->container == UF_CONTAINER_EXTENTS ||
37173+ uf_info->container == UF_CONTAINER_EMPTY));
37174+ if (uf_info->container == UF_CONTAINER_TAILS) {
37175+ /*
37176+ * invalidate all pages and convert file from tails to
37177+ * extents
37178+ */
37179+ result = check_pages_unix_file(file, inode);
37180+ if (result) {
37181+ drop_exclusive_access(uf_info);
44254afd
MT
37182+ reiser4_exit_context(ctx);
37183+ return result;
37184+ }
37185+ }
37186+ }
37187+
37188+ /*
37189+ * generic_file_mmap will do update_atime. Grab space for stat data
37190+ * update.
37191+ */
37192+ needed = inode_file_plugin(inode)->estimate.update(inode);
37193+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37194+ if (result) {
37195+ drop_exclusive_access(uf_info);
44254afd
MT
37196+ reiser4_exit_context(ctx);
37197+ return result;
37198+ }
37199+
37200+ result = generic_file_mmap(file, vma);
37201+ if (result == 0) {
37202+ /* mark file as having mapping. */
71430cf6 37203+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
44254afd
MT
37204+ }
37205+
37206+ drop_exclusive_access(uf_info);
44254afd
MT
37207+ reiser4_exit_context(ctx);
37208+ return result;
37209+}
37210+
37211+/**
37212+ * find_first_item
37213+ * @inode: inode of the file to look in
37214+ *
37215+ * Finds the file item which is responsible for the first byte of the
+ * file and returns its item id (EXTENT_POINTER_ID or FORMATTING_ID).
37216+ */
37217+static int find_first_item(struct inode *inode)
37218+{
37219+ coord_t coord;
37220+ lock_handle lh;
37221+ reiser4_key key;
37222+ int result;
37223+
37224+ coord_init_zero(&coord);
37225+ init_lh(&lh);
37226+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37227+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37228+ inode);
37229+ if (result == CBK_COORD_FOUND) {
37230+ if (coord.between == AT_UNIT) {
37231+ result = zload(coord.node);
37232+ if (result == 0) {
37233+ result = item_id_by_coord(&coord);
37234+ zrelse(coord.node);
37235+ if (result != EXTENT_POINTER_ID &&
37236+ result != FORMATTING_ID)
37237+ result = RETERR(-EIO);
37238+ }
37239+ } else
37240+ result = RETERR(-EIO);
37241+ }
37242+ done_lh(&lh);
37243+ return result;
37244+}
37245+
37246+/**
37247+ * open_unix_file
37248+ * @inode: inode of the file being opened
37249+ * @file: file to open
37250+ *
37251+ * If the filesystem is not readonly, complete an interrupted tail
37252+ * conversion if there was one
37253+ */
37254+int open_unix_file(struct inode *inode, struct file *file)
37255+{
37256+ int result;
37257+ reiser4_context *ctx;
71430cf6 37258+ struct unix_file_info *uf_info;
44254afd
MT
37259+
37260+ if (IS_RDONLY(inode))
37261+ return 0;
37262+
71430cf6 37263+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
44254afd
MT
37264+ return 0;
37265+
71430cf6 37266+ ctx = reiser4_init_context(inode->i_sb);
44254afd
MT
37267+ if (IS_ERR(ctx))
37268+ return PTR_ERR(ctx);
37269+
37270+ uf_info = unix_file_inode_data(inode);
44254afd 37271+
71430cf6 37272+ get_exclusive_access_careful(uf_info, inode);
44254afd 37273+
71430cf6 37274+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
44254afd
MT
37275+ /*
37276+ * other process completed the conversion
37277+ */
37278+ drop_exclusive_access(uf_info);
37279+ reiser4_exit_context(ctx);
37280+ return 0;
37281+ }
37282+
37283+ /*
37284+ * file left in semi converted state after unclean shutdown or another
37285+ * thread is doing conversion and dropped exclusive access which doing
37286+ * balance dirty pages. Complete the conversion
37287+ */
37288+ result = find_first_item(inode);
37289+ if (result == EXTENT_POINTER_ID)
37290+ /*
37291+ * first item is extent, therefore there was incomplete
37292+ * tail2extent conversion. Complete it
37293+ */
37294+ result = tail2extent(unix_file_inode_data(inode));
37295+ else if (result == FORMATTING_ID)
37296+ /*
37297+ * first item is formatting item, therefore there was
37298+ * incomplete extent2tail conversion. Complete it
37299+ */
37300+ result = extent2tail(unix_file_inode_data(inode));
37301+ else
37302+ result = -EIO;
37303+
37304+ assert("vs-1712",
71430cf6
MT
37305+ ergo(result == 0,
37306+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
37307+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
44254afd
MT
37308+ drop_exclusive_access(uf_info);
37309+ reiser4_exit_context(ctx);
37310+ return result;
37311+}
37312+
37313+#define NEITHER_OBTAINED 0
37314+#define EA_OBTAINED 1
37315+#define NEA_OBTAINED 2
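+
+/*
+ * the three values above track what kind of access the write loop in
+ * write_unix_file() currently holds on the file: none, exclusive (EA)
+ * or nonexclusive (NEA)
+ */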
37316+
71430cf6 37317+static void drop_access(struct unix_file_info *uf_info)
44254afd
MT
37318+{
37319+ if (uf_info->exclusive_use)
37320+ drop_exclusive_access(uf_info);
37321+ else
37322+ drop_nonexclusive_access(uf_info);
37323+}
37324+
37325+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37326+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
37327+
44254afd
MT
37328+/**
37329+ * write_unix_file - write of struct file_operations
37330+ * @file: file to write to
37331+ * @buf: address of user-space buffer
37332+ * @write_amount: number of bytes to write
37333+ * @off: position in file to write to
37334+ *
37335+ * This is implementation of vfs's write method of struct file_operations for
37336+ * unix file plugin.
37337+ */
37338+ssize_t write_unix_file(struct file *file, const char __user *buf,
37339+ size_t count, loff_t *pos)
37340+{
37341+ int result;
37342+ reiser4_context *ctx;
37343+ struct inode *inode;
71430cf6 37344+ struct unix_file_info *uf_info;
44254afd
MT
37345+ ssize_t written;
37346+ int try_free_space;
37347+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37348+ size_t left;
37349+ ssize_t (*write_op)(struct file *, const char __user *, size_t,
37350+ loff_t *pos);
37351+ int ea;
37352+ loff_t new_size;
37353+
37354+ inode = file->f_dentry->d_inode;
71430cf6 37355+ ctx = reiser4_init_context(inode->i_sb);
44254afd
MT
37356+ if (IS_ERR(ctx))
37357+ return PTR_ERR(ctx);
37358+
37359+ mutex_lock(&inode->i_mutex);
37360+
71430cf6
MT
37361+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37362+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
44254afd
MT
37363+
37364+ /* check amount of bytes to write and writing position */
37365+ result = generic_write_checks(file, pos, &count, 0);
37366+ if (result) {
37367+ mutex_unlock(&inode->i_mutex);
37368+ context_set_commit_async(ctx);
37369+ reiser4_exit_context(ctx);
37370+ return result;
37371+ }
37372+
37373+ result = remove_suid(file->f_dentry);
37374+ if (result) {
37375+ mutex_unlock(&inode->i_mutex);
37376+ context_set_commit_async(ctx);
37377+ reiser4_exit_context(ctx);
37378+ return result;
37379+ }
71430cf6
MT
37380+ /* remove_suid might create a transaction */
37381+ reiser4_txn_restart(ctx);
44254afd
MT
37382+
37383+ uf_info = unix_file_inode_data(inode);
37384+
37385+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
37386+ written = 0;
37387+ try_free_space = 0;
37388+ left = count;
37389+ ea = NEITHER_OBTAINED;
37390+
37391+ new_size = i_size_read(inode);
37392+ if (*pos + count > new_size)
37393+ new_size = *pos + count;
37394+
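+ /*
+ * write in chunks of at most WRITE_GRANULARITY pages; access to the
+ * file is re-acquired and dirty pages are balanced after each chunk
+ */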
37395+ while (left) {
37396+ if (left < to_write)
37397+ to_write = left;
37398+
37399+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37400+ get_exclusive_access(uf_info);
37401+ ea = EA_OBTAINED;
37402+ if (uf_info->container != UF_CONTAINER_EMPTY) {
37403+ /* file is made not empty by another process */
37404+ drop_exclusive_access(uf_info);
37405+ ea = NEITHER_OBTAINED;
37406+ continue;
37407+ }
37408+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37409+ /*
37410+ * get exclusive access directly just so as not to have to
37411+ * re-obtain it if the file turns out to be empty
37412+ */
37413+ get_exclusive_access(uf_info);
37414+ ea = EA_OBTAINED;
37415+ result = find_file_state(inode, uf_info);
37416+ if (result) {
37417+ drop_exclusive_access(uf_info);
37418+ ea = NEITHER_OBTAINED;
37419+ break;
37420+ }
37421+ } else {
37422+ get_nonexclusive_access(uf_info);
37423+ ea = NEA_OBTAINED;
37424+ }
37425+
37426+ /* either EA or NEA is obtained. Choose item write method */
37427+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
37428+ /* file is built of extent items */
71430cf6 37429+ write_op = reiser4_write_extent;
44254afd
MT
37430+ } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37431+ /* file is empty */
37432+ if (should_have_notail(uf_info, new_size))
71430cf6 37433+ write_op = reiser4_write_extent;
44254afd 37434+ else
71430cf6 37435+ write_op = reiser4_write_tail;
44254afd
MT
37436+ } else {
37437+ /* file is built of tail items */
37438+ if (should_have_notail(uf_info, new_size)) {
37439+ if (ea == NEA_OBTAINED) {
37440+ drop_nonexclusive_access(uf_info);
37441+ get_exclusive_access(uf_info);
37442+ ea = EA_OBTAINED;
37443+ }
37444+ if (uf_info->container == UF_CONTAINER_TAILS) {
37445+ /*
37446+ * if the file is being converted by another
37447+ * process - wait until it completes
37448+ */
37449+ while (1) {
71430cf6
MT
37450+ if (reiser4_inode_get_flag(inode,
37451+ REISER4_PART_IN_CONV)) {
44254afd
MT
37452+ drop_exclusive_access(uf_info);
37453+ schedule();
37454+ get_exclusive_access(uf_info);
37455+ continue;
37456+ }
37457+ break;
71430cf6 37458+ }
44254afd
MT
37459+ if (uf_info->container == UF_CONTAINER_TAILS) {
37460+ result = tail2extent(uf_info);
37461+ if (result)
37462+ break;
37463+ }
37464+ }
37465+ drop_exclusive_access(uf_info);
37466+ ea = NEITHER_OBTAINED;
37467+ continue;
37468+ }
71430cf6 37469+ write_op = reiser4_write_tail;
44254afd
MT
37470+ }
37471+
37472+ written = write_op(file, buf, to_write, pos);
37473+ if (written == -ENOSPC && try_free_space) {
37474+ drop_access(uf_info);
37475+ txnmgr_force_commit_all(inode->i_sb, 0);
37476+ try_free_space = 0;
37477+ continue;
37478+ }
37479+ if (written < 0) {
37480+ drop_access(uf_info);
37481+ result = written;
37482+ break;
37483+ }
37484+ /* something is written. */
37485+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37486+ assert("", ea == EA_OBTAINED);
71430cf6
MT
37487+ uf_info->container =
37488+ (write_op == reiser4_write_extent) ?
44254afd
MT
37489+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37490+ } else {
37491+ assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
71430cf6 37492+ write_op == reiser4_write_extent));
44254afd 37493+ assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
71430cf6 37494+ write_op == reiser4_write_tail));
44254afd
MT
37495+ }
37496+ if (*pos + written > inode->i_size)
37497+ INODE_SET_FIELD(inode, i_size, *pos + written);
37498+ file_update_time(file);
37499+ result = reiser4_update_sd(inode);
37500+ if (result) {
37501+ mutex_unlock(&inode->i_mutex);
37502+ current->backing_dev_info = NULL;
37503+ drop_access(uf_info);
37504+ context_set_commit_async(ctx);
37505+ reiser4_exit_context(ctx);
37506+ return result;
37507+ }
37508+ drop_access(uf_info);
37509+ ea = NEITHER_OBTAINED;
71430cf6 37510+ reiser4_txn_restart(ctx);
44254afd
MT
37511+ current->journal_info = NULL;
37512+ /*
37513+ * tell the VM how many pages were dirtied. Maybe the number of
37514+ * pages which were already dirty should not be counted
37515+ */
71430cf6
MT
37516+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
37517+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
44254afd
MT
37518+ current->journal_info = ctx;
37519+
37520+ left -= written;
37521+ buf += written;
37522+ *pos += written;
37523+ }
37524+
37525+ mutex_unlock(&inode->i_mutex);
37526+
37527+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37528+ reiser4_txn_restart_current();
37529+ grab_space_enable();
37530+ result = sync_unix_file(file, file->f_dentry,
37531+ 0 /* data and stat data */ );
37532+ if (result)
37533+ warning("reiser4-7", "failed to sync file %llu",
37534+ (unsigned long long)get_inode_oid(inode));
37535+ }
37536+
37537+ current->backing_dev_info = NULL;
37538+
37539+ reiser4_exit_context(ctx);
37540+
37541+ /*
37542+ * return number of written bytes or error code if nothing is
37543+ * written. Note that this does not work correctly when
37544+ * sync_unix_file returns an error
37545+ */
37546+ return (count - left) ? (count - left) : result;
37547+}
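
The -ENOSPC branch in write_unix_file() above is the one subtle piece of the loop: on the first failure it forces all atoms to commit, which returns grabbed-but-unused space to the free pool, and then retries exactly once. A minimal sketch of that pattern, not part of the patch; do_write() and force_commit() are hypothetical stand-ins for the reiser4_write_*() methods and txnmgr_force_commit_all():

    /* sketch only: retry a failed write once after forcing commits
     * to reclaim reserved space; both helpers are hypothetical */
    static ssize_t write_with_space_retry(void *cookie, const char *buf,
                                          size_t len, loff_t *pos)
    {
            int try_free_space = 1; /* permit a single retry */
            ssize_t written;

            for (;;) {
                    written = do_write(cookie, buf, len, pos);
                    if (written == -ENOSPC && try_free_space) {
                            force_commit(cookie); /* frees grabbed space */
                            try_free_space = 0;
                            continue;
                    }
                    return written;
            }
    }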
37548+
37549+/**
37550+ * release_unix_file - release of struct file_operations
37551+ * @inode: inode of released file
37552+ * @file: file to release
37553+ *
37554+ * Implementation of release method of struct file_operations for unix file
37555+ * plugin. If the last reference to the inode is released, convert all extent items
37556+ * into tail items if necessary. Frees reiser4 specific file data.
37557+ */
37558+int release_unix_file(struct inode *inode, struct file *file)
37559+{
37560+ reiser4_context *ctx;
37561+ struct unix_file_info *uf_info;
37562+ int result;
37563+ int in_reiser4;
37564+
37565+ in_reiser4 = is_in_reiser4_context();
37566+
37567+ ctx = reiser4_init_context(inode->i_sb);
37568+ if (IS_ERR(ctx))
37569+ return PTR_ERR(ctx);
37570+
37571+ result = 0;
37572+ if (in_reiser4 == 0) {
37573+ uf_info = unix_file_inode_data(inode);
37574+
37575+ get_exclusive_access_careful(uf_info, inode);
37576+ if (atomic_read(&file->f_dentry->d_count) == 1 &&
37577+ uf_info->container == UF_CONTAINER_EXTENTS &&
37578+ !should_have_notail(uf_info, inode->i_size) &&
37579+ !rofs_inode(inode)) {
37580+ result = extent2tail(uf_info);
37581+ if (result != 0) {
37582+ warning("nikita-3233",
37583+ "Failed (%d) to convert in %s (%llu)",
37584+ result, __FUNCTION__,
37585+ (unsigned long long)
37586+ get_inode_oid(inode));
37587+ }
37588+ }
37589+ drop_exclusive_access(uf_info);
37590+ } else {
37591+ /*
37592+ we are within reiser4 context already. How is the latter
37593+ possible? Simple:
37594+
37595+ (gdb) bt
37596+ #0 get_exclusive_access ()
37597+ #2 0xc01e56d3 in release_unix_file ()
37598+ #3 0xc01c3643 in reiser4_release ()
37599+ #4 0xc014cae0 in __fput ()
37600+ #5 0xc013ffc3 in remove_vm_struct ()
37601+ #6 0xc0141786 in exit_mmap ()
37602+ #7 0xc0118480 in mmput ()
37603+ #8 0xc0133205 in oom_kill ()
37604+ #9 0xc01332d1 in out_of_memory ()
37605+ #10 0xc013bc1d in try_to_free_pages ()
37606+ #11 0xc013427b in __alloc_pages ()
37607+ #12 0xc013f058 in do_anonymous_page ()
37608+ #13 0xc013f19d in do_no_page ()
37609+ #14 0xc013f60e in handle_mm_fault ()
37610+ #15 0xc01131e5 in do_page_fault ()
37611+ #16 0xc0104935 in error_code ()
37612+ #17 0xc025c0c6 in __copy_to_user_ll ()
37613+ #18 0xc01d496f in reiser4_read_tail ()
37614+ #19 0xc01e4def in read_unix_file ()
37615+ #20 0xc01c3504 in reiser4_read ()
37616+ #21 0xc014bd4f in vfs_read ()
37617+ #22 0xc014bf66 in sys_read ()
37618+ */
37619+ warning("vs-44", "out of memory?");
37620+ }
37621+
37622+ reiser4_free_file_fsdata(file);
37623+
37624+ reiser4_exit_context(ctx);
37625+ return result;
37626+}
37627+
37628+static void set_file_notail(struct inode *inode)
37629+{
37630+ reiser4_inode *state;
37631+ formatting_plugin *tplug;
37632+
37633+ state = reiser4_inode_data(inode);
37634+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
37635+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
37636+}
37637+
37638+/* if file is built of tails - convert it to extents */
37639+static int unpack(struct file *filp, struct inode *inode, int forever)
37640+{
37641+ int result = 0;
37642+ struct unix_file_info *uf_info;
37643+
37644+ uf_info = unix_file_inode_data(inode);
37645+ assert("vs-1628", ea_obtained(uf_info));
37646+
37647+ result = find_file_state(inode, uf_info);
37648+ if (result)
37649+ return result;
37650+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
37651+
37652+ if (uf_info->container == UF_CONTAINER_TAILS) {
37653+ /*
37654+ * if file is being converted by another process - wait until it
37655+ * completes
37656+ */
37657+ while (1) {
37658+ if (reiser4_inode_get_flag(inode,
37659+ REISER4_PART_IN_CONV)) {
37660+ drop_exclusive_access(uf_info);
37661+ schedule();
37662+ get_exclusive_access(uf_info);
37663+ continue;
37664+ }
37665+ break;
37666+ }
37667+ if (uf_info->container == UF_CONTAINER_TAILS) {
37668+ result = tail2extent(uf_info);
37669+ if (result)
37670+ return result;
37671+ }
37672+ }
37673+ if (forever) {
37674+ /* save new formatting plugin in stat data */
37675+ __u64 tograb;
37676+
37677+ set_file_notail(inode);
37678+
37679+ grab_space_enable();
37680+ tograb = inode_file_plugin(inode)->estimate.update(inode);
37681+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
37682+ result = reiser4_update_sd(inode);
37683+ }
37684+
37685+ return result;
37686+}
37687+
37688+/* implementation of vfs' ioctl method of struct file_operations for unix file
37689+ plugin
37690+*/
37691+int
37692+ioctl_unix_file(struct inode *inode, struct file *filp,
37693+ unsigned int cmd, unsigned long arg UNUSED_ARG)
37694+{
37695+ reiser4_context *ctx;
37696+ int result;
37697+
37698+ ctx = reiser4_init_context(inode->i_sb);
37699+ if (IS_ERR(ctx))
37700+ return PTR_ERR(ctx);
37701+
37702+ switch (cmd) {
37703+ case REISER4_IOC_UNPACK:
37704+ get_exclusive_access(unix_file_inode_data(inode));
37705+ result = unpack(filp, inode, 1 /* forever */ );
37706+ drop_exclusive_access(unix_file_inode_data(inode));
37707+ break;
37708+
37709+ default:
37710+ result = RETERR(-ENOSYS);
37711+ break;
37712+ }
37713+ reiser4_exit_context(ctx);
37714+ return result;
37715+}
37716+
37717+/* implementation of vfs' bmap method of struct address_space_operations for unix
37718+ file plugin
37719+*/
37720+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
37721+{
37722+ reiser4_context *ctx;
37723+ sector_t result;
37724+ reiser4_key key;
37725+ coord_t coord;
37726+ lock_handle lh;
37727+ struct inode *inode;
37728+ item_plugin *iplug;
37729+ sector_t block;
37730+
37731+ inode = mapping->host;
37732+
37733+ ctx = reiser4_init_context(inode->i_sb);
37734+ if (IS_ERR(ctx))
37735+ return PTR_ERR(ctx);
37736+ key_by_inode_and_offset_common(inode,
37737+ (loff_t) lblock * current_blocksize,
37738+ &key);
37739+
37740+ init_lh(&lh);
37741+ result =
37742+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
37743+ if (cbk_errored(result)) {
37744+ done_lh(&lh);
37745+ reiser4_exit_context(ctx);
37746+ return result;
37747+ }
37748+
37749+ result = zload(coord.node);
37750+ if (result) {
37751+ done_lh(&lh);
37752+ reiser4_exit_context(ctx);
37753+ return result;
37754+ }
37755+
37756+ iplug = item_plugin_by_coord(&coord);
37757+ if (iplug->s.file.get_block) {
37758+ result = iplug->s.file.get_block(&coord, lblock, &block);
37759+ if (result == 0)
37760+ result = block;
37761+ } else
37762+ result = RETERR(-EINVAL);
37763+
37764+ zrelse(coord.node);
37765+ done_lh(&lh);
37766+ reiser4_exit_context(ctx);
37767+ return result;
37768+}
37769+
37770+/**
37771+ * flow_by_inode_unix_file - initialize structure flow
37772+ * @inode: inode of the file which is about to be read or written
37773+ * @buf: buffer to perform read to or write from
37774+ * @user: flag showing whether @buf is user space or kernel space
37775+ * @size: size of buffer @buf
37776+ * @off: start offset for read or write
37777+ * @op: READ or WRITE
37778+ * @flow:
37779+ *
37780+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
37781+ */
37782+int flow_by_inode_unix_file(struct inode *inode,
37783+ const char __user *buf, int user,
37784+ loff_t size, loff_t off,
37785+ rw_op op, flow_t *flow)
37786+{
37787+ assert("nikita-1100", inode != NULL);
37788+
37789+ flow->length = size;
37790+ memcpy(&flow->data, &buf, sizeof(buf));
37791+ flow->user = user;
37792+ flow->op = op;
37793+ assert("nikita-1931", inode_file_plugin(inode) != NULL);
37794+ assert("nikita-1932",
37795+ inode_file_plugin(inode)->key_by_inode ==
37796+ key_by_inode_and_offset_common);
37797+ /* calculate key of write position and insert it into flow->key */
37798+ return key_by_inode_and_offset_common(inode, off, &flow->key);
37799+}
37800+
37801+/* plugin->u.file.set_plug_in_sd = NULL
37802+ plugin->u.file.set_plug_in_inode = NULL
37803+ plugin->u.file.create_blank_sd = NULL */
37804+/* plugin->u.file.delete */
37805+/*
37806+ plugin->u.file.add_link = reiser4_add_link_common
37807+ plugin->u.file.rem_link = NULL */
37808+
37809+/* plugin->u.file.owns_item
37810+ this is common_file_owns_item with assertion */
37811+/* Audited by: green(2002.06.15) */
37812+int
37813+owns_item_unix_file(const struct inode *inode /* object to check against */ ,
37814+ const coord_t * coord /* coord to check */ )
37815+{
37816+ int result;
37817+
37818+ result = owns_item_common(inode, coord);
37819+ if (!result)
37820+ return 0;
37821+ if (!plugin_of_group(item_plugin_by_coord(coord),
37822+ UNIX_FILE_METADATA_ITEM_TYPE))
37823+ return 0;
37824+ assert("vs-547",
37825+ item_id_by_coord(coord) == EXTENT_POINTER_ID ||
37826+ item_id_by_coord(coord) == FORMATTING_ID);
37827+ return 1;
37828+}
37829+
37830+static int setattr_truncate(struct inode *inode, struct iattr *attr)
37831+{
37832+ int result;
37833+ int s_result;
37834+ loff_t old_size;
37835+ reiser4_tree *tree;
37836+
37837+ inode_check_scale(inode, inode->i_size, attr->ia_size);
37838+
37839+ old_size = inode->i_size;
37840+ tree = reiser4_tree_by_inode(inode);
37841+
37842+ result = safe_link_grab(tree, BA_CAN_COMMIT);
37843+ if (result == 0)
37844+ result = safe_link_add(inode, SAFE_TRUNCATE);
37845+ if (result == 0)
37846+ result = truncate_file_body(inode, attr->ia_size);
37847+ if (result)
37848+ warning("vs-1588", "truncate_file failed: oid %lli, "
37849+ "old size %lld, new size %lld, retval %d",
37850+ (unsigned long long)get_inode_oid(inode),
37851+ old_size, attr->ia_size, result);
37852+
37853+ s_result = safe_link_grab(tree, BA_CAN_COMMIT);
37854+ if (s_result == 0)
37855+ s_result =
37856+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
37857+ if (s_result != 0) {
37858+ warning("nikita-3417", "Cannot kill safelink %lli: %i",
37859+ (unsigned long long)get_inode_oid(inode), s_result);
37860+ }
37861+ safe_link_release(tree);
37862+ return result;
37863+}
37864+
37865+/* plugin->u.file.setattr method */
37866+/* This calls inode_setattr and if truncate is in effect it also takes
37867+ exclusive inode access to avoid races */
37868+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
37869+ struct iattr *attr /* change description */ )
37870+{
37871+ int result;
37872+
37873+ if (attr->ia_valid & ATTR_SIZE) {
37874+ reiser4_context *ctx;
37875+ struct unix_file_info *uf_info;
37876+
37877+ /* truncate does reservation itself and requires exclusive
37878+ access obtained */
37879+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
37880+ if (IS_ERR(ctx))
37881+ return PTR_ERR(ctx);
37882+
37883+ uf_info = unix_file_inode_data(dentry->d_inode);
37884+ get_exclusive_access_careful(uf_info, dentry->d_inode);
37885+ result = setattr_truncate(dentry->d_inode, attr);
37886+ drop_exclusive_access(uf_info);
37887+ context_set_commit_async(ctx);
37888+ reiser4_exit_context(ctx);
37889+ } else
37890+ result = reiser4_setattr_common(dentry, attr);
37891+
37892+ return result;
37893+}
37894+
37895+/* plugin->u.file.init_inode_data */
37896+void
37897+init_inode_data_unix_file(struct inode *inode,
37898+ reiser4_object_create_data * crd, int create)
37899+{
37900+ struct unix_file_info *data;
37901+
37902+ data = unix_file_inode_data(inode);
37903+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
37904+ init_rwsem(&data->latch);
37905+ data->tplug = inode_formatting_plugin(inode);
37906+ data->exclusive_use = 0;
37907+
37908+#if REISER4_DEBUG
37909+ data->ea_owner = NULL;
37910+ atomic_set(&data->nr_neas, 0);
37911+#endif
37912+ init_inode_ordering(inode, crd, create);
37913+}
37914+
37915+/**
37916+ * delete_object_unix_file - delete_object of file_plugin
37917+ * @inode: inode to be deleted
37918+ *
37919+ * Truncates file to length 0, removes stat data and safe link.
37920+ */
37921+int delete_object_unix_file(struct inode *inode)
37922+{
37923+ struct unix_file_info *uf_info;
37924+ int result;
37925+
37926+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
37927+ return 0;
37928+
37929+ /* truncate file body first */
37930+ uf_info = unix_file_inode_data(inode);
37931+ get_exclusive_access(uf_info);
37932+ result = truncate_file_body(inode, 0 /* size */ );
37933+ drop_exclusive_access(uf_info);
37934+
37935+ if (result)
37936+ warning("", "failed to truncate file (%llu) on removal: %d",
37937+ get_inode_oid(inode), result);
37938+
37939+ /* remove stat data and safe link */
37940+ return reiser4_delete_object_common(inode);
37941+}
37942+
37943+/**
37944+ * sendfile_unix_file - sendfile of struct file_operations
37945+ * @file: file to be sent
37946+ * @ppos: position to start from
37947+ * @count: number of bytes to send
37948+ * @actor: function to copy data
37949+ * @target: where to copy read data
37950+ *
37951+ * Reads @count bytes from @file and calls @actor for every page read. This is
37952+ * needed to support loopback devices.
37953+ */
37954+ssize_t
37955+sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
37956+ read_actor_t actor, void *target)
37957+{
37958+ reiser4_context *ctx;
37959+ ssize_t result;
37960+ struct inode *inode;
37961+ struct unix_file_info *uf_info;
37962+
37963+ inode = file->f_dentry->d_inode;
37964+ ctx = reiser4_init_context(inode->i_sb);
37965+ if (IS_ERR(ctx))
37966+ return PTR_ERR(ctx);
37967+
37968+ /*
37969+ * generic_file_sendfile may want to call update_atime. Grab space for
37970+ * stat data update
37971+ */
37972+ result = reiser4_grab_space(estimate_update_common(inode),
37973+ BA_CAN_COMMIT);
37974+ if (result)
37975+ goto error;
37976+ mutex_lock(&inode->i_mutex);
37977+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
37978+ mutex_unlock(&inode->i_mutex);
37979+
37980+ uf_info = unix_file_inode_data(inode);
37981+ get_nonexclusive_access(uf_info);
37982+ result = generic_file_sendfile(file, ppos, count, actor, target);
37983+ drop_nonexclusive_access(uf_info);
37984+ error:
37985+ reiser4_exit_context(ctx);
37986+ return result;
37987+}
37988+
37989+int
37990+prepare_write_unix_file(struct file *file, struct page *page,
37991+ unsigned from, unsigned to)
37992+{
37993+ reiser4_context *ctx;
37994+ struct unix_file_info *uf_info;
37995+ int ret;
37996+
37997+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37998+ if (IS_ERR(ctx))
37999+ return PTR_ERR(ctx);
38000+
38001+ uf_info = unix_file_inode_data(file->f_dentry->d_inode);
38002+ get_exclusive_access(uf_info);
38003+ ret = find_file_state(file->f_dentry->d_inode, uf_info);
38004+ if (ret == 0) {
38005+ if (uf_info->container == UF_CONTAINER_TAILS)
38006+ ret = -EINVAL;
38007+ else
38008+ ret = do_prepare_write(file, page, from, to);
38009+ }
38010+ drop_exclusive_access(uf_info);
38011+
38012+ /* don't commit transaction under inode semaphore */
38013+ context_set_commit_async(ctx);
38014+ reiser4_exit_context(ctx);
38015+ return ret;
38016+}
38017+
38018+/*
38019+ * Local variables:
38020+ * c-indentation-style: "K&R"
38021+ * mode-name: "LC"
38022+ * c-basic-offset: 8
38023+ * tab-width: 8
38024+ * fill-column: 79
38025+ * scroll-step: 1
38026+ * End:
38027+ */
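
write_unix_file() and unpack() above share the same wait idiom for REISER4_PART_IN_CONV: drop the exclusive latch so the converting thread can finish, yield, retake the latch, then re-check both the flag and the container state, which may have changed while the latch was dropped. A condensed sketch under those assumptions; struct conv_state, lock_exclusive(), unlock_exclusive() and conversion_in_progress() are hypothetical stand-ins for the primitives used in the patch:

    /* sketch only: wait out a concurrent tail<->extent conversion */
    static void wait_for_conversion(struct conv_state *s)
    {
            lock_exclusive(s);
            while (conversion_in_progress(s)) {
                    unlock_exclusive(s);    /* let the converter finish */
                    schedule();             /* yield the CPU */
                    lock_exclusive(s);      /* retake, then re-check */
            }
            /* callers must still re-check the container state here:
             * it may have changed while the latch was dropped */
    }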
38028diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.22/fs/reiser4/plugin/file/file_conversion.c
38029--- linux-2.6.22.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 03:00:00.000000000 +0300
38030+++ linux-2.6.22/fs/reiser4/plugin/file/file_conversion.c 2007-07-29 00:25:34.928709936 +0400
38031@@ -0,0 +1,609 @@
38032+/* Copyright 2001, 2002, 2003 by Hans Reiser,
38033+ licensing governed by reiser4/README */
38034+
38035+/* This file contains hooks that convert (*) cryptcompress files to unix-files,
38036+ and a set of protected (**) methods of a cryptcompress file plugin to perform
38037+ such conversion.
38038+
38039+(*)
38040+ The conversion is performed for incompressible files to reduce cpu and memory
38041+ usage. If the first logical cluster (64K by default) of a file is incompressible,
38042+ then we decide that the whole file is incompressible.
38043+ The conversion can be enabled via installing a special compression mode
38044+ plugin (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for
38045+ details).
38046+
38047+(**)
38048+ The protection means serialization of critical sections (readers and writers
38049+ of @pset->file)
38050+*/
38051+
38052+#include "../../inode.h"
38053+#include "../cluster.h"
38054+#include "file.h"
38055+
38056+#define conversion_enabled(inode) \
38057+ (inode_compression_mode_plugin(inode) == \
38058+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
38059+
38060+
38061+/* Located sections (readers and writers of @pset->file) are not
38062+ permanently critical: cryptcompress file can be converted only
38063+ if the conversion is enabled (see the macro above). And we don't
38064+ convert unix files at all.
38065+ The following helper macro is a sanity check to decide if we
38066+ need to protect a located section.
38067+*/
38068+#define should_protect(inode) \
38069+ (inode_file_plugin(inode) == \
38070+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
38071+ conversion_enabled(inode))
38072+
38073+/* All protected methods have prefix "prot" in their names.
38074+ It is convenient to construct them from the usual (unprotected) ones
38075+ using the following common macros:
38076+*/
38077+
38078+/* Macro for passive protection.
38079+ method_cryptcompress contains only readers */
38080+#define PROT_PASSIVE(type, method, args) \
38081+({ \
38082+ type _result; \
38083+ struct rw_semaphore * guard = \
38084+ &reiser4_inode_data(inode)->conv_sem; \
38085+ \
38086+ if (should_protect(inode)) { \
38087+ down_read(guard); \
38088+ if (!should_protect(inode)) \
38089+ up_read(guard); \
38090+ } \
38091+ if (inode_file_plugin(inode) == \
38092+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38093+ _result = method ## _unix_file args; \
38094+ else \
38095+ _result = method ## _cryptcompress args; \
38096+ if (should_protect(inode)) \
38097+ up_read(guard); \
38098+ _result; \
38099+})
38100+
38101+#define PROT_PASSIVE_VOID(method, args) \
38102+({ \
38103+ struct rw_semaphore * guard = \
38104+ &reiser4_inode_data(inode)->conv_sem; \
38105+ \
38106+ if (should_protect(inode)) { \
38107+ down_read(guard); \
38108+ if (!should_protect(inode)) \
38109+ up_read(guard); \
38110+ } \
38111+ if (inode_file_plugin(inode) == \
38112+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38113+ method ## _unix_file args; \
38114+ else \
38115+ method ## _cryptcompress args; \
38116+ if (should_protect(inode)) \
38117+ up_read(guard); \
38118+})
38119+
38120+/* Macro for active protection.
38121+ active_expr contains readers and writers; after its
38122+ evaluation conversion should be disabled */
38123+#define PROT_ACTIVE(type, method, args, active_expr) \
38124+({ \
38125+ type _result = 0; \
38126+ struct rw_semaphore * guard = \
38127+ &reiser4_inode_data(inode)->conv_sem; \
38128+ reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
38129+ if (IS_ERR(ctx)) \
38130+ return PTR_ERR(ctx); \
38131+ \
38132+ if (should_protect(inode)) { \
38133+ down_write(guard); \
38134+ if (should_protect(inode)) \
38135+ _result = active_expr; \
38136+ up_write(guard); \
38137+ } \
38138+ if (_result == 0) { \
38139+ if (inode_file_plugin(inode) == \
38140+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38141+ _result = method ## _unix_file args; \
38142+ else \
38143+ _result = method ## _cryptcompress args; \
38144+ } \
38145+ reiser4_exit_context(ctx); \
38146+ _result; \
38147+})
38148+
38149+/* Pass management to the unix-file plugin with "notail" policy */
38150+static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
38151+{
38152+ int result;
38153+ reiser4_inode *info;
38154+ struct unix_file_info * uf;
38155+ info = reiser4_inode_data(inode);
38156+
38157+ result = aset_set_unsafe(&info->pset,
38158+ PSET_FILE,
38159+ (reiser4_plugin *)
38160+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
38161+ if (result)
38162+ return result;
38163+ result = aset_set_unsafe(&info->pset,
38164+ PSET_FORMATTING,
38165+ (reiser4_plugin *)
38166+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
38167+ if (result)
38168+ return result;
38169+ /* get rid of non-standard plugins */
38170+ info->plugin_mask &= ~cryptcompress_mask;
38171+ /* get rid of plugin stat-data extension */
38172+ info->extmask &= ~(1 << PLUGIN_STAT);
38173+
38174+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
38175+
38176+ /* FIXME use init_inode_data_unix_file() instead,
38177+ but avoid init_inode_ordering() */
38178+ /* Init unix-file specific part of inode */
38179+ uf = unix_file_inode_data(inode);
38180+ uf->container = UF_CONTAINER_UNKNOWN;
38181+ init_rwsem(&uf->latch);
38182+ uf->tplug = inode_formatting_plugin(inode);
38183+ uf->exclusive_use = 0;
38184+#if REISER4_DEBUG
38185+ uf->ea_owner = NULL;
38186+ atomic_set(&uf->nr_neas, 0);
38187+#endif
38188+ inode->i_op =
38189+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->inode_ops;
38190+ inode->i_fop =
38191+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->file_ops;
38192+ inode->i_mapping->a_ops =
38193+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->as_ops;
38194+ file->f_op = inode->i_fop;
38195+ return 0;
38196+}
38197+
38198+#if REISER4_DEBUG
38199+static int disabled_conversion_inode_ok(struct inode * inode)
38200+{
38201+ __u64 extmask = reiser4_inode_data(inode)->extmask;
38202+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
38203+
38204+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
38205+ (extmask & (1 << UNIX_STAT)) &&
38206+ (extmask & (1 << LARGE_TIMES_STAT)) &&
38207+ (extmask & (1 << PLUGIN_STAT)) &&
38208+ (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
38209+}
38210+#endif
38211+
38212+/* Assign another mode that will control
38213+ compression at flush time only */
38214+static int disable_conversion_no_update_sd(struct inode * inode)
38215+{
38216+ int result;
38217+ result =
38218+ force_plugin_pset(inode,
38219+ PSET_COMPRESSION_MODE,
38220+ (reiser4_plugin *)compression_mode_plugin_by_id
38221+ (LATTD_COMPRESSION_MODE_ID));
38222+ assert("edward-1500",
38223+ ergo(!result, disabled_conversion_inode_ok(inode)));
38224+ return result;
38225+}
38226+
38227+/* Disable future attempts to check/convert. This function is called by
38228+ conversion hooks. */
38229+static int disable_conversion(struct inode * inode)
38230+{
38231+ return disable_conversion_no_update_sd(inode);
38232+}
38233+
38234+static int check_position(struct inode * inode,
38235+ loff_t pos /* position in the file to write from */,
38236+ struct cluster_handle * clust,
38237+ int * check_compress)
38238+{
38239+ assert("edward-1505", conversion_enabled(inode));
38240+ /*
38241+ * if file size is more than cluster size, then compressible
38242+ * status must be figured out (i.e. compression was disabled,
38243+ * or file plugin was converted to unix_file)
38244+ */
38245+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
38246+
38247+ if (pos > inode->i_size)
38248+ /* first logical cluster will contain a (partial) hole */
38249+ return disable_conversion(inode);
38250+ if (pos < inode_cluster_size(inode))
38251+ /* writing to the first logical cluster */
38252+ return 0;
38253+ /*
38254+ * here we have:
38255+ * cluster_size <= pos <= i_size <= cluster_size,
38256+ * and, hence, pos == i_size == cluster_size
38257+ */
38258+ assert("edward-1498",
38259+ pos == inode->i_size &&
38260+ pos == inode_cluster_size(inode));
38261+
38262+ *check_compress = 1;
38263+ return 0;
38264+}
38265+
38266+static void start_check_compressibility(struct inode * inode,
38267+ struct cluster_handle * clust,
38268+ hint_t * hint)
38269+{
38270+ assert("edward-1507", clust->index == 1);
38271+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
38272+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
38273+
38274+ hint_init_zero(hint);
38275+ clust->hint = hint;
38276+ clust->index --;
38277+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
38278+
38279+ /* first logical cluster (of index #0) must be complete */
38280+ assert("edward-1510", lbytes(clust->index, inode) ==
38281+ inode_cluster_size(inode));
38282+}
38283+
38284+static void finish_check_compressibility(struct inode * inode,
38285+ struct cluster_handle * clust,
38286+ hint_t * hint)
38287+{
38288+ reiser4_unset_hint(clust->hint);
38289+ clust->hint = hint;
38290+ clust->index ++;
38291+}
38292+
38293+#if REISER4_DEBUG
38294+static int prepped_dclust_ok(hint_t * hint)
38295+{
38296+ reiser4_key key;
38297+ coord_t * coord = &hint->ext_coord.coord;
38298+
38299+ item_key_by_coord(coord, &key);
38300+ return (item_id_by_coord(coord) == CTAIL_ID &&
38301+ !coord_is_unprepped_ctail(coord) &&
38302+ (get_key_offset(&key) + nr_units_ctail(coord) ==
38303+ dclust_get_extension_dsize(hint)));
38304+}
38305+#endif
38306+
38307+#define fifty_persent(size) (size >> 1)
38308+/* evaluation of data compressibility */
38309+#define data_is_compressible(osize, isize) \
38310+ (osize < fifty_persent(isize))
38311+
38312+/* This is called only once per file life.
38313+ Read first logical cluster (of index #0) and estimate its compressibility.
38314+ Save estimation result in @compressible */
38315+static int read_check_compressibility(struct inode * inode,
38316+ struct cluster_handle * clust,
38317+ int * compressible)
38318+{
38319+ int i;
38320+ int result;
38321+ __u32 dst_len;
38322+ hint_t tmp_hint;
38323+ hint_t * cur_hint = clust->hint;
38324+
38325+ start_check_compressibility(inode, clust, &tmp_hint);
38326+
38327+ reset_cluster_pgset(clust, cluster_nrpages(inode));
38328+ result = grab_page_cluster(inode, clust, READ_OP);
38329+ if (result)
38330+ return result;
38331+ /* Read page cluster here */
38332+ for (i = 0; i < clust->nr_pages; i++) {
38333+ struct page *page = clust->pages[i];
38334+ lock_page(page);
38335+ result = do_readpage_ctail(inode, clust, page,
38336+ ZNODE_READ_LOCK);
38337+ unlock_page(page);
38338+ if (result)
38339+ goto error;
38340+ }
38341+ tfm_cluster_clr_uptodate(&clust->tc);
38342+
38343+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
38344+
38345+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
38346+ /* length of compressed data is known, no need to compress */
38347+ assert("edward-1511",
38348+ znode_is_any_locked(tmp_hint.lh.node));
38349+ assert("edward-1512",
38350+ WITH_DATA(tmp_hint.ext_coord.coord.node,
38351+ prepped_dclust_ok(&tmp_hint)));
38352+ dst_len = dclust_get_extension_dsize(&tmp_hint);
38353+ }
38354+ else {
38355+ struct tfm_cluster * tc = &clust->tc;
38356+ compression_plugin * cplug = inode_compression_plugin(inode);
38357+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
38358+ if (result)
38359+ goto error;
38360+ for (i = 0; i < clust->nr_pages; i++) {
38361+ char *data;
38362+ lock_page(clust->pages[i]);
38363+ BUG_ON(!PageUptodate(clust->pages[i]));
38364+ data = kmap(clust->pages[i]);
38365+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
38366+ data, PAGE_CACHE_SIZE);
38367+ kunmap(clust->pages[i]);
38368+ unlock_page(clust->pages[i]);
38369+ }
38370+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
38371+ if (result)
38372+ goto error;
38373+ result = grab_coa(tc, cplug);
38374+ if (result)
38375+ goto error;
38376+ tc->len = tc->lsize = lbytes(clust->index, inode);
38377+ assert("edward-1513", tc->len == inode_cluster_size(inode));
38378+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
38379+ cplug->compress(get_coa(tc, cplug->h.id, tc->act),
38380+ tfm_input_data(clust), tc->len,
38381+ tfm_output_data(clust), &dst_len);
38382+ assert("edward-1514",
38383+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
38384+ }
38385+ finish_check_compressibility(inode, clust, cur_hint);
38386+ *compressible = data_is_compressible(dst_len,
38387+ inode_cluster_size(inode));
38388+ return 0;
38389+ error:
38390+ put_page_cluster(clust, inode, READ_OP);
38391+ return result;
38392+}
38393+
38394+/* Cut disk cluster of index @idx */
38395+static int cut_disk_cluster(struct inode * inode, cloff_t idx)
38396+{
38397+ reiser4_key from, to;
38398+ assert("edward-1515", inode_file_plugin(inode) ==
38399+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
38400+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
38401+ to = from;
38402+ set_key_offset(&to,
38403+ get_key_offset(&from) + inode_cluster_size(inode) - 1);
38404+ return reiser4_cut_tree(reiser4_tree_by_inode(inode),
38405+ &from, &to, inode, 0);
38406+}
38407+
38408+static int reserve_cryptcompress2unixfile(struct inode *inode)
38409+{
38410+ reiser4_block_nr unformatted_nodes;
38411+ reiser4_tree *tree;
38412+
38413+ tree = reiser4_tree_by_inode(inode);
38414+
38415+ /* number of unformatted nodes which will be created */
38416+ unformatted_nodes = cluster_nrpages(inode); /* N */
38417+
38418+ /*
38419+ * space required for one iteration of extent->tail conversion:
38420+ *
38421+ * 1. kill ctail items
38422+ *
38423+ * 2. insert N unformatted nodes
38424+ *
38425+ * 3. insert N (worst-case single-block
38426+ * extents) extent units.
38427+ *
38428+ * 4. drilling to the leaf level by coord_by_key()
38429+ *
38430+ * 5. possible update of stat-data
38431+ *
38432+ */
38433+ grab_space_enable();
38434+ return reiser4_grab_space
38435+ (2 * tree->height +
38436+ unformatted_nodes +
38437+ unformatted_nodes * estimate_one_insert_into_item(tree) +
38438+ 1 + estimate_one_insert_item(tree) +
38439+ inode_file_plugin(inode)->estimate.update(inode),
38440+ BA_CAN_COMMIT);
38441+}
38442+
38443+/* clear flag that indicated conversion and update
38444+ stat-data with new (unix-file - specific) info */
38445+static int complete_file_conversion(struct inode *inode)
38446+{
38447+ int result;
38448+
38449+ grab_space_enable();
38450+ result =
38451+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
38452+ BA_CAN_COMMIT);
38453+ if (result == 0) {
38454+ reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38455+ result = reiser4_update_sd(inode);
38456+ }
38457+ if (result)
38458+ warning("edward-1452",
38459+ "Converting %llu to unix-file: update sd failed (%i)",
38460+ (unsigned long long)get_inode_oid(inode), result);
38461+ return 0;
38462+}
38463+
38464+
38465+/* do conversion */
38466+static int cryptcompress2unixfile(struct file * file, struct inode * inode,
38467+ struct cluster_handle * clust)
38468+{
38469+ int i;
38470+ int result = 0;
38471+ struct cryptcompress_info *cr_info;
38472+ struct unix_file_info *uf_info;
38473+
38474+ assert("edward-1516", clust->pages[0]->index == 0);
38475+ assert("edward-1517", clust->hint != NULL);
38476+
38477+ /* release all cryptcompress-specific resources */
38478+ cr_info = cryptcompress_inode_data(inode);
38479+ result = reserve_cryptcompress2unixfile(inode);
38480+ if (result)
38481+ goto out;
38482+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
38483+ reiser4_unset_hint(clust->hint);
38484+ result = cut_disk_cluster(inode, 0);
38485+ if (result)
38486+ goto out;
38487+ /* captured jnode of cluster and associated resources (pages,
38488+ reserved disk space) were released by ->kill_hook() method
38489+ of the item plugin */
38490+
38491+ result = __cryptcompress2unixfile(file, inode);
38492+ if (result)
38493+ goto out;
38494+ /* At this point file is managed by unix file plugin */
38495+
38496+ uf_info = unix_file_inode_data(inode);
38497+
38498+ assert("edward-1518",
38499+ ergo(jprivate(clust->pages[0]),
38500+ !jnode_is_cluster_page(jprivate(clust->pages[0]))));
38501+ for (i = 0; i < clust->nr_pages; i++) {
38502+ assert("edward-1519", clust->pages[i]);
38503+ assert("edward-1520", PageUptodate(clust->pages[i]));
38504+
38505+ result = find_or_create_extent(clust->pages[i]);
38506+ if (result)
38507+ break;
38508+ }
38509+ if (!result) {
38510+ uf_info->container = UF_CONTAINER_EXTENTS;
38511+ complete_file_conversion(inode);
38512+ }
38513+ out:
38514+ all_grabbed2free();
38515+ if (result)
38516+ warning("edward-1453", "Failed to convert file %llu: ret=%i",
38517+ (unsigned long long)get_inode_oid(inode), result);
38518+ return result;
38519+}
38520+
38521+/* Check, then perform or disable conversion if needed */
38522+int write_conversion_hook(struct file * file, struct inode * inode, loff_t pos,
38523+ struct cluster_handle * clust, int * progress)
38524+{
38525+ int result;
38526+ int check_compress = 0;
38527+ int compressible = 0;
38528+
38529+ if (!conversion_enabled(inode))
38530+ return 0;
38531+ result = check_position(inode, pos, clust, &check_compress);
38532+ if (result || !check_compress)
38533+ return result;
38534+ result = read_check_compressibility(inode, clust, &compressible);
38535+ if (result)
38536+ return result;
38537+
38538+ /* At this point page cluster is grabbed and uptodate */
38539+ if (!compressible) {
38540+ result = cryptcompress2unixfile(file, inode, clust);
38541+ if (result == 0)
38542+ *progress = 1;
38543+ }
38544+ else
38545+ result = disable_conversion(inode);
38546+
38547+ reiser4_txn_restart_current();
38548+ put_page_cluster(clust, inode, READ_OP);
38549+ return result;
38550+}
38551+
38552+static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
38553+{
38554+ return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
38555+}
38556+
38557+/* Protected methods of cryptcompress file plugin constructed
38558+ by the macros above */
38559+
38560+/* Wrappers with active protection for:
38561+ . write_cryptcompress;
38562+ . setattr_cryptcompress;
38563+*/
38564+
38565+ssize_t prot_write_cryptcompress(struct file *file, const char __user *buf,
38566+ size_t count, loff_t *off)
38567+{
38568+ int prot = 0;
38569+ int conv = 0;
38570+ ssize_t written_cr = 0;
38571+ ssize_t written_uf = 0;
38572+ struct inode * inode = file->f_dentry->d_inode;
38573+ struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
38574+
38575+ if (should_protect(inode)) {
38576+ prot = 1;
38577+ down_write(guard);
38578+ }
38579+ written_cr = write_cryptcompress(file, buf, count, off, &conv);
38580+ if (prot)
38581+ up_write(guard);
38582+ if (written_cr < 0)
38583+ return written_cr;
38584+ if (conv)
38585+ written_uf = write_unix_file(file, buf + written_cr,
38586+ count - written_cr, off);
38587+ return written_cr + (written_uf < 0 ? 0 : written_uf);
38588+}
38589+
38590+int prot_setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
38591+{
38592+ struct inode * inode = dentry->d_inode;
38593+ return PROT_ACTIVE(int, setattr, (dentry, attr),
38594+ setattr_conversion_hook(inode, attr));
38595+}
38596+
38597+/* Wrappers with passive protection for:
38598+ . read_cryptcompress;
38599+ . mmap_cryptcompress;
38600+ . release_cryptcompress;
38601+ . sendfile_cryptcompress;
38602+ . delete_object_cryptcompress.
38603+*/
38604+ssize_t prot_read_cryptcompress(struct file * file, char __user * buf,
38605+ size_t size, loff_t * off)
38606+{
38607+ struct inode * inode = file->f_dentry->d_inode;
38608+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
38609+}
38610+
38611+int prot_mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
38612+{
38613+ struct inode *inode = file->f_dentry->d_inode;
38614+ return PROT_PASSIVE(int, mmap, (file, vma));
38615+}
38616+
38617+int prot_release_cryptcompress(struct inode *inode, struct file *file)
38618+{
38619+ return PROT_PASSIVE(int, release, (inode, file));
38620+}
38621+
38622+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos,
38623+ size_t count, read_actor_t actor,
38624+ void *target)
38625+{
38626+ struct inode * inode = file->f_dentry->d_inode;
38627+ return PROT_PASSIVE(ssize_t, sendfile,
38628+ (file, ppos, count, actor, target));
38629+}
38630+
38631+/*
38632+ Local variables:
38633+ c-indentation-style: "K&R"
38634+ mode-name: "LC"
38635+ c-basic-offset: 8
38636+ tab-width: 8
38637+ fill-column: 80
38638+ scroll-step: 1
38639+ End:
38640+*/
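
The conversion policy in file_conversion.c reduces to a single measurement made once per file: compress the first logical cluster, and keep the cryptcompress plugin only if the output shrinks below half of the input, per the fifty_persent()/data_is_compressible() macros above. A worked sketch of just that test; compress_buf() is a hypothetical stand-in for the cplug->compress() call:

    /* sketch only: non-zero if the first cluster compresses to less
     * than 50% of its original size */
    static int first_cluster_compressible(const char *in, size_t isize,
                                          char *out, size_t out_max)
    {
            size_t osize = out_max;

            compress_buf(in, isize, out, &osize);
            return osize < (isize >> 1); /* same rule as the macros */
    }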
38641diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/file.h linux-2.6.22/fs/reiser4/plugin/file/file.h
38642--- linux-2.6.22.orig/fs/reiser4/plugin/file/file.h 1970-01-01 03:00:00.000000000 +0300
38643+++ linux-2.6.22/fs/reiser4/plugin/file/file.h 2007-07-29 00:25:34.928709936 +0400
38644@@ -0,0 +1,272 @@
38645+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38646+ * reiser4/README */
38647+
38648+/* this file contains declarations of methods implementing
38649+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
38650+ and SYMLINK_FILE_PLUGIN_ID) */
38651+
38652+#if !defined( __REISER4_FILE_H__ )
38653+#define __REISER4_FILE_H__
38654+
38655+/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38656+
38657+/* inode operations */
38658+int setattr_unix_file(struct dentry *, struct iattr *);
38659+
38660+/* file operations */
38661+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38662+ loff_t *off);
38663+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38664+ loff_t * off);
38665+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38666+ unsigned long arg);
38667+int mmap_unix_file(struct file *, struct vm_area_struct *);
38668+int open_unix_file(struct inode *, struct file *);
38669+int release_unix_file(struct inode *, struct file *);
38670+int sync_unix_file(struct file *, struct dentry *, int datasync);
38671+ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38672+ read_actor_t, void *target);
38673+
38674+/* address space operations */
38675+int readpage_unix_file(struct file *, struct page *);
38676+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
38677+int writepages_unix_file(struct address_space *, struct writeback_control *);
38678+int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38679+ unsigned to);
38680+int commit_write_unix_file(struct file *, struct page *, unsigned from,
38681+ unsigned to);
38682+sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38683+
38684+/* file plugin operations */
38685+int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38686+ int user, loff_t, loff_t, rw_op, flow_t *);
38687+int owns_item_unix_file(const struct inode *, const coord_t *);
38688+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38689+ int create);
38690+int delete_object_unix_file(struct inode *);
38691+
38692+/*
38693+ * all the write into unix file is performed by item write method. Write method
38694+ * of unix file plugin only decides which item plugin (extent or tail) and in
38695+ * which mode (one from the enum below) to call
38696+ */
38697+typedef enum {
38698+ FIRST_ITEM = 1,
38699+ APPEND_ITEM = 2,
38700+ OVERWRITE_ITEM = 3
38701+} write_mode_t;
38702+
38703+/* unix file may be in one the following states */
38704+typedef enum {
38705+ UF_CONTAINER_UNKNOWN = 0,
38706+ UF_CONTAINER_TAILS = 1,
38707+ UF_CONTAINER_EXTENTS = 2,
38708+ UF_CONTAINER_EMPTY = 3
38709+} file_container_t;
38710+
38711+struct formatting_plugin;
38712+struct inode;
38713+
38714+/* unix file plugin specific part of reiser4 inode */
38715+struct unix_file_info {
38716+ /*
38717+ * this read-write lock protects file containerization change. Accesses
38718+ * which do not change file containerization (see file_container_t)
38719+ * (read, readpage, writepage, write (until tail conversion is
38720+ * involved)) take read-lock. Accesses which modify file
38721+ * containerization (truncate, conversion from tail to extent and back)
38722+ * take write-lock.
38723+ */
38724+ struct rw_semaphore latch;
44254afd
MT
38725+ /* this enum specifies which items are used to build the file */
38726+ file_container_t container;
38727+ /*
38728+ * plugin which controls when file is to be converted to extents and
38729+ * back to tail
38730+ */
38731+ struct formatting_plugin *tplug;
38732+ /* if this is set, file is in exclusive use */
38733+ int exclusive_use;
38734+#if REISER4_DEBUG
38735+ /* pointer to task struct of thread owning exclusive access to file */
38736+ void *ea_owner;
38737+ atomic_t nr_neas;
38738+ void *last_reader;
38739+#endif
38740+};
38741+
38742+struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38743+void get_exclusive_access(struct unix_file_info *);
38744+void drop_exclusive_access(struct unix_file_info *);
38745+void get_nonexclusive_access(struct unix_file_info *);
38746+void drop_nonexclusive_access(struct unix_file_info *);
38747+int try_to_get_nonexclusive_access(struct unix_file_info *);
38748+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38749+ struct inode *);
38750+int find_file_item_nohint(coord_t *, lock_handle *,
38751+ const reiser4_key *, znode_lock_mode,
38752+ struct inode *);
38753+
38754+int load_file_hint(struct file *, hint_t *);
38755+void save_file_hint(struct file *, const hint_t *);
38756+
38757+#include "../item/extent.h"
38758+#include "../item/tail.h"
38759+#include "../item/ctail.h"
38760+
38761+struct uf_coord {
38762+ coord_t coord;
38763+ lock_handle *lh;
38764+ int valid;
38765+ union {
38766+ struct extent_coord_extension extent;
38767+ struct tail_coord_extension tail;
38768+ struct ctail_coord_extension ctail;
38769+ } extension;
38770+};
38771+
38772+#include "../../forward.h"
38773+#include "../../seal.h"
38774+#include "../../lock.h"
38775+
38776+/*
38777+ * This structure is used to speed up file operations (reads and writes). A
38778+ * hint is a suggestion about where a key resolved to last time. A seal
38779+ * indicates whether a node has been modified since a hint was last recorded.
38780+ * You check the seal, and if the seal is still valid, you can use the hint
38781+ * without traversing the tree again.
38782+ */
38783+struct hint {
38784+ seal_t seal; /* a seal over last file item accessed */
38785+ uf_coord_t ext_coord;
38786+ loff_t offset;
38787+ znode_lock_mode mode;
38788+ lock_handle lh;
38789+};
38790+
38791+static inline int hint_is_valid(hint_t * hint)
38792+{
38793+ return hint->ext_coord.valid;
38794+}
38795+
38796+static inline void hint_set_valid(hint_t * hint)
38797+{
38798+ hint->ext_coord.valid = 1;
38799+}
38800+
38801+static inline void hint_clr_valid(hint_t * hint)
38802+{
38803+ hint->ext_coord.valid = 0;
38804+}
38805+
38806+int load_file_hint(struct file *, hint_t *);
38807+void save_file_hint(struct file *, const hint_t *);
38808+void hint_init_zero(hint_t *);
38809+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38810+int hint_is_set(const hint_t *);
38811+void reiser4_unset_hint(hint_t *);
38812+
38813+int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
38814+int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38815+ loff_t cur_size, int (*update_actor) (struct inode *,
38816+ reiser4_key *, int));
38817+#if REISER4_DEBUG
38818+
38819+/* return 1 if exclusive access is obtained, 0 otherwise */
38820+static inline int ea_obtained(struct unix_file_info * uf_info)
38821+{
38822+ int ret;
38823+
38824+ ret = down_read_trylock(&uf_info->latch);
38825+ if (ret)
38826+ up_read(&uf_info->latch);
38827+ return !ret;
38828+}
38829+
38830+#endif
38831+
38832+/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38833+int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
38834+ reiser4_object_create_data *);
38835+void destroy_inode_symlink(struct inode *);
38836+
38837+/* declarations of functions implementing CRYPTCOMPRESS_FILE_PLUGIN_ID
38838+ file plugin */
38839+
38840+/* inode operations */
38841+int setattr_cryptcompress(struct dentry *, struct iattr *);
38842+int prot_setattr_cryptcompress(struct dentry *, struct iattr *);
38843+
38844+/* file operations */
38845+ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38846+ loff_t * off);
38847+ssize_t prot_read_cryptcompress(struct file *, char __user *buf,
38848+ size_t read_amount, loff_t * off);
38849+
38850+ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38851+ loff_t * off, int * conv);
38852+ssize_t prot_write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38853+ loff_t * off);
38854+int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38855+int prot_mmap_cryptcompress(struct file *, struct vm_area_struct *);
38856+ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38857+ read_actor_t actor, void *target);
38858+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38859+ read_actor_t actor, void *target);
38860+
38861+int release_cryptcompress(struct inode *, struct file *);
38862+int prot_release_cryptcompress(struct inode *, struct file *);
38863+
38864+/* address space operations */
38865+extern int readpage_cryptcompress(struct file *, struct page *);
38866+extern int writepages_cryptcompress(struct address_space *,
38867+ struct writeback_control *);
38868+/* file plugin operations */
38869+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38870+ int user, loff_t, loff_t, rw_op, flow_t *);
38871+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38872+int create_cryptcompress(struct inode *, struct inode *,
38873+ reiser4_object_create_data *);
38874+int delete_object_cryptcompress(struct inode *);
38875+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38876+ int create);
38877+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38878+ const reiser4_key * to_key,
38879+ reiser4_key * smallest_removed,
38880+ struct inode *object, int truncate,
38881+ int *progress);
38882+void destroy_inode_cryptcompress(struct inode *);
38883+int open_object_cryptcompress(struct inode * inode, struct file * file);
38884+
38885+extern reiser4_plugin_ops cryptcompress_plugin_ops;
38886+
38887+#define WRITE_GRANULARITY 32
38888+
38889+int tail2extent(struct unix_file_info *);
38890+int extent2tail(struct unix_file_info *);
38891+
38892+int goto_right_neighbor(coord_t *, lock_handle *);
38893+int find_or_create_extent(struct page *);
38894+int equal_to_ldk(znode *, const reiser4_key *);
38895+
38896+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
38897+
38898+static inline int cbk_errored(int cbk_result)
38899+{
38900+ return (cbk_result != CBK_COORD_NOTFOUND
38901+ && cbk_result != CBK_COORD_FOUND);
38902+}
38903+
38904+/* __REISER4_FILE_H__ */
38905+#endif
38906+
38907+/*
38908+ * Local variables:
38909+ * c-indentation-style: "K&R"
38910+ * mode-name: "LC"
38911+ * c-basic-offset: 8
38912+ * tab-width: 8
38913+ * fill-column: 79
38914+ * scroll-step: 1
38915+ * End:
38916+*/
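
The struct hint comment above describes a seal-validated fast path: remember where a key resolved last time and, on the next lookup, check the seal instead of re-walking the tree. A sketch of the control flow; seal_still_valid() and tree_lookup() are hypothetical stand-ins for the real seal and cbk APIs:

    /* sketch only: reuse the cached coord when the seal still holds */
    static int lookup_with_hint(struct hint *hint, const reiser4_key *key,
                                coord_t *coord)
    {
            if (seal_still_valid(&hint->seal, key)) {
                    /* node unchanged since the hint was recorded:
                     * skip the tree traversal entirely */
                    *coord = hint->ext_coord.coord;
                    return 0;
            }
            /* slow path: full tree traversal; the caller re-seals */
            return tree_lookup(key, coord);
    }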
38917diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/Makefile linux-2.6.22/fs/reiser4/plugin/file/Makefile
38918--- linux-2.6.22.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 03:00:00.000000000 +0300
38919+++ linux-2.6.22/fs/reiser4/plugin/file/Makefile 2007-07-29 00:25:34.928709936 +0400
38920@@ -0,0 +1,7 @@
38921+obj-$(CONFIG_REISER4_FS) += file_plugins.o
38922+
38923+file_plugins-objs := \
38924+ file.o \
38925+ tail_conversion.o \
38926+ symlink.o \
38927+ cryptcompress.o
38928diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.22/fs/reiser4/plugin/file/symfile.c
38929--- linux-2.6.22.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 03:00:00.000000000 +0300
38930+++ linux-2.6.22/fs/reiser4/plugin/file/symfile.c 2007-07-29 00:25:34.928709936 +0400
38931@@ -0,0 +1,87 @@
38932+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38933+
38934+/* Symfiles are a generalization of Unix symlinks.
38935+
38936+ A symfile when read behaves as though you took its contents and
38937+ substituted them into the reiser4 naming system as the right hand side
38938+ of an assignment, and then read that which you had assigned to it.
38939+
38940+ A key issue for symfiles is how to implement writes through to
38941+ subfiles. In general, one must have some method of determining what
38942+ of that which is written to the symfile is written to what subfile.
38943+ This can be done by use of custom plugin methods written by users, or
38944+ by using a few general methods we provide for those willing to endure
38945+ the insertion of delimiters into what is read.
38946+
38947+ Writing to symfiles without delimiters to denote what is written to
38948+ what subfile is not supported by any plugins we provide in this
38949+ release. Our most sophisticated support for writes is that embodied
38950+ by the invert plugin (see invert.c).
38951+
38952+ A read only version of the /etc/passwd file might be
38953+ constructed as a symfile whose contents are as follows:
38954+
38955+ /etc/passwd/userlines/*
38956+
38957+ or
38958+
38959+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
38960+
38961+ or
38962+
38963+ /etc/passwd/userlines/(demidov+edward+reiser+root)
38964+
38965+ A symfile with contents
38966+
38967+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
38968+
38969+ will return when read
38970+
38971+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
38972+
38973+ and write of what has been read will not be possible to implement as
38974+ an identity operation because there are no delimiters denoting the
38975+ boundaries of what is to be written to what subfile.
38976+
38977+ Note that one could make this a read/write symfile if one specified
38978+ delimiters, and the write method understood that those delimiters mark
38979+ what is to be written to which subfile.
38980+
38981+ So, specifying the symfile in a manner that allows writes:
38982+
38983+ /etc/passwd/userlines/demidov+"(
38984+ )+/etc/passwd/userlines/edward+"(
38985+ )+/etc/passwd/userlines/reiser+"(
38986+ )+/etc/passwd/userlines/root+"(
38987+ )
38988+
38989+ or
38990+
38991+ /etc/passwd/userlines/(demidov+"(
38992+ )+edward+"(
38993+ )+reiser+"(
38994+ )+root+"(
38995+ ))
38996+
38997+ and the file demidov might be specified as:
38998+
38999+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39000+
39001+ or
39002+
39003+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39004+
39005+ Notice that if the file demidov has a carriage return in it, the
39006+ parsing fails, but then if you put carriage returns in the wrong place
39007+ in a normal /etc/passwd file it breaks things also.
39008+
39009+ Note that it is forbidden to have no text between two interpolations
39010+ if one wants to be able to define what parts of a write go to what
39011+ subfiles referenced in an interpolation.
39012+
39013+ If one wants to be able to add new lines by writing to the file, one
39014+ must either write a custom plugin for /etc/passwd that knows how to
39015+ name an added line, or one must use an invert, or one must use a more
39016+ sophisticated symfile syntax that we are not planning to write for
39017+ version 4.0.
39018+*/
39019diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.22/fs/reiser4/plugin/file/symlink.c
39020--- linux-2.6.22.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 03:00:00.000000000 +0300
39021+++ linux-2.6.22/fs/reiser4/plugin/file/symlink.c 2007-07-29 00:25:34.928709936 +0400
39022@@ -0,0 +1,95 @@
39023+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39024+
39025+#include "../../inode.h"
39026+
39027+#include <linux/types.h>
39028+#include <linux/fs.h>
39029+
39030+/* file plugin methods specific for symlink files
39031+ (SYMLINK_FILE_PLUGIN_ID) */
39032+
39033+/* this is implementation of create_object method of file plugin for
39034+ SYMLINK_FILE_PLUGIN_ID
39035+ */
39036+
39037+/**
39038+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39039+ * @symlink: inode of symlink object
39040+ * @dir: inode of parent directory
39041+ * @info: parameters of new object
39042+ *
39043+ * Inserts stat data with symlink extension into the tree.
39044+ */
39045+int reiser4_create_symlink(struct inode *symlink,
39046+ struct inode *dir UNUSED_ARG,
39047+ reiser4_object_create_data *data /* info passed to us
39048+ * this is filled by
39049+ * reiser4() syscall
39050+ * in particular */)
39051+{
39052+ int result;
39053+
39054+ assert("nikita-680", symlink != NULL);
39055+ assert("nikita-681", S_ISLNK(symlink->i_mode));
39056+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
39057+ assert("nikita-682", dir != NULL);
39058+ assert("nikita-684", data != NULL);
39059+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39060+
39061+ /*
39062+ * stat data of symlink has symlink extension in which we store
39063+ * symlink content, that is, path symlink is pointing to.
39064+ */
39065+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39066+
39067+ assert("vs-838", symlink->i_private == NULL);
39068+ symlink->i_private = (void *)data->name;
39069+
39070+ assert("vs-843", symlink->i_size == 0);
39071+ INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39072+
39073+ /* insert stat data appended with data->name */
39074+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39075+ if (result) {
39076+ /* FIXME-VS: Make sure that symlink->i_private is not attached
39077+ to kmalloced data */
39078+ INODE_SET_FIELD(symlink, i_size, 0);
39079+ } else {
39080+ assert("vs-849", symlink->i_private
39081+ && reiser4_inode_get_flag(symlink,
39082+ REISER4_GENERIC_PTR_USED));
39083+ assert("vs-850",
39084+ !memcmp((char *)symlink->i_private, data->name,
39085+ (size_t) symlink->i_size + 1));
39086+ }
39087+ return result;
39088+}
39089+
39090+/* this is implementation of destroy_inode method of file plugin for
39091+ SYMLINK_FILE_PLUGIN_ID
39092+ */
39093+void destroy_inode_symlink(struct inode *inode)
39094+{
39095+ assert("edward-799",
39096+ inode_file_plugin(inode) ==
39097+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39098+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39099+ assert("edward-801", reiser4_inode_get_flag(inode,
39100+ REISER4_GENERIC_PTR_USED));
39101+ assert("vs-839", S_ISLNK(inode->i_mode));
39102+
39103+ kfree(inode->i_private);
39104+ inode->i_private = NULL;
39105+ reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
44254afd
MT
39106+}
39107+
71430cf6
MT
39108+/*
39109+ Local variables:
39110+ c-indentation-style: "K&R"
39111+ mode-name: "LC"
39112+ c-basic-offset: 8
39113+ tab-width: 8
39114+ fill-column: 80
39115+ scroll-step: 1
39116+ End:
44254afd 39117+*/
71430cf6
MT
39118diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.22/fs/reiser4/plugin/file/tail_conversion.c
39119--- linux-2.6.22.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 03:00:00.000000000 +0300
39120+++ linux-2.6.22/fs/reiser4/plugin/file/tail_conversion.c 2007-07-29 00:25:34.932710971 +0400
39121@@ -0,0 +1,729 @@
44254afd
MT
39122+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39123+
39124+#include "../../inode.h"
39125+#include "../../super.h"
39126+#include "../../page_cache.h"
39127+#include "../../carry.h"
39128+#include "../../safe_link.h"
39129+#include "../../vfs_ops.h"
39130+
39131+#include <linux/writeback.h>
39132+
39133+/* this file contains:
39134+ tail2extent and extent2tail */
39135+
39136+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
71430cf6 39137+void get_exclusive_access(struct unix_file_info * uf_info)
44254afd 39138+{
71430cf6 39139+ assert("nikita-3028", reiser4_schedulable());
44254afd
MT
39140+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39141+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39142+ /*
39143+ * "deadlock avoidance": sometimes we commit a transaction under
39144+ * rw-semaphore on a file. Such commit can deadlock with another
39145+ * thread that captured some block (hence preventing atom from being
39146+ * committed) and waits on rw-semaphore.
39147+ */
71430cf6 39148+ reiser4_txn_restart_current();
44254afd
MT
39149+ LOCK_CNT_INC(inode_sem_w);
39150+ down_write(&uf_info->latch);
39151+ uf_info->exclusive_use = 1;
39152+ assert("vs-1713", uf_info->ea_owner == NULL);
39153+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39154+ ON_DEBUG(uf_info->ea_owner = current);
39155+}
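Editor's note (not part of the patch): a minimal sketch of why the transaction
is restarted before down_write() above. Without the restart, the converter
could sleep on the semaphore while still holding captured blocks, and the
semaphore's current holder could in turn wait for those blocks to commit:

	/*
	 * Thread A (converter)                 Thread B (writer)
	 * --------------------                 -----------------
	 * reiser4_txn_restart_current();       down_read(&uf_info->latch);
	 *   - releases captured blocks         ...captures blocks, must
	 * down_write(&uf_info->latch);         commit before dropping the
	 *   - sleeps until B is done           semaphore...
	 *
	 * A blocks on the semaphore while holding no captured blocks, so B's
	 * atom can always commit and release the semaphore: no deadlock.
	 */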
+
+void drop_exclusive_access(struct unix_file_info * uf_info)
+{
+	assert("vs-1714", uf_info->ea_owner == current);
+	assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
+	ON_DEBUG(uf_info->ea_owner = NULL);
+	uf_info->exclusive_use = 0;
+	up_write(&uf_info->latch);
+	assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
+	assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
+	LOCK_CNT_DEC(inode_sem_w);
+	reiser4_txn_restart_current();
+}
+
+/**
+ * nea_grabbed - do something when the file semaphore is down_read-ed
+ * @uf_info:
+ *
+ * This is called when nonexclusive access is obtained on a file. Everything
+ * it does is for debugging purposes.
+ */
+static void nea_grabbed(struct unix_file_info *uf_info)
+{
+#if REISER4_DEBUG
+	LOCK_CNT_INC(inode_sem_r);
+	assert("vs-1716", uf_info->ea_owner == NULL);
+	atomic_inc(&uf_info->nr_neas);
+	uf_info->last_reader = current;
+#endif
+}
+
+/**
+ * get_nonexclusive_access - get nonexclusive access to a file
+ * @uf_info: unix file specific part of inode to obtain access to
+ *
+ * Nonexclusive access is obtained on a file before read, write, readpage.
+ */
+void get_nonexclusive_access(struct unix_file_info *uf_info)
+{
+	assert("nikita-3029", reiser4_schedulable());
+	assert("nikita-3361", get_current_context()->trans->atom == NULL);
+
+	down_read(&uf_info->latch);
+	nea_grabbed(uf_info);
+}
+
+/**
+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
+ * @uf_info: unix file specific part of inode to obtain access to
+ *
+ * Non-blocking version of nonexclusive access obtaining.
+ */
+int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
+{
+	int result;
+
+	result = down_read_trylock(&uf_info->latch);
+	if (result)
+		nea_grabbed(uf_info);
+	return result;
+}
+
+void drop_nonexclusive_access(struct unix_file_info * uf_info)
+{
+	assert("vs-1718", uf_info->ea_owner == NULL);
+	assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
+	ON_DEBUG(atomic_dec(&uf_info->nr_neas));
+
+	up_read(&uf_info->latch);
+
+	LOCK_CNT_DEC(inode_sem_r);
+	reiser4_txn_restart_current();
+}
+
+/* part of tail2extent. Cut all items covering @count bytes starting from
+   @offset */
+/* Audited by: green(2002.06.15) */
+static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
+{
+	reiser4_key from, to;
+
+	/* AUDIT: How about putting an assertion here that would check that
+	   the whole provided range is covered by tail items only? */
+	/* key of first byte in the range to be cut */
+	inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
+
+	/* key of last byte in that range */
+	to = from;
+	set_key_offset(&to, (__u64) (offset + count - 1));
+
+	/* cut everything between those keys */
+	return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
+				inode, 0);
+}
+
+static void release_all_pages(struct page **pages, unsigned nr_pages)
+{
+	unsigned i;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i] == NULL) {
+			unsigned j;
+			for (j = i + 1; j < nr_pages; j++)
+				assert("vs-1620", pages[j] == NULL);
+			break;
+		}
+		page_cache_release(pages[i]);
+		pages[i] = NULL;
+	}
+}
+
+/* part of tail2extent. Replace tail items with an extent one. The content of
+   the tail items (@count bytes) being cut has already been copied into the
+   pages. The extent_writepage method is called to create extents
+   corresponding to those pages */
+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
+{
+	int result;
+	unsigned i;
+	STORE_COUNTERS;
+
+	if (nr_pages == 0)
+		return 0;
+
+	assert("vs-596", pages[0]);
+
+	/* cut copied items */
+	result = cut_formatting_items(inode, page_offset(pages[0]), count);
+	if (result)
+		return result;
+
+	CHECK_COUNTERS;
+
+	/* put into the tree a replacement for the just removed items: an extent item, namely */
+	for (i = 0; i < nr_pages; i++) {
+		result = add_to_page_cache_lru(pages[i], inode->i_mapping,
+					       pages[i]->index,
+					       mapping_gfp_mask(inode->
+								i_mapping));
+		if (result)
+			break;
+		unlock_page(pages[i]);
+		result = find_or_create_extent(pages[i]);
+		if (result)
+			break;
+		SetPageUptodate(pages[i]);
+	}
+	return result;
+}
+
+#define TAIL2EXTENT_PAGE_NUM 3	/* number of pages to fill before cutting tail
+				 * items */
+
+static int reserve_tail2extent_iteration(struct inode *inode)
+{
+	reiser4_block_nr unformatted_nodes;
+	reiser4_tree *tree;
+
+	tree = reiser4_tree_by_inode(inode);
+
+	/* number of unformatted nodes which will be created */
+	unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
+
+	/*
+	 * space required for one iteration of tail->extent conversion:
+	 *
+	 * 1. kill N tail items
+	 *
+	 * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
+	 *
+	 * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
+	 * extents) extent units.
+	 *
+	 * 4. drilling to the leaf level by coord_by_key()
+	 *
+	 * 5. possible update of stat-data
+	 *
+	 */
+	grab_space_enable();
+	return reiser4_grab_space
+	    (2 * tree->height +
+	     TAIL2EXTENT_PAGE_NUM +
+	     TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
+	     1 + estimate_one_insert_item(tree) +
+	     inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
+}
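Editor's example (not part of the patch): the reservation above with concrete,
hypothetical numbers. For a tree of height 4, and assuming the estimate
helpers return estimate_one_insert_into_item() = 5, estimate_one_insert_item()
= 5 and estimate.update() = 1 blocks:

	/*
	 * 2 * 4    twice the tree height for drilling      =  8
	 * 3        TAIL2EXTENT_PAGE_NUM unformatted nodes  =  3
	 * 3 * 5    worst-case extent unit insertions       = 15
	 * 1 + 5    one item insertion                      =  6
	 * 1        stat-data update                        =  1
	 *                                           total  = 33 blocks
	 */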
+
+/* clear the stat-data flag indicating that a conversion is in progress */
+static int complete_conversion(struct inode *inode)
+{
+	int result;
+
+	grab_space_enable();
+	result =
+	    reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
+			       BA_CAN_COMMIT);
+	if (result == 0) {
+		reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
+		result = reiser4_update_sd(inode);
+	}
+	if (result)
+		warning("vs-1696", "Failed to clear converting bit of %llu: %i",
+			(unsigned long long)get_inode_oid(inode), result);
+	return 0;
+}
+
+/**
+ * find_start
+ * @inode:
+ * @id:
+ * @offset:
+ *
+ * this is used by tail2extent and extent2tail to detect where a previous
+ * unfinished conversion stopped
+ */
+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
+{
+	int result;
+	lock_handle lh;
+	coord_t coord;
+	struct unix_file_info *ufo;
+	int found;
+	reiser4_key key;
+
+	ufo = unix_file_inode_data(inode);
+	init_lh(&lh);
+	result = 0;
+	found = 0;
+	inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
+	do {
+		init_lh(&lh);
+		result = find_file_item_nohint(&coord, &lh, &key,
+					       ZNODE_READ_LOCK, inode);
+
+		if (result == CBK_COORD_FOUND) {
+			if (coord.between == AT_UNIT) {
+				/*coord_clear_iplug(&coord); */
+				result = zload(coord.node);
+				if (result == 0) {
+					if (item_id_by_coord(&coord) == id)
+						found = 1;
+					else
+						item_plugin_by_coord(&coord)->s.
+						    file.append_key(&coord,
+								    &key);
+					zrelse(coord.node);
+				}
+			} else
+				result = RETERR(-ENOENT);
+		}
+		done_lh(&lh);
+	} while (result == 0 && !found);
+	*offset = get_key_offset(&key);
+	return result;
+}
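Editor's note: a condensed sketch of the calling convention of find_start(),
as used by tail2extent() below and extent2tail() further down (illustrative
only, reusing the names from the patch):

	__u64 offset = 0;
	if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
		/* an interrupted conversion: resume after the last item of
		 * the kind being converted away from */
		result = find_start(inode, FORMATTING_ID, &offset);
		if (result == -ENOENT) {
			/* no old items left: the previous run had actually
			 * finished, only the flag was never cleared */
			uf_info->container = UF_CONTAINER_EXTENTS;
			complete_conversion(inode);
		}
	}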
+
+/**
+ * tail2extent
+ * @uf_info:
+ *
+ *
+ */
+int tail2extent(struct unix_file_info *uf_info)
+{
+	int result;
+	reiser4_key key;	/* key of next byte to be moved to page */
+	char *p_data;		/* data of page */
+	unsigned page_off = 0,	/* offset within the page where to copy data */
+	    count;		/* number of bytes of item which can be
+				 * copied to page */
+	struct page *pages[TAIL2EXTENT_PAGE_NUM];
+	struct page *page;
+	int done;		/* set to 1 when the whole file is read */
+	char *item;
+	int i;
+	struct inode *inode;
+	int first_iteration;
+	int bytes;
+	__u64 offset;
+
+	assert("nikita-3362", ea_obtained(uf_info));
+	inode = unix_file_info_to_inode(uf_info);
+	assert("nikita-3412", !IS_RDONLY(inode));
+	assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
+	assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
+
+	offset = 0;
+	first_iteration = 1;
+	result = 0;
+	if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
+		/*
+		 * file is marked on disk as there was a conversion which did
+		 * not complete due to either a crash or some error. Find which
+		 * offset the conversion stopped at
+		 */
+		result = find_start(inode, FORMATTING_ID, &offset);
+		if (result == -ENOENT) {
+			/* no tail items found, everything is converted */
+			uf_info->container = UF_CONTAINER_EXTENTS;
+			complete_conversion(inode);
+			return 0;
+		} else if (result != 0)
+			/* some other error */
+			return result;
+		first_iteration = 0;
+	}
+
+	reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
+
+	/* get key of first byte of the file */
+	inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
+
+	done = 0;
+	while (done == 0) {
+		memset(pages, 0, sizeof(pages));
+		result = reserve_tail2extent_iteration(inode);
+		if (result != 0)
+			goto out;
+		if (first_iteration) {
+			reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
+			reiser4_update_sd(inode);
+			first_iteration = 0;
+		}
+		bytes = 0;
+		for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
+			assert("vs-598",
+			       (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
+			page = alloc_page(reiser4_ctx_gfp_mask_get());
+			if (!page) {
+				result = RETERR(-ENOMEM);
+				goto error;
+			}
+
+			page->index =
+			    (unsigned long)(get_key_offset(&key) >>
+					    PAGE_CACHE_SHIFT);
+			/*
+			 * usually when one is going to longterm lock a znode
+			 * (as find_file_item does, for instance) one must not
+			 * hold locked pages. However, there is an exception
+			 * for the tail2extent case. Pages appearing here are
+			 * not reachable by anyone else, they are clean, they
+			 * do not have jnodes attached, so keeping them locked
+			 * does not risk a deadlock
+			 */
+			assert("vs-983", !PagePrivate(page));
+			reiser4_invalidate_pages(inode->i_mapping, page->index,
+						 1, 0);
+
+			for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
+				coord_t coord;
+				lock_handle lh;
+
+				/* get next item */
+				/* FIXME: we might want to readahead here */
+				init_lh(&lh);
+				result =
+				    find_file_item_nohint(&coord, &lh, &key,
+							  ZNODE_READ_LOCK,
+							  inode);
+				if (result != CBK_COORD_FOUND) {
+					/*
+					 * either an error happened or no
+					 * items of the file were found
+					 */
+					done_lh(&lh);
+					page_cache_release(page);
+					goto error;
+				}
+
+				if (coord.between == AFTER_UNIT) {
+					/*
+					 * end of file is reached. Pad the page
+					 * with zeros
+					 */
+					done_lh(&lh);
+					done = 1;
+					p_data = kmap_atomic(page, KM_USER0);
+					memset(p_data + page_off, 0,
+					       PAGE_CACHE_SIZE - page_off);
+					kunmap_atomic(p_data, KM_USER0);
+					break;
+				}
+
+				result = zload(coord.node);
+				if (result) {
+					page_cache_release(page);
+					done_lh(&lh);
+					goto error;
+				}
+				assert("vs-856", coord.between == AT_UNIT);
+				item = ((char *)item_body_by_coord(&coord)) +
+				    coord.unit_pos;
+
+				/* how many bytes to copy */
+				count =
+				    item_length_by_coord(&coord) -
+				    coord.unit_pos;
+				/* limit length of copy to end of page */
+				if (count > PAGE_CACHE_SIZE - page_off)
+					count = PAGE_CACHE_SIZE - page_off;
+
+				/*
+				 * copy item (as much as will fit starting from
+				 * the beginning of the item) into the page
+				 */
+				p_data = kmap_atomic(page, KM_USER0);
+				memcpy(p_data + page_off, item, count);
+				kunmap_atomic(p_data, KM_USER0);
+
+				page_off += count;
+				bytes += count;
+				set_key_offset(&key,
+					       get_key_offset(&key) + count);
+
+				zrelse(coord.node);
+				done_lh(&lh);
+			}	/* end of loop which fills one page with the
+				 * content of formatting items */
+
+			if (page_off) {
+				/* something was copied into the page */
+				pages[i] = page;
+			} else {
+				page_cache_release(page);
+				assert("vs-1648", done == 1);
+				break;
+			}
+		}		/* end of loop through pages of one conversion iteration */
+
+		if (i > 0) {
+			result = replace(inode, pages, i, bytes);
+			release_all_pages(pages, sizeof_array(pages));
+			if (result)
+				goto error;
+			/*
+			 * We have to drop exclusive access to avoid a deadlock
+			 * which may happen because capture_unix_file, called
+			 * by reiser4_writepages, requires non-exclusive
+			 * access to the file. It is safe to drop EA in the
+			 * middle of tail2extent conversion because
+			 * write_unix_file, setattr_unix_file(truncate),
+			 * mmap_unix_file and release_unix_file(extent2tail)
+			 * check whether a conversion is in progress (see the
+			 * comments before get_exclusive_access_careful()).
+			 * Other processes that acquire non-exclusive access
+			 * (read_unix_file, reiser4_writepages, etc) should
+			 * work on partially converted files.
+			 */
+			drop_exclusive_access(uf_info);
+			/* throttle the conversion */
+			reiser4_throttle_write(inode);
+			get_exclusive_access(uf_info);
+
+			/*
+			 * nobody is allowed to complete the conversion but the
+			 * process which started it
+			 */
+			assert("", reiser4_inode_get_flag(inode,
+							  REISER4_PART_MIXED));
+		}
+	}
+
+	reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
+
+	if (result == 0) {
+		/* file is converted to extent items */
+		assert("vs-1697", reiser4_inode_get_flag(inode,
+							 REISER4_PART_MIXED));
+
+		uf_info->container = UF_CONTAINER_EXTENTS;
+		complete_conversion(inode);
+	} else {
+		/*
+		 * conversion is not complete. Inode was already marked as
+		 * REISER4_PART_MIXED and stat-data were updated at the first
+		 * iteration of the loop above.
+		 */
+	      error:
+		release_all_pages(pages, sizeof_array(pages));
+		warning("nikita-2282", "Partial conversion of %llu: %i",
+			(unsigned long long)get_inode_oid(inode), result);
+	}
+
+      out:
+	return result;
+}
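Editor's note: the heart of the loop above is the drop/reacquire sequence,
which lets writeback make progress on the half-converted file (same calls as
in the code above, condensed):

	drop_exclusive_access(uf_info);	/* reiser4_writepages may now take
					 * non-exclusive access */
	reiser4_throttle_write(inode);	/* balance dirty pages, may commit */
	get_exclusive_access(uf_info);	/* REISER4_PART_MIXED is still set, so
					 * only this process may complete the
					 * conversion */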
+
+static int reserve_extent2tail_iteration(struct inode *inode)
+{
+	reiser4_tree *tree;
+
+	tree = reiser4_tree_by_inode(inode);
+	/*
+	 * reserve blocks for (in this order):
+	 *
+	 * 1. removal of the extent item
+	 *
+	 * 2. insertion of the tail by insert_flow()
+	 *
+	 * 3. drilling to the leaf level by coord_by_key()
+	 *
+	 * 4. possible update of stat-data
+	 */
+	grab_space_enable();
+	return reiser4_grab_space
+	    (estimate_one_item_removal(tree) +
+	     estimate_insert_flow(tree->height) +
+	     1 + estimate_one_insert_item(tree) +
+	     inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
+}
+
+/* for every page of the file: read the page, cut the part of the extent
+   pointing to this page, put the data of the page into the tree as tail items */
+int extent2tail(struct unix_file_info *uf_info)
+{
+	int result;
+	struct inode *inode;
+	struct page *page;
+	unsigned long num_pages, i;
+	unsigned long start_page;
+	reiser4_key from;
+	reiser4_key to;
+	unsigned count;
+	__u64 offset;
+
+	assert("nikita-3362", ea_obtained(uf_info));
+	inode = unix_file_info_to_inode(uf_info);
+	assert("nikita-3412", !IS_RDONLY(inode));
+	assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
+	assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
+
+	offset = 0;
+	if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
+		/*
+		 * file is marked on disk as there was a conversion which did
+		 * not complete due to either a crash or some error. Find which
+		 * offset the conversion stopped at
+		 */
+		result = find_start(inode, EXTENT_POINTER_ID, &offset);
+		if (result == -ENOENT) {
+			/* no extent found, everything is converted */
+			uf_info->container = UF_CONTAINER_TAILS;
+			complete_conversion(inode);
+			return 0;
+		} else if (result != 0)
+			/* some other error */
+			return result;
+	}
+
+	reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
+
+	/* number of pages in the file */
+	num_pages =
+	    (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	start_page = offset >> PAGE_CACHE_SHIFT;
+
+	inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
+	to = from;
+
+	result = 0;
+	for (i = 0; i < num_pages; i++) {
+		__u64 start_byte;
+
+		result = reserve_extent2tail_iteration(inode);
+		if (result != 0)
+			break;
+		if (i == 0 && offset == 0) {
+			reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
+			reiser4_update_sd(inode);
+		}
+
+		page = read_mapping_page(inode->i_mapping,
+					 (unsigned)(i + start_page), NULL);
+		if (IS_ERR(page)) {
+			result = PTR_ERR(page);
+			break;
+		}
+
+		wait_on_page_locked(page);
+
+		if (!PageUptodate(page)) {
+			page_cache_release(page);
+			result = RETERR(-EIO);
+			break;
+		}
+
+		/* cut the part of the file we have read */
+		start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
+		set_key_offset(&from, start_byte);
+		set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
+		/*
+		 * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
+		 * commits during over-long truncates. But the
+		 * extent->tail conversion should be performed in one
+		 * transaction.
+		 */
+		result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
+					  &to, inode, 0);
+
+		if (result) {
+			page_cache_release(page);
+			break;
+		}
+
+		/* put the page data into the tree via tail_write */
+		count = PAGE_CACHE_SIZE;
+		if ((i == (num_pages - 1)) &&
+		    (inode->i_size & ~PAGE_CACHE_MASK))
+			/* the last page can be incomplete */
+			count = (inode->i_size & ~PAGE_CACHE_MASK);
+		while (count) {
+			struct dentry dentry;
+			struct file file;
+			loff_t pos;
+
+			dentry.d_inode = inode;
+			file.f_dentry = &dentry;
+			file.private_data = NULL;
+			file.f_pos = start_byte;
+			file.private_data = NULL;
+			pos = start_byte;
+			result = reiser4_write_tail(&file,
+						    (char __user *)kmap(page),
+						    count, &pos);
+			reiser4_free_file_fsdata(&file);
+			if (result <= 0) {
+				warning("", "reiser4_write_tail failed");
+				page_cache_release(page);
+				reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
+				return result;
+			}
+			count -= result;
+		}
+
+		/* release page */
+		lock_page(page);
+		/* page is already detached from jnode and mapping. */
+		assert("vs-1086", page->mapping == NULL);
+		assert("nikita-2690",
+		       (!PagePrivate(page) && jprivate(page) == 0));
+		/* waiting for writeback completion with the page lock held is
+		 * perfectly valid. */
+		wait_on_page_writeback(page);
+		reiser4_drop_page(page);
+		/* release the reference taken by read_cache_page() above */
+		page_cache_release(page);
+
+		drop_exclusive_access(uf_info);
+		/* throttle the conversion */
+		reiser4_throttle_write(inode);
+		get_exclusive_access(uf_info);
+		/*
+		 * nobody is allowed to complete the conversion but the process
+		 * which started it
+		 */
+		assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
+	}
+
+	reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
+
+	if (i == num_pages) {
+		/* file is converted to formatted items */
+		assert("vs-1698", reiser4_inode_get_flag(inode,
+							 REISER4_PART_MIXED));
+		assert("vs-1260",
+		       inode_has_no_jnodes(reiser4_inode_data(inode)));
+
+		uf_info->container = UF_CONTAINER_TAILS;
+		complete_conversion(inode);
+		return 0;
+	}
+	/*
+	 * conversion is not complete. Inode was already marked as
+	 * REISER4_PART_MIXED and stat-data were updated at the first
+	 * iteration of the loop above.
+	 */
+	warning("nikita-2282",
+		"Partial conversion of %llu: %lu of %lu: %i",
+		(unsigned long long)get_inode_oid(inode), i,
+		num_pages, result);
+
+	return result;
+}
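Editor's summary of the two conversion flags used by tail2extent() and
extent2tail() above (descriptive only):

	/*
	 * REISER4_PART_MIXED   - persistent: written to the stat-data at the
	 *                        first iteration and cleared by
	 *                        complete_conversion(); after a crash it tells
	 *                        the next opener that items of both kinds may
	 *                        coexist in the file.
	 * REISER4_PART_IN_CONV - in-memory only: held for the duration of one
	 *                        conversion call so concurrent entry points
	 *                        can detect a conversion in flight.
	 */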
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file_ops.c linux-2.6.22/fs/reiser4/plugin/file_ops.c
--- linux-2.6.22.orig/fs/reiser4/plugin/file_ops.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/plugin/file_ops.c	2007-07-29 00:25:34.932710971 +0400
@@ -0,0 +1,168 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+   reiser4/README */
+
+/* this file contains typical implementations for some of the methods of
+   struct file_operations and of struct address_space_operations
+*/
+
+#include "../inode.h"
+#include "object.h"
+
+/* file operations */
+
+/* the implementation of vfs's llseek method of struct file_operations for a
+   typical directory can be found in file_ops_readdir.c
+*/
+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
+
+/* the implementation of vfs's readdir method of struct file_operations for a
+   typical directory can be found in file_ops_readdir.c
+*/
+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
+
+/**
+ * reiser4_release_dir_common - release method of struct file_operations
+ * @inode: inode of released file
+ * @file: file to release
+ *
+ * Implementation of the release method of struct file_operations for a
+ * typical directory. All it does is free the reiser4-specific file data.
+*/
+int reiser4_release_dir_common(struct inode *inode, struct file *file)
+{
+	reiser4_context *ctx;
+
+	ctx = reiser4_init_context(inode->i_sb);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	reiser4_free_file_fsdata(file);
+	reiser4_exit_context(ctx);
+	return 0;
+}
+
+/* this is the common implementation of vfs's fsync method of struct
+   file_operations
+*/
+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
+{
+	reiser4_context *ctx;
+	int result;
+
+	ctx = reiser4_init_context(dentry->d_inode->i_sb);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
+
+	context_set_commit_async(ctx);
+	reiser4_exit_context(ctx);
+	return result;
+}
+
+/* this is the common implementation of vfs's sendfile method of struct
+   file_operations
+
+   Reads @count bytes from @file and calls @actor for every page read. This is
+   needed for loopback device support.
+*/
+#if 0
+ssize_t
+sendfile_common(struct file *file, loff_t *ppos, size_t count,
+		read_actor_t actor, void *target)
+{
+	reiser4_context *ctx;
+	ssize_t result;
+
+	ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	result = generic_file_sendfile(file, ppos, count, actor, target);
+	reiser4_exit_context(ctx);
+	return result;
+}
+#endif /* 0 */
+
+/* address space operations */
+
+/* this is the common implementation of vfs's prepare_write method of struct
+   address_space_operations
+*/
+int
+prepare_write_common(struct file *file, struct page *page, unsigned from,
+		     unsigned to)
+{
+	reiser4_context *ctx;
+	int result;
+
+	ctx = reiser4_init_context(page->mapping->host->i_sb);
+	result = do_prepare_write(file, page, from, to);
+
+	/* don't commit the transaction under the inode semaphore */
+	context_set_commit_async(ctx);
+	reiser4_exit_context(ctx);
+
+	return result;
+}
+
+/* this is a helper for prepare_write_common and prepare_write_unix_file
+ */
+int
+do_prepare_write(struct file *file, struct page *page, unsigned from,
+		 unsigned to)
+{
+	int result;
+	file_plugin *fplug;
+	struct inode *inode;
+
+	assert("umka-3099", file != NULL);
+	assert("umka-3100", page != NULL);
+	assert("umka-3095", PageLocked(page));
+
+	if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
+		return 0;
+
+	inode = page->mapping->host;
+	fplug = inode_file_plugin(inode);
+
+	if (page->mapping->a_ops->readpage == NULL)
+		return RETERR(-EINVAL);
+
+	result = page->mapping->a_ops->readpage(file, page);
+	if (result != 0) {
+		SetPageError(page);
+		ClearPageUptodate(page);
+		/* All reiser4 readpage() implementations should return the
+		 * page locked in case of error. */
+		assert("nikita-3472", PageLocked(page));
+	} else {
+		/*
+		 * ->readpage() either:
+		 *
+		 * 1. starts IO against @page. @page is locked for IO in
+		 * this case.
+		 *
+		 * 2. doesn't start IO. @page is unlocked.
+		 *
+		 * In either case, the page must be locked again.
+		 */
+		lock_page(page);
+		/*
+		 * IO (if any) is completed at this point. Check for IO
+		 * errors.
+		 */
+		if (!PageUptodate(page))
+			result = RETERR(-EIO);
+	}
+	assert("umka-3098", PageLocked(page));
+	return result;
+}
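Editor's note: the ->readpage() contract that do_prepare_write() depends on,
reduced to its skeleton (identical calls to the code above):

	result = page->mapping->a_ops->readpage(file, page);
	if (result != 0) {
		/* error path: reiser4 readpage() returns the page locked */
	} else {
		lock_page(page);	/* blocks until any started IO completes */
		if (!PageUptodate(page))
			result = RETERR(-EIO);
	}
	/* on every path the page is locked again at this point */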
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.22/fs/reiser4/plugin/file_ops_readdir.c
--- linux-2.6.22.orig/fs/reiser4/plugin/file_ops_readdir.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/plugin/file_ops_readdir.c	2007-07-29 00:25:34.932710971 +0400
@@ -0,0 +1,658 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "../inode.h"
+
+/* return true iff @coord points to a valid directory item that is part of
+ * the @inode directory. */
+static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
+{
+	return plugin_of_group(item_plugin_by_coord(coord),
+			       DIR_ENTRY_ITEM_TYPE) &&
+	       inode_file_plugin(inode)->owns_item(inode, coord);
+}
+
+/* compare two logical positions within the same directory */
+static cmp_t dir_pos_cmp(const struct dir_pos * p1, const struct dir_pos * p2)
+{
+	cmp_t result;
+
+	assert("nikita-2534", p1 != NULL);
+	assert("nikita-2535", p2 != NULL);
+
+	result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
+	if (result == EQUAL_TO) {
+		int diff;
+
+		diff = p1->pos - p2->pos;
+		result =
+		    (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
+	}
+	return result;
+}
+
+/* see the comment before reiser4_readdir_common() for an overview of why
+ * "adjustment" is necessary. */
+static void
+adjust_dir_pos(struct file *dir, struct readdir_pos * readdir_spot,
+	       const struct dir_pos * mod_point, int adj)
+{
+	struct dir_pos *pos;
+
+	/*
+	 * a new directory entry was added (adj == +1) or removed (adj == -1)
+	 * at the @mod_point. Directory file descriptor @dir is doing readdir
+	 * and is currently positioned at @readdir_spot. The latter has to be
+	 * updated to maintain stable readdir.
+	 */
+	/* directory is positioned at the beginning. */
+	if (readdir_spot->entry_no == 0)
+		return;
+
+	pos = &readdir_spot->position;
+	switch (dir_pos_cmp(mod_point, pos)) {
+	case LESS_THAN:
+		/* @mod_point is _before_ @readdir_spot, that is, an entry was
+		 * added/removed on the left (in key order) of the current
+		 * position. */
+		/* the logical number of the directory entry readdir is
+		 * "looking" at changes */
+		readdir_spot->entry_no += adj;
+		assert("nikita-2577",
+		       ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
+		if (de_id_cmp(&pos->dir_entry_key,
+			      &mod_point->dir_entry_key) == EQUAL_TO) {
+			assert("nikita-2575", mod_point->pos < pos->pos);
+			/*
+			 * if the entry added/removed has the same key as the
+			 * current one for readdir, update the counter of
+			 * duplicate keys in @readdir_spot.
+			 */
+			pos->pos += adj;
+		}
+		break;
+	case GREATER_THAN:
+		/* directory is modified after @pos: nothing to do. */
+		break;
+	case EQUAL_TO:
+		/* cannot insert an entry readdir is looking at, because it
+		   already exists. */
+		assert("nikita-2576", adj < 0);
+		/* the directory entry which @pos points to is being
+		   removed.
+
+		   NOTE-NIKITA: The right thing to do is to update @pos to
+		   point to the next entry. This is complex (we are under
+		   spin-lock for one thing). Just rewind it to the beginning.
+		   The next readdir will have to scan the beginning of the
+		   directory. The proper solution is to use a semaphore in
+		   the spin lock's stead and use rewind_right() here.
+
+		   NOTE-NIKITA: now, a semaphore is used, so...
+		 */
+		memset(readdir_spot, 0, sizeof *readdir_spot);
+	}
+}
+
+/* scan all file descriptors for this directory and adjust their
+   positions respectively. Should be used by implementations of
+   add_entry and rem_entry of the dir plugin */
+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
+			     int offset, int adj)
+{
+	reiser4_file_fsdata *scan;
+	struct dir_pos mod_point;
+
+	assert("nikita-2536", dir != NULL);
+	assert("nikita-2538", de != NULL);
+	assert("nikita-2539", adj != 0);
+
+	build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
+	mod_point.pos = offset;
+
+	spin_lock_inode(dir);
+
+	/*
+	 * a new entry was added/removed in directory @dir. Scan all file
+	 * descriptors for @dir that are currently involved in readdir and
+	 * update them.
+	 */
+
+	list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
+		adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
+
+	spin_unlock_inode(dir);
+}
+
+/*
+ * traverse the tree to start/continue readdir from the readdir position @pos.
+ */
+static int dir_go_to(struct file *dir, struct readdir_pos * pos, tap_t * tap)
+{
+	reiser4_key key;
+	int result;
+	struct inode *inode;
+
+	assert("nikita-2554", pos != NULL);
+
+	inode = dir->f_dentry->d_inode;
+	result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
+	if (result != 0)
+		return result;
+	result = reiser4_object_lookup(inode,
+				       &key,
+				       tap->coord,
+				       tap->lh,
+				       tap->mode,
+				       FIND_EXACT,
+				       LEAF_LEVEL, LEAF_LEVEL,
+				       0, &tap->ra_info);
+	if (result == CBK_COORD_FOUND)
+		result = rewind_right(tap, (int)pos->position.pos);
+	else {
+		tap->coord->node = NULL;
+		done_lh(tap->lh);
+		result = RETERR(-EIO);
+	}
+	return result;
+}
+
+/*
+ * handling of non-unique keys: calculate at what ordinal position within
+ * the sequence of directory items with identical keys @pos is.
+ */
+static int set_pos(struct inode *inode, struct readdir_pos * pos, tap_t * tap)
+{
+	int result;
+	coord_t coord;
+	lock_handle lh;
+	tap_t scan;
+	de_id *did;
+	reiser4_key de_key;
+
+	coord_init_zero(&coord);
+	init_lh(&lh);
+	reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
+	reiser4_tap_copy(&scan, tap);
+	reiser4_tap_load(&scan);
+	pos->position.pos = 0;
+
+	did = &pos->position.dir_entry_key;
+
+	if (is_valid_dir_coord(inode, scan.coord)) {
+
+		build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
+
+		while (1) {
+
+			result = go_prev_unit(&scan);
+			if (result != 0)
+				break;
+
+			if (!is_valid_dir_coord(inode, scan.coord)) {
+				result = -EINVAL;
+				break;
+			}
+
+			/* get key of directory entry */
+			unit_key_by_coord(scan.coord, &de_key);
+			if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
+				/* duplicate-sequence is over */
+				break;
+			}
+			pos->position.pos++;
+		}
+	} else
+		result = RETERR(-ENOENT);
+	reiser4_tap_relse(&scan);
+	reiser4_tap_done(&scan);
+	return result;
+}
+
+/*
+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
+ */
+static int dir_rewind(struct file *dir, struct readdir_pos * pos, tap_t * tap)
+{
+	__u64 destination;
+	__s64 shift;
+	int result;
+	struct inode *inode;
+	loff_t dirpos;
+
+	assert("nikita-2553", dir != NULL);
+	assert("nikita-2548", pos != NULL);
+	assert("nikita-2551", tap->coord != NULL);
+	assert("nikita-2552", tap->lh != NULL);
+
+	dirpos = reiser4_get_dir_fpos(dir);
+	shift = dirpos - pos->fpos;
+	/* this is the logical directory entry within @dir which we are
+	 * rewinding to */
+	destination = pos->entry_no + shift;
+
+	inode = dir->f_dentry->d_inode;
+	if (dirpos < 0)
+		return RETERR(-EINVAL);
+	else if (destination == 0ll || dirpos == 0) {
+		/* rewind to the beginning of the directory */
+		memset(pos, 0, sizeof *pos);
+		return dir_go_to(dir, pos, tap);
+	} else if (destination >= inode->i_size)
+		return RETERR(-ENOENT);
+
+	if (shift < 0) {
+		/* I am afraid of negative numbers */
+		shift = -shift;
+		/* rewinding to the left */
+		if (shift <= (int)pos->position.pos) {
+			/* destination is within the sequence of entries with
+			   duplicate keys. */
+			result = dir_go_to(dir, pos, tap);
+		} else {
+			shift -= pos->position.pos;
+			while (1) {
+				/* repetitions: deadlock is possible when
+				   going to the left. */
+				result = dir_go_to(dir, pos, tap);
+				if (result == 0) {
+					result = rewind_left(tap, shift);
+					if (result == -E_DEADLOCK) {
+						reiser4_tap_done(tap);
+						continue;
+					}
+				}
+				break;
+			}
+		}
+	} else {
+		/* rewinding to the right */
+		result = dir_go_to(dir, pos, tap);
+		if (result == 0)
+			result = rewind_right(tap, shift);
+	}
+	if (result == 0) {
+		result = set_pos(inode, pos, tap);
+		if (result == 0) {
+			/* update pos->position.pos */
+			pos->entry_no = destination;
+			pos->fpos = dirpos;
+		}
+	}
+	return result;
+}
+
+/*
+ * Function that is called by common_readdir() on each directory entry while
+ * doing readdir. The ->filldir callback may block, so we have to release the
+ * long-term lock while calling it. To avoid repeating the tree traversal, a
+ * seal is used. If the seal is broken, we return -E_REPEAT. The node is
+ * unlocked in this case.
+ *
+ * Whether the node is unlocked in case of any other error is undefined. It is
+ * guaranteed to be still locked if success (0) is returned.
+ *
+ * When ->filldir() wants no more, feed_entry() returns 1, and the node is
+ * unlocked.
+ */
+static int
+feed_entry(struct file *f, struct readdir_pos * pos, tap_t * tap,
+	   filldir_t filldir, void *dirent)
+{
+	item_plugin *iplug;
+	char *name;
+	reiser4_key sd_key;
+	int result;
+	char buf[DE_NAME_BUF_LEN];
+	char name_buf[32];
+	char *local_name;
+	unsigned file_type;
+	seal_t seal;
+	coord_t *coord;
+	reiser4_key entry_key;
+
+	coord = tap->coord;
+	iplug = item_plugin_by_coord(coord);
+
+	/* pointer to name within the node */
+	name = iplug->s.dir.extract_name(coord, buf);
+	assert("nikita-1371", name != NULL);
+
+	/* key of object the entry points to */
+	if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
+		return RETERR(-EIO);
+
+	/* we must release the long-term znode lock before calling filldir to
+	   avoid a deadlock which may happen if filldir causes a page fault.
+	   So, copy the name to an intermediate buffer */
+	if (strlen(name) + 1 > sizeof(name_buf)) {
+		local_name = kmalloc(strlen(name) + 1,
+				     reiser4_ctx_gfp_mask_get());
+		if (local_name == NULL)
+			return RETERR(-ENOMEM);
+	} else
+		local_name = name_buf;
+
+	strcpy(local_name, name);
+	file_type = iplug->s.dir.extract_file_type(coord);
+
+	unit_key_by_coord(coord, &entry_key);
+	reiser4_seal_init(&seal, coord, &entry_key);
+
+	longterm_unlock_znode(tap->lh);
+
+	/*
+	 * send information about the directory entry to the ->filldir()
+	 * filler supplied to us by the caller (VFS).
+	 *
+	 * ->filldir is entitled to do weird things. For example, ->filldir
+	 * supplied by knfsd re-enters the file system. Make sure no locks are
+	 * held.
+	 */
+	assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
+
+	reiser4_txn_restart_current();
+	result = filldir(dirent, local_name, (int)strlen(local_name),
+			 /* offset of this entry */
+			 f->f_pos,
+			 /* inode number of the object bound by this entry */
+			 oid_to_uino(get_key_objectid(&sd_key)), file_type);
+	if (local_name != name_buf)
+		kfree(local_name);
+	if (result < 0)
+		/* ->filldir() is satisfied. (no space in buffer, IOW) */
+		result = 1;
+	else
+		result = reiser4_seal_validate(&seal, coord, &entry_key,
+					       tap->lh, tap->mode,
+					       ZNODE_LOCK_HIPRI);
+	return result;
+}
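Editor's sketch of the seal pattern feed_entry() uses around the blocking
->filldir() callback (condensed from the code above; illustrative only):

	reiser4_seal_init(&seal, coord, &entry_key);	/* remember node version */
	longterm_unlock_znode(tap->lh);			/* drop the tree lock */
	result = filldir(dirent, local_name, (int)strlen(local_name),
			 f->f_pos, ino, file_type);	/* may block, may re-enter fs */
	/* cheap revalidation: relock at the same coord if nothing changed,
	 * otherwise -E_REPEAT forces the caller to retraverse the tree */
	result = reiser4_seal_validate(&seal, coord, &entry_key,
				       tap->lh, tap->mode, ZNODE_LOCK_HIPRI);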
+
+static void move_entry(struct readdir_pos * pos, coord_t * coord)
+{
+	reiser4_key de_key;
+	de_id *did;
+
+	/* update @pos */
+	++pos->entry_no;
+	did = &pos->position.dir_entry_key;
+
+	/* get key of directory entry */
+	unit_key_by_coord(coord, &de_key);
+
+	if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
+		/* we are within a sequence of directory entries
+		   with duplicate keys. */
+		++pos->position.pos;
+	else {
+		pos->position.pos = 0;
+		build_de_id_by_key(&de_key, did);
+	}
+	++pos->fpos;
+}
+
+/*
+ * STATELESS READDIR
+ *
+ * readdir support in reiser4 relies on the ability to update readdir_pos
+ * embedded into reiser4_file_fsdata on each directory modification (name
+ * insertion and removal), see the reiser4_readdir_common() function below.
+ * This obviously doesn't work when reiser4 is accessed over NFS, because NFS
+ * doesn't keep any state across client READDIR requests for the same
+ * directory.
+ *
+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
+ * (d_cursor). Whenever an NFS readdir request comes, we detect this, and try
+ * to find the detached reiser4_file_fsdata corresponding to the previous
+ * readdir request. In other words, additional state is maintained on the
+ * server. (This is somewhat contrary to the design goals of the NFS
+ * protocol.)
+ *
+ * To efficiently detect when our ->readdir() method is called by the NFS
+ * server, the dentry is marked as "stateless" in reiser4_decode_fh() (this is
+ * checked by the file_is_stateless() function).
+ *
+ * To find the d_cursor in the pool, we encode the client id (cid) in the
+ * highest bits of the NFS readdir cookie: when the first readdir request
+ * comes to the given directory from the given client, the cookie is set to 0.
+ * This situation is detected, the global cid_counter is incremented and
+ * stored in the highest bits of all direntry offsets returned to the client,
+ * including the last one. As the only valid readdir cookie is one obtained
+ * as direntry->offset, we are guaranteed that the next readdir request
+ * (continuing the current one) will have the current cid in the highest bits
+ * of its starting readdir cookie. All d_cursors are hashed into a
+ * per-super-block hash table by the (oid, cid) key.
+ *
+ * In addition d_cursors are placed into a per-super-block radix tree where
+ * they are keyed by oid alone. This is necessary to efficiently remove them
+ * during rmdir.
+ *
+ * At last, currently unused d_cursors are linked into a special list. This
+ * list is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
+ *
+ */
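Editor's illustration of the cookie layout described above. The split point is
hypothetical - the patch derives it in the d_cursor code, which is not part of
this hunk; only the high-bits/low-bits scheme is taken from the comment:

	#define CID_SHIFT 48	/* hypothetical cid/position split */
	static inline __u64 make_readdir_cookie(__u64 cid, __u64 pos)
	{
		return (cid << CID_SHIFT) | pos;	/* cid in high bits */
	}
	static inline __u64 cookie_cid(__u64 cookie)
	{
		return cookie >> CID_SHIFT;		/* recover client id */
	}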
+
+/*
+ * prepare for readdir.
+ */
+static int dir_readdir_init(struct file *f, tap_t * tap,
+			    struct readdir_pos ** pos)
+{
+	struct inode *inode;
+	reiser4_file_fsdata *fsdata;
+	int result;
+
+	assert("nikita-1359", f != NULL);
+	inode = f->f_dentry->d_inode;
+	assert("nikita-1360", inode != NULL);
+
+	if (!S_ISDIR(inode->i_mode))
+		return RETERR(-ENOTDIR);
+
+	/* try to find detached readdir state */
+	result = reiser4_attach_fsdata(f, inode);
+	if (result != 0)
+		return result;
+
+	fsdata = reiser4_get_file_fsdata(f);
+	assert("nikita-2571", fsdata != NULL);
+	if (IS_ERR(fsdata))
+		return PTR_ERR(fsdata);
+
+	/* add the file descriptor to the readdir list hanging off the
+	 * directory inode. This list is used to scan "readdirs-in-progress"
+	 * while inserting or removing names in the directory. */
+	spin_lock_inode(inode);
+	if (list_empty_careful(&fsdata->dir.linkage))
+		list_add(&fsdata->dir.linkage, get_readdir_list(inode));
+	*pos = &fsdata->dir.readdir;
+	spin_unlock_inode(inode);
+
+	/* move @tap to the current position */
+	return dir_rewind(f, *pos, tap);
+}
+
+/* this is the implementation of vfs's llseek method of struct file_operations
+   for a typical directory.
+   See the comment before reiser4_readdir_common() for an explanation.
+*/
+loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
+{
+	reiser4_context *ctx;
+	loff_t result;
+	struct inode *inode;
+
+	inode = file->f_dentry->d_inode;
+
+	ctx = reiser4_init_context(inode->i_sb);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	mutex_lock(&inode->i_mutex);
+
+	/* update ->f_pos */
+	result = default_llseek(file, off, origin);
+	if (result >= 0) {
+		int ff;
+		coord_t coord;
+		lock_handle lh;
+		tap_t tap;
+		struct readdir_pos *pos;
+
+		coord_init_zero(&coord);
+		init_lh(&lh);
+		reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
+
+		ff = dir_readdir_init(file, &tap, &pos);
+		reiser4_detach_fsdata(file);
+		if (ff != 0)
+			result = (loff_t) ff;
+		reiser4_tap_done(&tap);
+	}
+	reiser4_detach_fsdata(file);
+	mutex_unlock(&inode->i_mutex);
+
+	reiser4_exit_context(ctx);
+	return result;
+}
+
+/* this is the common implementation of vfs's readdir method of struct
+   file_operations
+
+   readdir problems:
+
+   The readdir(2)/getdents(2) interface is based on the implicit assumption
+   that readdir can be restarted from any particular point by supplying the
+   file system with an off_t-full of data. That is, the file system fills the
+   ->d_off field in struct dirent and later the user passes ->d_off to
+   seekdir(3), which is, actually, implemented by glibc as lseek(2) on the
+   directory.
+
+   Reiser4 cannot restart readdir from 64 bits of data, because the two last
+   components of the key of a directory entry are unknown, which gives 128
+   bits: the locality and type fields in the key of a directory entry are
+   always known; to start readdir() from a given point, the objectid and
+   offset fields have to be filled.
+
+   The traditional UNIX API for scanning through a directory
+   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
+   the assumption that a directory is structured very much like a regular
+   file, in particular, it is implied that each name within a given directory
+   (directory entry) can be uniquely identified by a scalar offset and that
+   such an offset is stable across the life-time of the name it identifies.
+
+   This is manifestly not so for reiser4. In reiser4 the only stable unique
+   identifier for a directory entry is its key, which doesn't fit into the
+   seekdir/telldir API.
+
+   solution:
+
+   Within each file descriptor participating in readdir-ing of a directory,
+   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
+   the "current" directory entry that the file descriptor looks at. It
+   contains a key of the directory entry (plus some additional info to deal
+   with non-unique keys that we won't dwell on here) and the logical position
+   of this directory entry starting from the beginning of the directory, that
+   is, the ordinal number of this entry in the readdir order.
+
+   Obviously this logical position is not stable in the face of directory
+   modifications. To work around this, on each addition or removal of a
+   directory entry all file descriptors for the directory inode are scanned
+   and their readdir_pos are updated accordingly (adjust_dir_pos()).
+*/
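Editor's worked example of the adjustment described above: a file descriptor
whose readdir_pos says entry_no == 7 sees a name inserted at a key to its left
(adj == +1); adjust_dir_pos() bumps entry_no to 8, so the next getdents(2)
resumes at the same name instead of repeating or skipping one. A removal to
the left (adj == -1) shifts it back the same way.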
71430cf6
MT
40585+int reiser4_readdir_common(struct file *f /* directory file being read */,
40586+ void *dirent /* opaque data passed to us by VFS */,
40587+ filldir_t filld /* filler function passed to us
40588+ * by VFS */)
44254afd
MT
40589+{
40590+ reiser4_context *ctx;
40591+ int result;
40592+ struct inode *inode;
40593+ coord_t coord;
40594+ lock_handle lh;
40595+ tap_t tap;
71430cf6 40596+ struct readdir_pos *pos;
44254afd
MT
40597+
40598+ assert("nikita-1359", f != NULL);
40599+ inode = f->f_dentry->d_inode;
40600+ assert("nikita-1360", inode != NULL);
40601+
40602+ if (!S_ISDIR(inode->i_mode))
40603+ return RETERR(-ENOTDIR);
40604+
71430cf6 40605+ ctx = reiser4_init_context(inode->i_sb);
44254afd
MT
40606+ if (IS_ERR(ctx))
40607+ return PTR_ERR(ctx);
40608+
40609+ coord_init_zero(&coord);
40610+ init_lh(&lh);
71430cf6 40611+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
44254afd
MT
40612+
40613+ reiser4_readdir_readahead_init(inode, &tap);
40614+
40615+ repeat:
40616+ result = dir_readdir_init(f, &tap, &pos);
40617+ if (result == 0) {
71430cf6 40618+ result = reiser4_tap_load(&tap);
44254afd
MT
40619+ /* scan entries one by one feeding them to @filld */
40620+ while (result == 0) {
40621+ coord_t *coord;
40622+
40623+ coord = tap.coord;
40624+ assert("nikita-2572", coord_is_existing_unit(coord));
40625+ assert("nikita-3227", is_valid_dir_coord(inode, coord));
40626+
40627+ result = feed_entry(f, pos, &tap, filld, dirent);
40628+ if (result > 0) {
40629+ break;
40630+ } else if (result == 0) {
40631+ ++f->f_pos;
40632+ result = go_next_unit(&tap);
40633+ if (result == -E_NO_NEIGHBOR ||
40634+ result == -ENOENT) {
40635+ result = 0;
40636+ break;
40637+ } else if (result == 0) {
40638+ if (is_valid_dir_coord(inode, coord))
40639+ move_entry(pos, coord);
40640+ else
40641+ break;
40642+ }
40643+ } else if (result == -E_REPEAT) {
40644+ /* feed_entry() had to restart. */
40645+ ++f->f_pos;
71430cf6 40646+ reiser4_tap_relse(&tap);
44254afd
MT
40647+ goto repeat;
40648+ } else
40649+ warning("vs-1617",
71430cf6 40650+ "reiser4_readdir_common: unexpected error %d",
44254afd
MT
40651+ result);
40652+ }
71430cf6 40653+ reiser4_tap_relse(&tap);
44254afd
MT
40654+
40655+ if (result >= 0)
40656+ f->f_version = inode->i_version;
40657+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
40658+ result = 0;
71430cf6
MT
40659+ reiser4_tap_done(&tap);
40660+ reiser4_detach_fsdata(f);
44254afd
MT
40661+
40662+ /* try to update directory's atime */
71430cf6 40663+ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
44254afd
MT
40664+ BA_CAN_COMMIT) != 0)
40665+ warning("", "failed to update atime on readdir: %llu",
40666+ get_inode_oid(inode));
40667+ else
40668+ file_accessed(f);
40669+
40670+ context_set_commit_async(ctx);
40671+ reiser4_exit_context(ctx);
40672+
40673+ return (result <= 0) ? result : 0;
40674+}
40675+
40676+/*
40677+ * Local variables:
40678+ * c-indentation-style: "K&R"
40679+ * mode-name: "LC"
40680+ * c-basic-offset: 8
40681+ * tab-width: 8
40682+ * fill-column: 79
40683+ * End:
40684+ */
71430cf6
MT
40685diff -urN linux-2.6.22.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.22/fs/reiser4/plugin/file_plugin_common.c
40686--- linux-2.6.22.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 03:00:00.000000000 +0300
40687+++ linux-2.6.22/fs/reiser4/plugin/file_plugin_common.c 2007-07-29 00:25:34.936712007 +0400
40688@@ -0,0 +1,1007 @@
44254afd
MT
40689+/* Copyright 2005 by Hans Reiser, licensing governed by
40690+ reiser4/README */
40691+
40692+/* this file contains typical implementations for most of methods of
40693+ file plugin
40694+*/
40695+
40696+#include "../inode.h"
40697+#include "object.h"
40698+#include "../safe_link.h"
40699+
40700+#include <linux/quotaops.h>
40701+
40702+static int insert_new_sd(struct inode *inode);
40703+static int update_sd(struct inode *inode);
40704+
40705+/* this is common implementation of write_sd_by_inode method of file plugin
40706+ either insert stat data or update it
40707+ */
40708+int write_sd_by_inode_common(struct inode *inode /* object to save */ )
40709+{
40710+ int result;
40711+
40712+ assert("nikita-730", inode != NULL);
40713+
71430cf6 40714+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
44254afd
MT
40715+ /* object doesn't have stat-data yet */
40716+ result = insert_new_sd(inode);
40717+ else
40718+ result = update_sd(inode);
40719+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
40720+ /* Don't issue warnings about "name is too long" */
40721+ warning("nikita-2221", "Failed to save sd for %llu: %i",
40722+ (unsigned long long)get_inode_oid(inode), result);
40723+ return result;
40724+}
40725+
40726+/* this is common implementation of key_by_inode method of file plugin
40727+ */
40728+int
40729+key_by_inode_and_offset_common(struct inode *inode, loff_t off,
40730+ reiser4_key * key)
40731+{
40732+ reiser4_key_init(key);
40733+ set_key_locality(key, reiser4_inode_data(inode)->locality_id);
40734+ set_key_ordering(key, get_inode_ordering(inode));
40735+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
40736+ set_key_type(key, KEY_BODY_MINOR);
40737+ set_key_offset(key, (__u64) off);
40738+ return 0;
40739+}
40740+
40741+/* this is common implementation of set_plug_in_inode method of file plugin
40742+ */
40743+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
40744+ struct inode *parent /* parent object */ ,
40745+ reiser4_object_create_data * data /* creational
40746+ * data */ )
40747+{
40748+ __u64 mask;
40749+
40750+ object->i_mode = data->mode;
40751+ /* this should be plugin decision */
40752+ object->i_uid = current->fsuid;
40753+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
40754+
40755+ /* support for BSD style group-id assignment. See mount's manual page
40756+ description of bsdgroups ext2 mount options for more details */
40757+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
40758+ object->i_gid = parent->i_gid;
40759+ else if (parent->i_mode & S_ISGID) {
40760+ /* parent directory has sguid bit */
40761+ object->i_gid = parent->i_gid;
40762+ if (S_ISDIR(object->i_mode))
40763+ /* sguid is inherited by sub-directories */
40764+ object->i_mode |= S_ISGID;
40765+ } else
40766+ object->i_gid = current->fsgid;
40767+
40768+ /* this object doesn't have stat-data yet */
40769+	reiser4_inode_set_flag(object, REISER4_NO_SD);
40770+#if 0
40771+ /* this is now called after all inode plugins are initialized:
40772+ do_create_vfs_child after adjust_to_parent */
40773+ /* setup inode and file-operations for this inode */
40774+ setup_inode_ops(object, data);
40775+#endif
40776+ object->i_nlink = 0;
40777+	reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
40778+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
40779+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
40780+ mask |= (1 << LARGE_TIMES_STAT);
40781+
40782+ reiser4_inode_data(object)->extmask = mask;
40783+ return 0;
40784+}
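
The gid logic above follows the classic bsdgroups/sgid rules. Below is a self-contained model of just that decision, with plain ints standing in for gid_t and a flag standing in for the REISER4_BSD_GID mount option:

#include <stdio.h>
#include <sys/stat.h>

/* Minimal model of the gid decision in set_plug_in_inode_common. */
static int choose_gid(int bsdgroups, mode_t parent_mode, int parent_gid,
		      mode_t *child_mode, int process_gid)
{
	if (bsdgroups)
		return parent_gid;		/* always inherit from parent */
	if (parent_mode & S_ISGID) {
		if (S_ISDIR(*child_mode))
			*child_mode |= S_ISGID;	/* sgid propagates to subdirs */
		return parent_gid;
	}
	return process_gid;			/* default: creator's fsgid */
}

int main(void)
{
	mode_t m = S_IFDIR | 0755;
	int gid = choose_gid(0, S_IFDIR | 0775 | S_ISGID, 100, &m, 500);

	printf("gid=%d sgid=%d\n", gid, !!(m & S_ISGID));
	return 0;
}
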
40785+
40786+/* this is common implementation of adjust_to_parent method of file plugin for
40787+ regular files
40788+ */
40789+int adjust_to_parent_common(struct inode *object /* new object */ ,
40790+ struct inode *parent /* parent directory */ ,
40791+ struct inode *root /* root directory */ )
40792+{
40793+ assert("nikita-2165", object != NULL);
40794+ if (parent == NULL)
40795+ parent = root;
40796+ assert("nikita-2069", parent != NULL);
40797+
40798+ /*
40799+ * inherit missing plugins from parent
40800+ */
40801+
40802+ grab_plugin_pset(object, parent, PSET_FILE);
40803+ grab_plugin_pset(object, parent, PSET_SD);
40804+ grab_plugin_pset(object, parent, PSET_FORMATTING);
40805+ grab_plugin_pset(object, parent, PSET_PERM);
40806+ return 0;
40807+}
40808+
40809+/* this is common implementation of adjust_to_parent method of file plugin for
40810+ typical directories
40811+ */
40812+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
40813+ struct inode *parent /* parent directory */ ,
40814+ struct inode *root /* root directory */ )
40815+{
40816+ int result = 0;
40817+ pset_member memb;
40818+
40819+ assert("nikita-2166", object != NULL);
40820+ if (parent == NULL)
40821+ parent = root;
40822+ assert("nikita-2167", parent != NULL);
40823+
40824+ /*
40825+ * inherit missing plugins from parent
40826+ */
40827+ for (memb = 0; memb < PSET_LAST; ++memb) {
40828+		result = grab_plugin_pset(object, parent, memb);
40829+ if (result != 0)
40830+ break;
40831+ }
40832+ return result;
40833+}
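
For directories every plugin-set member is inherited, so the function is just a loop that stops at the first failure. The sketch below models that loop over a made-up three-slot plugin set; the slot names, the struct, and grab_member() are illustrative stand-ins for pset_member and grab_plugin_pset():

#include <stdio.h>

enum { SLOT_FILE, SLOT_HASH, SLOT_PERM, SLOT_LAST };

struct pset { const char *memb[SLOT_LAST]; };

/* Copy the parent's member where the child has none; 0 == success. */
static int grab_member(struct pset *child, const struct pset *parent, int m)
{
	if (child->memb[m] == NULL)
		child->memb[m] = parent->memb[m];	/* inherit */
	return 0;
}

int main(void)
{
	struct pset parent = { { "unix-file", "r5", "unix-perm" } };
	struct pset child = { { NULL, "tea", NULL } };
	int m, result = 0;

	for (m = 0; m < SLOT_LAST; ++m) {
		result = grab_member(&child, &parent, m);
		if (result != 0)
			break;			/* stop at first failure */
	}
	for (m = 0; m < SLOT_LAST; ++m)
		printf("slot %d: %s\n", m, child.memb[m]);
	return result;
}
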
40834+
40835+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
40836+ struct inode *parent /* parent directory */,
40837+ struct inode *root /* root directory */)
40838+{
40839+ int result;
40840+ result = adjust_to_parent_common(object, parent, root);
40841+ if (result)
40842+ return result;
40843+ assert("edward-1416", parent != NULL);
40844+
40845+ grab_plugin_pset(object, parent, PSET_CLUSTER);
40846+ grab_plugin_pset(object, parent, PSET_CIPHER);
40847+ grab_plugin_pset(object, parent, PSET_DIGEST);
40848+ grab_plugin_pset(object, parent, PSET_COMPRESSION);
40849+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
40850+
40851+ return 0;
40852+}
40853+
40854+/* this is common implementation of create_object method of file plugin
40855+ */
40856+int reiser4_create_object_common(struct inode *object, struct inode *parent,
40857+ reiser4_object_create_data * data)
40858+{
40859+ reiser4_block_nr reserve;
40860+ assert("nikita-744", object != NULL);
40861+ assert("nikita-745", parent != NULL);
40862+ assert("nikita-747", data != NULL);
40863+	assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
40864+
40865+ reserve = estimate_create_common(object);
40866+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
40867+ return RETERR(-ENOSPC);
40868+ return write_sd_by_inode_common(object);
40869+}
40870+
40871+static int common_object_delete_no_reserve(struct inode *inode);
40872+
40873+/**
40874+ * reiser4_delete_object_common - delete_object of file_plugin
40875+ * @inode: inode to be deleted
40876+ *
40877+ * This is common implementation of delete_object method of file_plugin. It
40878+ * applies to object its deletion consists of removing two items - stat data
40879+ * and safe-link.
40880+ */
40881+int reiser4_delete_object_common(struct inode *inode)
40882+{
40883+ int result;
40884+
40885+ assert("nikita-1477", inode != NULL);
40886+ /* FIXME: if file body deletion failed (i/o error, for instance),
40887+ inode->i_size can be != 0 here */
40888+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
40889+ assert("nikita-3421", inode->i_nlink == 0);
40890+
40891+	if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
40892+ reiser4_block_nr reserve;
40893+
40894+ /* grab space which is needed to remove 2 items from the tree:
40895+ stat data and safe-link */
40896+ reserve = 2 *
40897+ estimate_one_item_removal(reiser4_tree_by_inode(inode));
40898+ if (reiser4_grab_space_force(reserve,
40899+ BA_RESERVED | BA_CAN_COMMIT))
40900+ return RETERR(-ENOSPC);
40901+ result = common_object_delete_no_reserve(inode);
40902+ } else
40903+ result = 0;
40904+ return result;
40905+}
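
The reservation is plain arithmetic: deleting the object removes two items (the stat-data and its safe-link), hence twice the one-item-removal estimate is grabbed up front. A toy version of that reserve-then-act pattern, with a made-up per-item estimate standing in for estimate_one_item_removal():

#include <stdio.h>

static unsigned long estimate_one_item_removal(void)
{
	return 8;	/* illustrative: blocks touched per removed item */
}

static int grab_space(unsigned long blocks, unsigned long *free)
{
	if (blocks > *free)
		return -1;	/* stands in for RETERR(-ENOSPC) */
	*free -= blocks;
	return 0;
}

int main(void)
{
	unsigned long free = 100;
	/* two items go away: stat-data and safe-link */
	unsigned long reserve = 2 * estimate_one_item_removal();

	if (grab_space(reserve, &free))
		return 1;
	printf("reserved %lu, %lu left\n", reserve, free);
	return 0;
}
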
40906+
40907+/**
40908+ * reiser4_delete_dir_common - delete_object of file_plugin
40909+ * @inode: inode to be deleted
40910+ *
40911+ * This is common implementation of delete_object method of file_plugin for
40912+ * typical directory. It calls done method of dir_plugin to remove "." and
40913+ * removes stat data and safe-link.
40914+ */
40915+int reiser4_delete_dir_common(struct inode *inode)
40916+{
40917+ int result;
40918+ dir_plugin *dplug;
40919+
40920+ assert("", (get_current_context() &&
40921+ get_current_context()->trans->atom == NULL));
40922+
40923+ dplug = inode_dir_plugin(inode);
40924+ assert("vs-1101", dplug && dplug->done);
40925+
40926+ /* kill cursors which might be attached to inode */
40927+	reiser4_kill_cursors(inode);
40928+
40929+ /* grab space enough for removing two items */
40930+ if (reiser4_grab_space
40931+	    (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
40932+ BA_RESERVED | BA_CAN_COMMIT))
40933+ return RETERR(-ENOSPC);
40934+
40935+ result = dplug->done(inode);
40936+ if (!result)
40937+ result = common_object_delete_no_reserve(inode);
40938+ return result;
40939+}
40940+
40941+/* this is common implementation of add_link method of file plugin
40942+ */
40943+int reiser4_add_link_common(struct inode *object, struct inode *parent)
40944+{
40945+ /*
40946+ * increment ->i_nlink and update ->i_ctime
40947+ */
40948+
40949+ INODE_INC_FIELD(object, i_nlink);
40950+ object->i_ctime = CURRENT_TIME;
40951+ return 0;
40952+}
40953+
40954+/* this is common implementation of rem_link method of file plugin
40955+ */
40956+int reiser4_rem_link_common(struct inode *object, struct inode *parent)
40957+{
40958+ assert("nikita-2021", object != NULL);
40959+ assert("nikita-2163", object->i_nlink > 0);
40960+
40961+ /*
40962+ * decrement ->i_nlink and update ->i_ctime
40963+ */
40964+
40965+ INODE_DEC_FIELD(object, i_nlink);
40966+ object->i_ctime = CURRENT_TIME;
40967+ return 0;
40968+}
40969+
40970+/* this is common implementation of rem_link method of file plugin for typical
40971+ directory
40972+*/
40973+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
40974+{
40975+ assert("nikita-20211", object != NULL);
40976+ assert("nikita-21631", object->i_nlink > 0);
40977+
40978+ /*
40979+ * decrement ->i_nlink and update ->i_ctime
40980+ */
40981+ INODE_DEC_FIELD(object, i_nlink);
40982+ if (object->i_nlink == 1)
40983+ INODE_DEC_FIELD(object, i_nlink);
40984+ object->i_ctime = CURRENT_TIME;
40985+ return 0;
40986+}
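
The double decrement reflects the fact that a live directory always holds a self-reference for ".", so its link count can never legitimately rest at 1; when it would, only "." remains and the self-link is dropped too. A toy model of just that counter rule:

#include <assert.h>
#include <stdio.h>

/* Model of rem_link_common_dir's nlink handling. */
static void dir_rem_link(unsigned int *nlink)
{
	assert(*nlink > 0);
	--*nlink;
	if (*nlink == 1)	/* only "." left: directory is dead */
		--*nlink;
}

int main(void)
{
	unsigned int nlink = 3;	/* ".", name in parent, ".." of one subdir */

	dir_rem_link(&nlink);	/* subdirectory removed: 3 -> 2 */
	dir_rem_link(&nlink);	/* name in parent removed: 2 -> 1 -> 0 */
	printf("nlink=%u\n", nlink);
	return 0;
}
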
40987+
40988+/* this is common implementation of owns_item method of file plugin
40989+ compare objectids of keys in inode and coord */
40990+int owns_item_common(const struct inode *inode, /* object to check
40991+ * against */
40992+ const coord_t * coord /* coord to check */ )
40993+{
40994+ reiser4_key item_key;
40995+ reiser4_key file_key;
40996+
40997+ assert("nikita-760", inode != NULL);
40998+ assert("nikita-761", coord != NULL);
40999+
41000+ return coord_is_existing_item(coord) &&
41001+ (get_key_objectid(build_sd_key(inode, &file_key)) ==
41002+ get_key_objectid(item_key_by_coord(coord, &item_key)));
41003+}
41004+
41005+/* this is common implementation of owns_item method of file plugin
41006+ for typical directory
41007+*/
41008+int owns_item_common_dir(const struct inode *inode, /* object to check against */
41009+ const coord_t * coord /* coord of item to check */ )
41010+{
41011+ reiser4_key item_key;
41012+
41013+ assert("nikita-1335", inode != NULL);
41014+ assert("nikita-1334", coord != NULL);
41015+
41016+	if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
41017+ return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41018+ get_inode_oid(inode);
41019+ else
41020+ return owns_item_common(inode, coord);
41021+}
41022+
41023+/* this is common implementation of can_add_link method of file plugin
41024+ checks whether yet another hard links to this object can be added
41025+*/
41026+int can_add_link_common(const struct inode *object /* object to check */ )
41027+{
41028+ assert("nikita-732", object != NULL);
41029+
41030+ /* inode->i_nlink is unsigned int, so just check for integer
41031+ overflow */
41032+ return object->i_nlink + 1 != 0;
41033+}
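
The check relies on unsigned wraparound: i_nlink + 1 equals 0 exactly when the counter already holds its maximum value. Demonstrated standalone:

#include <limits.h>
#include <stdio.h>

/* can_add_link_common boils down to: adding one more link must not
 * wrap the unsigned link counter around to zero. */
static int can_add_link(unsigned int nlink)
{
	return nlink + 1 != 0;	/* unsigned arithmetic wraps, so this is safe */
}

int main(void)
{
	printf("%d %d\n", can_add_link(1), can_add_link(UINT_MAX));	/* 1 0 */
	return 0;
}
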
41034+
41035+/* this is common implementation of can_rem_link method of file plugin for
41036+ typical directory
41037+*/
41038+int can_rem_link_common_dir(const struct inode *inode)
41039+{
41040+	/* is_dir_empty() returns 0 if dir is empty */
41041+ return !is_dir_empty(inode);
41042+}
41043+
41044+/* this is common implementation of detach method of file plugin for typical
41045+ directory
41046+*/
41047+int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
41048+{
41049+ dir_plugin *dplug;
41050+
41051+ dplug = inode_dir_plugin(child);
41052+ assert("nikita-2883", dplug != NULL);
41053+ assert("nikita-2884", dplug->detach != NULL);
41054+ return dplug->detach(child, parent);
41055+}
41056+
41057+/* this is common implementation of bind method of file plugin for typical
41058+ directory
41059+*/
41060+int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
41061+{
41062+ dir_plugin *dplug;
41063+
41064+ dplug = inode_dir_plugin(child);
41065+ assert("nikita-2646", dplug != NULL);
41066+ return dplug->attach(child, parent);
41067+}
41068+
41069+static int process_truncate(struct inode *, __u64 size);
41070+
41071+/* this is common implementation of safelink method of file plugin
41072+ */
41073+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41074+{
41075+ int result;
41076+
41077+ assert("vs-1705", get_current_context()->trans->atom == NULL);
41078+ if (link == SAFE_UNLINK)
41079+ /* nothing to do. iput() in the caller (process_safelink) will
41080+ * finish with file */
41081+ result = 0;
41082+ else if (link == SAFE_TRUNCATE)
41083+ result = process_truncate(object, value);
41084+ else {
41085+ warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41086+ result = RETERR(-EIO);
41087+ }
41088+ return result;
41089+}
41090+
41091+/* this is common implementation of estimate.create method of file plugin
41092+ can be used when object creation involves insertion of one item (usually stat
41093+ data) into tree
41094+*/
41095+reiser4_block_nr estimate_create_common(const struct inode * object)
41096+{
41097+	return estimate_one_insert_item(reiser4_tree_by_inode(object));
41098+}
41099+
41100+/* this is common implementation of estimate.create method of file plugin for
41101+ typical directory
41102+ can be used when directory creation involves insertion of two items (usually
41103+ stat data and item containing "." and "..") into tree
41104+*/
41105+reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41106+{
41107+	return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
41108+}
41109+
41110+/* this is common implementation of estimate.update method of file plugin
41111+ can be used when stat data update does not do more than inserting a unit
41112+ into a stat data item which is probably true for most cases
41113+*/
41114+reiser4_block_nr estimate_update_common(const struct inode * inode)
41115+{
41116+	return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
41117+}
41118+
41119+/* this is common implementation of estimate.unlink method of file plugin
41120+ */
41121+reiser4_block_nr
41122+estimate_unlink_common(const struct inode * object UNUSED_ARG,
41123+ const struct inode * parent UNUSED_ARG)
41124+{
41125+ return 0;
41126+}
41127+
41128+/* this is common implementation of estimate.unlink method of file plugin for
41129+ typical directory
41130+*/
41131+reiser4_block_nr
41132+estimate_unlink_common_dir(const struct inode * object,
41133+ const struct inode * parent)
41134+{
41135+ dir_plugin *dplug;
41136+
41137+ dplug = inode_dir_plugin(object);
41138+ assert("nikita-2888", dplug != NULL);
41139+ assert("nikita-2887", dplug->estimate.unlink != NULL);
41140+ return dplug->estimate.unlink(object, parent);
41141+}
41142+
41143+char *wire_write_common(struct inode *inode, char *start)
41144+{
41145+ return build_inode_onwire(inode, start);
41146+}
41147+
41148+char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41149+{
41150+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41151+}
41152+
41153+struct dentry *wire_get_common(struct super_block *sb,
41154+ reiser4_object_on_wire * obj)
41155+{
41156+ struct inode *inode;
41157+ struct dentry *dentry;
41158+ reiser4_key key;
41159+
41160+ extract_key_from_id(&obj->u.std.key_id, &key);
41161+ inode = reiser4_iget(sb, &key, 1);
41162+ if (!IS_ERR(inode)) {
41163+ reiser4_iget_complete(inode);
41164+ dentry = d_alloc_anon(inode);
41165+ if (dentry == NULL) {
41166+ iput(inode);
41167+ dentry = ERR_PTR(-ENOMEM);
41168+ } else
41169+ dentry->d_op = &get_super_private(sb)->ops.dentry;
41170+ } else if (PTR_ERR(inode) == -ENOENT)
41171+ /*
41172+ * inode wasn't found at the key encoded in the file
41173+ * handle. Hence, file handle is stale.
41174+ */
41175+ dentry = ERR_PTR(RETERR(-ESTALE));
41176+ else
41177+ dentry = (void *)inode;
41178+ return dentry;
41179+}
41180+
41181+int wire_size_common(struct inode *inode)
41182+{
41183+ return inode_onwire_size(inode);
41184+}
41185+
41186+void wire_done_common(reiser4_object_on_wire * obj)
41187+{
41188+ /* nothing to do */
41189+}
41190+
41191+/* helper function to print errors */
41192+static void key_warning(const reiser4_key * key /* key to print */ ,
41193+ const struct inode *inode,
41194+ int code /* error code to print */ )
41195+{
41196+ assert("nikita-716", key != NULL);
41197+
41198+ if (code != -ENOMEM) {
41199+ warning("nikita-717", "Error for inode %llu (%i)",
41200+ (unsigned long long)get_key_objectid(key), code);
41201+		reiser4_print_key("for key", key);
41202+ }
41203+}
41204+
41205+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41206+#if REISER4_DEBUG
41207+static void
41208+check_inode_seal(const struct inode *inode,
41209+ const coord_t * coord, const reiser4_key * key)
41210+{
41211+ reiser4_key unit_key;
41212+
41213+ unit_key_by_coord(coord, &unit_key);
41214+ assert("nikita-2752",
41215+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
41216+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
41217+}
41218+
41219+static void check_sd_coord(coord_t * coord, const reiser4_key * key)
41220+{
41221+ reiser4_key ukey;
41222+
41223+ coord_clear_iplug(coord);
41224+ if (zload(coord->node))
41225+ return;
41226+
41227+ if (!coord_is_existing_unit(coord) ||
41228+ !item_plugin_by_coord(coord) ||
41229+ !keyeq(unit_key_by_coord(coord, &ukey), key) ||
41230+ (znode_get_level(coord->node) != LEAF_LEVEL) ||
41231+ !item_is_statdata(coord)) {
41232+ warning("nikita-1901", "Conspicuous seal");
41233+		reiser4_print_key("key", key);
41234+ print_coord("coord", coord, 1);
41235+ impossible("nikita-2877", "no way");
41236+ }
41237+ zrelse(coord->node);
41238+}
41239+
41240+#else
41241+#define check_inode_seal(inode, coord, key) noop
41242+#define check_sd_coord(coord, key) noop
41243+#endif
41244+
41245+/* insert new stat-data into tree. Called with inode state
41246+ locked. Return inode state locked. */
41247+static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
41248+{
41249+ int result;
41250+ reiser4_key key;
41251+ coord_t coord;
41252+ reiser4_item_data data;
41253+ char *area;
41254+ reiser4_inode *ref;
41255+ lock_handle lh;
41256+ oid_t oid;
41257+
41258+ assert("nikita-723", inode != NULL);
41259+	assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
41260+
41261+ ref = reiser4_inode_data(inode);
41262+ spin_lock_inode(inode);
41263+
41264+ if (ref->plugin_mask != 0)
41265+ /* inode has non-standard plugins */
41266+ inode_set_extension(inode, PLUGIN_STAT);
41267+ /*
41268+ * prepare specification of new item to be inserted
41269+ */
41270+
41271+ data.iplug = inode_sd_plugin(inode);
41272+ data.length = data.iplug->s.sd.save_len(inode);
41273+ spin_unlock_inode(inode);
41274+
41275+ data.data = NULL;
41276+ data.user = 0;
41277+/* could be optimized for case where there is only one node format in
41278+ * use in the filesystem, probably there are lots of such
41279+ * places we could optimize for only one node layout.... -Hans */
41280+	if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
41281+		/* This is a silly check, but we don't know the actual node
41282+		   that the insertion will go into. */
41283+ return RETERR(-ENAMETOOLONG);
41284+ }
41285+ oid = oid_allocate(inode->i_sb);
41286+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
41287+ if (oid == ABSOLUTE_MAX_OID)
41288+ return RETERR(-EOVERFLOW);
41289+
41290+ set_inode_oid(inode, oid);
41291+
41292+ coord_init_zero(&coord);
41293+ init_lh(&lh);
41294+
41295+	result = insert_by_key(reiser4_tree_by_inode(inode),
41296+ build_sd_key(inode, &key), &data, &coord, &lh,
41297+ /* stat data lives on a leaf level */
41298+ LEAF_LEVEL, CBK_UNIQUE);
41299+
41300+ /* we don't want to re-check that somebody didn't insert
41301+ stat-data while we were doing io, because if it did,
41302+ insert_by_key() returned error. */
41303+ /* but what _is_ possible is that plugin for inode's stat-data,
41304+ list of non-standard plugins or their state would change
41305+ during io, so that stat-data wouldn't fit into sd. To avoid
41306+ this race we keep inode_state lock. This lock has to be
41307+ taken each time you access inode in a way that would cause
41308+ changes in sd size: changing plugins etc.
41309+ */
41310+
41311+ if (result == IBK_INSERT_OK) {
41312+ coord_clear_iplug(&coord);
41313+ result = zload(coord.node);
41314+ if (result == 0) {
41315+ /* have we really inserted stat data? */
41316+ assert("nikita-725", item_is_statdata(&coord));
41317+
41318+ /* inode was just created. It is inserted into hash
41319+ table, but no directory entry was yet inserted into
41320+ parent. So, inode is inaccessible through
41321+ ->lookup(). All places that directly grab inode
41322+ from hash-table (like old knfsd), should check
41323+ IMMUTABLE flag that is set by common_create_child.
41324+ */
41325+ assert("nikita-3240", data.iplug != NULL);
41326+ assert("nikita-3241", data.iplug->s.sd.save != NULL);
41327+ area = item_body_by_coord(&coord);
41328+ result = data.iplug->s.sd.save(inode, &area);
41329+ znode_make_dirty(coord.node);
41330+ if (result == 0) {
41331+ /* object has stat-data now */
41332+				reiser4_inode_clr_flag(inode, REISER4_NO_SD);
41333+				reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41334+				/* initialise stat-data seal */
41335+				reiser4_seal_init(&ref->sd_seal, &coord, &key);
41336+ ref->sd_coord = coord;
41337+ check_inode_seal(inode, &coord, &key);
41338+ } else if (result != -ENOMEM)
41339+ /*
41340+ * convert any other error code to -EIO to
41341+ * avoid confusing user level with unexpected
41342+ * errors.
41343+ */
41344+ result = RETERR(-EIO);
41345+ zrelse(coord.node);
41346+ }
41347+ }
41348+ done_lh(&lh);
41349+
41350+ if (result != 0)
41351+ key_warning(&key, inode, result);
41352+ else
41353+ oid_count_allocated();
41354+
41355+ return result;
41356+}
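
insert_new_sd() finishes by priming the stat-data seal: a cached coord that later lookups try first, falling back to a full tree traversal only when it no longer validates (see locate_inode_sd() below). The idea can be modelled as a version-stamped cache; struct toy_node and toy_seal below are illustrative, not the real seal_t:

#include <stdio.h>

struct toy_node { unsigned long version; };
struct toy_seal { struct toy_node *node; unsigned long version; };

static void seal_init(struct toy_seal *s, struct toy_node *n)
{
	s->node = n;
	s->version = n->version;	/* remember node's state at seal time */
}

static int seal_validate(const struct toy_seal *s)
{
	/* valid only if nothing modified the node since seal_init() */
	return s->node->version == s->version;
}

int main(void)
{
	struct toy_node n = { .version = 7 };
	struct toy_seal s;

	seal_init(&s, &n);
	printf("before change: %d\n", seal_validate(&s));	/* 1 */
	n.version++;						/* node dirtied */
	printf("after change: %d\n", seal_validate(&s));	/* 0: re-lookup */
	return 0;
}
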
41357+
41358+/* find sd of inode in a tree, deal with errors */
41359+int lookup_sd(struct inode *inode /* inode to look sd for */ ,
41360+ znode_lock_mode lock_mode /* lock mode */ ,
41361+ coord_t * coord /* resulting coord */ ,
41362+ lock_handle * lh /* resulting lock handle */ ,
41363+ const reiser4_key * key /* resulting key */ ,
41364+ int silent)
41365+{
41366+ int result;
41367+ __u32 flags;
41368+
41369+ assert("nikita-1692", inode != NULL);
41370+ assert("nikita-1693", coord != NULL);
41371+ assert("nikita-1694", key != NULL);
41372+
41373+ /* look for the object's stat data in a tree.
41374+ This returns in "node" pointer to a locked znode and in "pos"
41375+ position of an item found in node. Both are only valid if
41376+ coord_found is returned. */
41377+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
41378+ flags |= CBK_UNIQUE;
41379+ /*
41380+ * traverse tree to find stat data. We cannot use vroot here, because
41381+ * it only covers _body_ of the file, and stat data don't belong
41382+ * there.
41383+ */
41384+	result = coord_by_key(reiser4_tree_by_inode(inode),
41385+ key,
41386+ coord,
41387+ lh,
41388+ lock_mode,
41389+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
41390+ if (REISER4_DEBUG && result == 0)
41391+ check_sd_coord(coord, key);
41392+
41393+ if (result != 0 && !silent)
41394+ key_warning(key, inode, result);
41395+ return result;
41396+}
41397+
41398+static int
41399+locate_inode_sd(struct inode *inode,
41400+ reiser4_key * key, coord_t * coord, lock_handle * lh)
41401+{
41402+ reiser4_inode *state;
41403+ seal_t seal;
41404+ int result;
41405+
41406+ assert("nikita-3483", inode != NULL);
41407+
41408+ state = reiser4_inode_data(inode);
41409+ spin_lock_inode(inode);
41410+ *coord = state->sd_coord;
41411+ coord_clear_iplug(coord);
41412+ seal = state->sd_seal;
41413+ spin_unlock_inode(inode);
41414+
41415+ build_sd_key(inode, key);
41416+	if (reiser4_seal_is_set(&seal)) {
41417+		/* first, try to use seal */
41418+ result = reiser4_seal_validate(&seal,
41419+ coord,
41420+ key,
41421+ lh, ZNODE_WRITE_LOCK,
41422+ ZNODE_LOCK_LOPRI);
41423+ if (result == 0)
41424+ check_sd_coord(coord, key);
41425+ } else
41426+ result = -E_REPEAT;
41427+
41428+ if (result != 0) {
41429+ coord_init_zero(coord);
41430+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
41431+ }
41432+ return result;
41433+}
41434+
41435+#if REISER4_DEBUG
41436+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
41437+{
41438+ return (get_key_locality(k1) == get_key_locality(k2) &&
41439+ get_key_type(k1) == get_key_type(k2) &&
41440+ get_key_band(k1) == get_key_band(k2) &&
41441+ get_key_ordering(k1) == get_key_ordering(k2) &&
41442+ get_key_objectid(k1) == get_key_objectid(k2));
41443+}
41444+
41445+#include "../tree_walk.h"
41446+
41447+/* make some checks before and after stat-data resize operation */
41448+static int check_sd_resize(struct inode * inode, coord_t * coord,
41449+ int length, int progress /* 1 means after resize */)
41450+{
41451+ int ret = 0;
41452+ lock_handle left_lock;
41453+ coord_t left_coord;
41454+ reiser4_key left_key;
41455+ reiser4_key key;
41456+
41457+ if (inode_file_plugin(inode) !=
41458+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
41459+ return 0;
41460+ if (!length)
41461+ return 0;
41462+ if (coord->item_pos != 0)
41463+ return 0;
41464+
41465+ init_lh(&left_lock);
41466+ ret = reiser4_get_left_neighbor(&left_lock,
41467+ coord->node,
41468+ ZNODE_WRITE_LOCK,
41469+ GN_CAN_USE_UPPER_LEVELS);
41470+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
41471+ ret == -ENOENT || ret == -EINVAL
41472+ || ret == -E_DEADLOCK) {
41473+ ret = 0;
41474+ goto exit;
41475+ }
41476+ ret = zload(left_lock.node);
41477+ if (ret)
41478+ goto exit;
41479+ coord_init_last_unit(&left_coord, left_lock.node);
41480+ item_key_by_coord(&left_coord, &left_key);
41481+ item_key_by_coord(coord, &key);
41482+
41483+ if (all_but_offset_key_eq(&key, &left_key))
41484+		/* corruption occurred */
41485+ ret = 1;
41486+ zrelse(left_lock.node);
41487+ exit:
41488+ done_lh(&left_lock);
41489+ return ret;
41490+}
41491+#endif
41492+
41493+/* update stat-data at @coord */
41494+static int
41495+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
41496+ lock_handle * lh)
41497+{
41498+ int result;
41499+ reiser4_item_data data;
41500+ char *area;
41501+ reiser4_inode *state;
41502+ znode *loaded;
41503+
41504+ state = reiser4_inode_data(inode);
41505+
41506+ coord_clear_iplug(coord);
41507+ result = zload(coord->node);
41508+ if (result != 0)
41509+ return result;
41510+ loaded = coord->node;
41511+
41512+ spin_lock_inode(inode);
41513+ assert("nikita-728", inode_sd_plugin(inode) != NULL);
41514+ data.iplug = inode_sd_plugin(inode);
41515+
41516+ /* if inode has non-standard plugins, add appropriate stat data
41517+ * extension */
41518+ if (state->extmask & (1 << PLUGIN_STAT)) {
41519+ if (state->plugin_mask == 0)
41520+ inode_clr_extension(inode, PLUGIN_STAT);
41521+ } else if (state->plugin_mask != 0)
41522+ inode_set_extension(inode, PLUGIN_STAT);
41523+
41524+ if (state->extmask & (1 << HEIR_STAT)) {
41525+ if (state->heir_mask == 0)
41526+ inode_clr_extension(inode, HEIR_STAT);
41527+ } else if (state->heir_mask != 0)
41528+ inode_set_extension(inode, HEIR_STAT);
41529+
41530+ /* data.length is how much space to add to (or remove
41531+ from if negative) sd */
41532+	if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
41533+ /* recalculate stat-data length */
41534+ data.length =
41535+ data.iplug->s.sd.save_len(inode) -
41536+ item_length_by_coord(coord);
41537+		reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41538+ } else
41539+ data.length = 0;
41540+ spin_unlock_inode(inode);
41541+
41542+ /* if on-disk stat data is of different length than required
41543+ for this inode, resize it */
41544+
41545+ if (data.length != 0) {
41546+ data.data = NULL;
41547+ data.user = 0;
41548+
41549+ assert("edward-1441",
41550+ !check_sd_resize(inode, coord,
41551+ data.length, 0/* before resize */));
41552+
41553+ /* insertion code requires that insertion point (coord) was
41554+ * between units. */
41555+ coord->between = AFTER_UNIT;
41556+ result = reiser4_resize_item(coord, &data, key, lh,
41557+ COPI_DONT_SHIFT_LEFT);
41558+ if (result != 0) {
41559+ key_warning(key, inode, result);
41560+ zrelse(loaded);
41561+ return result;
41562+ }
41563+ if (loaded != coord->node) {
41564+ /* reiser4_resize_item moved coord to another node.
41565+ Zload it */
41566+ zrelse(loaded);
41567+ coord_clear_iplug(coord);
41568+ result = zload(coord->node);
41569+ if (result != 0)
41570+ return result;
41571+ loaded = coord->node;
41572+ }
41573+ assert("edward-1442",
41574+ !check_sd_resize(inode, coord,
41575+ data.length, 1/* after resize */));
41576+	}
41577+ area = item_body_by_coord(coord);
41578+ spin_lock_inode(inode);
41579+ result = data.iplug->s.sd.save(inode, &area);
41580+ znode_make_dirty(coord->node);
41581+
41582+ /* re-initialise stat-data seal */
41583+
41584+ /*
41585+ * coord.between was possibly skewed from AT_UNIT when stat-data size
41586+ * was changed and new extensions were pasted into item.
41587+ */
41588+ coord->between = AT_UNIT;
41589+	reiser4_seal_init(&state->sd_seal, coord, key);
41590+ state->sd_coord = *coord;
41591+ spin_unlock_inode(inode);
41592+ check_inode_seal(inode, coord, key);
41593+ zrelse(loaded);
41594+ return result;
41595+}
41596+
41597+/* Update existing stat-data in a tree. Called with inode state locked. Return
41598+ inode state locked. */
41599+static int update_sd(struct inode *inode /* inode to update sd for */ )
41600+{
41601+ int result;
41602+ reiser4_key key;
41603+ coord_t coord;
41604+ lock_handle lh;
41605+
41606+ assert("nikita-726", inode != NULL);
41607+
41608+ /* no stat-data, nothing to update?! */
41609+	assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
41610+
41611+ init_lh(&lh);
41612+
41613+ result = locate_inode_sd(inode, &key, &coord, &lh);
41614+ if (result == 0)
41615+ result = update_sd_at(inode, &coord, &key, &lh);
41616+ done_lh(&lh);
41617+
41618+ return result;
41619+}
41620+
41621+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
41622+   Remove object stat data. Space for that must be reserved by the caller beforehand.
41623+*/
41624+static int
41625+common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
41626+{
41627+ int result;
41628+
41629+ assert("nikita-1477", inode != NULL);
41630+
41631+	if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41632+ reiser4_key sd_key;
41633+
41634+ DQUOT_FREE_INODE(inode);
41635+ DQUOT_DROP(inode);
41636+
41637+ build_sd_key(inode, &sd_key);
41638+ result =
41639+		    reiser4_cut_tree(reiser4_tree_by_inode(inode),
41640+				     &sd_key, &sd_key, NULL, 0);
41641+		if (result == 0) {
41642+			reiser4_inode_set_flag(inode, REISER4_NO_SD);
41643+ result = oid_release(inode->i_sb, get_inode_oid(inode));
41644+ if (result == 0) {
41645+ oid_count_released();
41646+
41647+			result = safe_link_del(reiser4_tree_by_inode(inode),
41648+ get_inode_oid(inode),
41649+ SAFE_UNLINK);
41650+ }
41651+ }
41652+ } else
41653+ result = 0;
41654+ return result;
41655+}
41656+
41657+/* helper for safelink_common */
41658+static int process_truncate(struct inode *inode, __u64 size)
41659+{
41660+ int result;
41661+ struct iattr attr;
41662+ file_plugin *fplug;
41663+ reiser4_context *ctx;
41664+ struct dentry dentry;
41665+
41666+ assert("vs-21", is_in_reiser4_context());
41667+	ctx = reiser4_init_context(inode->i_sb);
41668+ assert("vs-22", !IS_ERR(ctx));
41669+
41670+ attr.ia_size = size;
41671+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
41672+ fplug = inode_file_plugin(inode);
41673+
41674+ mutex_lock(&inode->i_mutex);
41675+ assert("vs-1704", get_current_context()->trans->atom == NULL);
41676+ dentry.d_inode = inode;
41677+ result = inode->i_op->setattr(&dentry, &attr);
41678+ mutex_unlock(&inode->i_mutex);
41679+
41680+ context_set_commit_async(ctx);
41681+ reiser4_exit_context(ctx);
41682+
41683+ return result;
41684+}
41685+
41686+/*
41687+ Local variables:
41688+ c-indentation-style: "K&R"
41689+ mode-name: "LC"
41690+ c-basic-offset: 8
41691+ tab-width: 8
41692+ fill-column: 80
41693+ scroll-step: 1
41694+ End:
41695+*/
41696diff -urN linux-2.6.22.orig/fs/reiser4/plugin/hash.c linux-2.6.22/fs/reiser4/plugin/hash.c
41697--- linux-2.6.22.orig/fs/reiser4/plugin/hash.c 1970-01-01 03:00:00.000000000 +0300
41698+++ linux-2.6.22/fs/reiser4/plugin/hash.c 2007-07-29 00:25:34.936712007 +0400
41699@@ -0,0 +1,353 @@
41700+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
41701+ * reiser4/README */
41702+
41703+/* Hash functions */
41704+
41705+#include "../debug.h"
41706+#include "plugin_header.h"
41707+#include "plugin.h"
41708+#include "../super.h"
41709+#include "../inode.h"
41710+
41711+#include <linux/types.h>
41712+
41713+/* old rupasov (yura) hash */
41714+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
41715+ int len /* @name's length */ )
41716+{
41717+ int i;
41718+ int j;
41719+ int pow;
41720+ __u64 a;
41721+ __u64 c;
41722+
41723+ assert("nikita-672", name != NULL);
41724+ assert("nikita-673", len >= 0);
41725+
41726+ for (pow = 1, i = 1; i < len; ++i)
41727+ pow = pow * 10;
41728+
41729+ if (len == 1)
41730+ a = name[0] - 48;
41731+ else
41732+ a = (name[0] - 48) * pow;
41733+
41734+ for (i = 1; i < len; ++i) {
41735+ c = name[i] - 48;
41736+ for (pow = 1, j = i; j < len - 1; ++j)
41737+ pow = pow * 10;
41738+ a = a + c * pow;
41739+ }
41740+ for (; i < 40; ++i) {
41741+ c = '0' - 48;
41742+ for (pow = 1, j = i; j < len - 1; ++j)
41743+ pow = pow * 10;
41744+ a = a + c * pow;
41745+ }
41746+
41747+ for (; i < 256; ++i) {
41748+ c = i;
41749+ for (pow = 1, j = i; j < len - 1; ++j)
41750+ pow = pow * 10;
41751+ a = a + c * pow;
41752+ }
41753+
41754+ a = a << 7;
41755+ return a;
41756+}
41757+
41758+/* r5 hash */
41759+static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
41760+ int len UNUSED_ARG /* @name's length */ )
41761+{
41762+ __u64 a = 0;
41763+
41764+ assert("nikita-674", name != NULL);
41765+ assert("nikita-675", len >= 0);
41766+
41767+ while (*name) {
41768+ a += *name << 4;
41769+ a += *name >> 4;
41770+ a *= 11;
41771+ name++;
41772+ }
41773+ return a;
41774+}
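
hash_r5() has no kernel dependencies beyond the assertions, so it can be exercised verbatim in userland; the program below is a direct copy of the loop:

#include <stdio.h>
#include <stdint.h>

/* Standalone copy of the r5 hash above. */
static uint64_t r5(const unsigned char *name)
{
	uint64_t a = 0;

	while (*name) {
		a += *name << 4;
		a += *name >> 4;
		a *= 11;
		name++;
	}
	return a;
}

int main(void)
{
	printf("r5(\"hello\") = %#llx\n",
	       (unsigned long long)r5((const unsigned char *)"hello"));
	return 0;
}
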
41775+
41776+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
41777+ H0 = Key
41778+ Hi = E Mi(Hi-1) + Hi-1
41779+
41780+ (see Applied Cryptography, 2nd edition, p448).
41781+
41782+ Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
41783+
41784+ Jeremy has agreed to the contents of reiserfs/README. -Hans
41785+
41786+ This code was blindly upgraded to __u64 by s/__u32/__u64/g.
41787+*/
41788+static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
41789+ int len /* @name's length */ )
41790+{
41791+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
41792+
41793+ __u64 h0 = k[0], h1 = k[1];
41794+ __u64 a, b, c, d;
41795+ __u64 pad;
41796+ int i;
41797+
41798+ assert("nikita-676", name != NULL);
41799+ assert("nikita-677", len >= 0);
41800+
41801+#define DELTA 0x9E3779B9u
41802+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
41803+#define PARTROUNDS 6 /* 6 gets complete mixing */
41804+
41805+/* a, b, c, d - data; h0, h1 - accumulated hash */
41806+#define TEACORE(rounds) \
41807+ do { \
41808+ __u64 sum = 0; \
41809+ int n = rounds; \
41810+ __u64 b0, b1; \
41811+ \
41812+ b0 = h0; \
41813+ b1 = h1; \
41814+ \
41815+ do \
41816+ { \
41817+ sum += DELTA; \
41818+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
41819+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
41820+ } while(--n); \
41821+ \
41822+ h0 += b0; \
41823+ h1 += b1; \
41824+ } while(0)
41825+
41826+ pad = (__u64) len | ((__u64) len << 8);
41827+ pad |= pad << 16;
41828+
41829+ while (len >= 16) {
41830+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41831+ 16 | (__u64) name[3] << 24;
41832+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41833+ 16 | (__u64) name[7] << 24;
41834+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41835+ 16 | (__u64) name[11] << 24;
41836+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
41837+ << 16 | (__u64) name[15] << 24;
41838+
41839+ TEACORE(PARTROUNDS);
41840+
41841+ len -= 16;
41842+ name += 16;
41843+ }
41844+
41845+ if (len >= 12) {
41846+ //assert(len < 16);
41847+ if (len >= 16)
41848+ *(int *)0 = 0;
41849+
41850+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41851+ 16 | (__u64) name[3] << 24;
41852+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41853+ 16 | (__u64) name[7] << 24;
41854+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41855+ 16 | (__u64) name[11] << 24;
41856+
41857+ d = pad;
41858+ for (i = 12; i < len; i++) {
41859+ d <<= 8;
41860+ d |= name[i];
41861+ }
41862+ } else if (len >= 8) {
41863+ //assert(len < 12);
41864+ if (len >= 12)
41865+ *(int *)0 = 0;
41866+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41867+ 16 | (__u64) name[3] << 24;
41868+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41869+ 16 | (__u64) name[7] << 24;
41870+
41871+ c = d = pad;
41872+ for (i = 8; i < len; i++) {
41873+ c <<= 8;
41874+ c |= name[i];
41875+ }
41876+ } else if (len >= 4) {
41877+ //assert(len < 8);
41878+ if (len >= 8)
41879+ *(int *)0 = 0;
41880+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41881+ 16 | (__u64) name[3] << 24;
41882+
41883+ b = c = d = pad;
41884+ for (i = 4; i < len; i++) {
41885+ b <<= 8;
41886+ b |= name[i];
41887+ }
41888+ } else {
41889+ //assert(len < 4);
41890+ if (len >= 4)
41891+ *(int *)0 = 0;
41892+ a = b = c = d = pad;
41893+ for (i = 0; i < len; i++) {
41894+ a <<= 8;
41895+ a |= name[i];
41896+ }
41897+ }
41898+
41899+ TEACORE(FULLROUNDS);
41900+
41901+/* return 0;*/
41902+ return h0 ^ h1;
41903+
41904+}
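
The TEACORE macro above is equally self-contained. The sketch below rewrites it as a function and runs one PARTROUNDS pass over an arbitrary block, using the same key constants; the input words are arbitrary and only illustrate the mixing:

#include <stdio.h>
#include <stdint.h>

#define DELTA 0x9E3779B9u

/* Function form of the TEACORE macro above. */
static void teacore(uint64_t *h0, uint64_t *h1,
		    uint64_t a, uint64_t b, uint64_t c, uint64_t d,
		    int rounds)
{
	uint64_t sum = 0, b0 = *h0, b1 = *h1;

	while (rounds--) {
		sum += DELTA;
		b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
		b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
	}
	*h0 += b0;
	*h1 += b1;
}

int main(void)
{
	uint64_t h0 = 0x9464a485u, h1 = 0x542e1a94u;	/* k[0], k[1] */

	teacore(&h0, &h1, 1, 2, 3, 4, 6);		/* PARTROUNDS == 6 */
	printf("h0^h1 = %#llx\n", (unsigned long long)(h0 ^ h1));
	return 0;
}
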
41905+
41906+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
41907+
41908+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
41909+
41910+ Excerpts:
41911+
41912+ FNV hashes are designed to be fast while maintaining a low collision
41913+ rate.
41914+
41915+ [This version also seems to preserve lexicographical order locally.]
41916+
41917+ FNV hash algorithms and source code have been released into the public
41918+ domain.
41919+
41920+*/
41921+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
41922+ int len UNUSED_ARG /* @name's length */ )
41923+{
41924+ unsigned long long a = 0xcbf29ce484222325ull;
41925+ const unsigned long long fnv_64_prime = 0x100000001b3ull;
41926+
41927+ assert("nikita-678", name != NULL);
41928+ assert("nikita-679", len >= 0);
41929+
41930+ /* FNV-1 hash each octet in the buffer */
41931+ for (; *name; ++name) {
41932+ /* multiply by the 32 bit FNV magic prime mod 2^64 */
41933+ a *= fnv_64_prime;
41934+ /* xor the bottom with the current octet */
41935+ a ^= (unsigned long long)(*name);
41936+ }
41937+ /* return our new hash value */
41938+ return a;
41939+}
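
The same loop as a standalone program: FNV-1 multiplies the running hash by the 64-bit FNV prime and then xors in the next octet, starting from the offset basis (so the hash of the empty string is the basis itself):

#include <stdio.h>
#include <stdint.h>

/* Standalone 64-bit FNV-1 as in hash_fnv1 above. */
static uint64_t fnv1_64(const unsigned char *name)
{
	uint64_t a = 0xcbf29ce484222325ull;		/* offset basis */
	const uint64_t prime = 0x100000001b3ull;	/* FNV 64-bit prime */

	for (; *name; ++name) {
		a *= prime;
		a ^= (uint64_t)*name;
	}
	return a;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)fnv1_64((const unsigned char *)""));
	printf("%#llx\n", (unsigned long long)fnv1_64((const unsigned char *)"a"));
	return 0;
}
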
41940+
41941+/* degenerate hash function used to simplify testing of non-unique key
41942+ handling */
41943+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
41944+ int len UNUSED_ARG /* @name's length */ )
41945+{
41946+ return 0xc0c0c0c010101010ull;
41947+}
41948+
41949+static int change_hash(struct inode *inode,
41950+ reiser4_plugin * plugin,
41951+ pset_member memb)
41952+{
41953+ int result;
41954+
41955+ assert("nikita-3503", inode != NULL);
41956+ assert("nikita-3504", plugin != NULL);
41957+
41958+ assert("nikita-3505", is_reiser4_inode(inode));
41959+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
41960+
41961+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
41962+ return RETERR(-EINVAL);
41963+
41964+ result = 0;
41965+ if (inode_hash_plugin(inode) == NULL ||
41966+ inode_hash_plugin(inode)->h.id != plugin->h.id) {
41967+ if (is_dir_empty(inode) == 0)
41968+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
41969+ PSET_HASH, plugin);
41970+ else
41971+ result = RETERR(-ENOTEMPTY);
41972+
41973+ }
41974+ return result;
41975+}
41976+
41977+static reiser4_plugin_ops hash_plugin_ops = {
41978+ .init = NULL,
41979+ .load = NULL,
41980+ .save_len = NULL,
41981+ .save = NULL,
41982+ .change = change_hash
41983+};
41984+
41985+/* hash plugins */
41986+hash_plugin hash_plugins[LAST_HASH_ID] = {
41987+ [RUPASOV_HASH_ID] = {
41988+ .h = {
41989+ .type_id = REISER4_HASH_PLUGIN_TYPE,
41990+ .id = RUPASOV_HASH_ID,
41991+ .pops = &hash_plugin_ops,
41992+ .label = "rupasov",
41993+ .desc = "Original Yura's hash",
41994+ .linkage = {NULL, NULL}
41995+ },
41996+ .hash = hash_rupasov
41997+ },
41998+ [R5_HASH_ID] = {
41999+ .h = {
42000+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42001+ .id = R5_HASH_ID,
42002+ .pops = &hash_plugin_ops,
42003+ .label = "r5",
42004+ .desc = "r5 hash",
42005+ .linkage = {NULL, NULL}
42006+ },
42007+ .hash = hash_r5
42008+ },
42009+ [TEA_HASH_ID] = {
42010+ .h = {
42011+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42012+ .id = TEA_HASH_ID,
42013+ .pops = &hash_plugin_ops,
42014+ .label = "tea",
42015+ .desc = "tea hash",
42016+ .linkage = {NULL, NULL}
42017+ },
42018+ .hash = hash_tea
42019+ },
42020+ [FNV1_HASH_ID] = {
42021+ .h = {
42022+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42023+ .id = FNV1_HASH_ID,
42024+ .pops = &hash_plugin_ops,
42025+ .label = "fnv1",
42026+ .desc = "fnv1 hash",
42027+ .linkage = {NULL, NULL}
42028+ },
42029+ .hash = hash_fnv1
42030+ },
42031+ [DEGENERATE_HASH_ID] = {
42032+ .h = {
42033+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42034+ .id = DEGENERATE_HASH_ID,
42035+ .pops = &hash_plugin_ops,
42036+ .label = "degenerate hash",
42037+ .desc = "Degenerate hash: only for testing",
42038+ .linkage = {NULL, NULL}
42039+ },
42040+ .hash = hash_deg
42041+ }
42042+};
42043+
42044+/* Make Linus happy.
42045+ Local variables:
42046+ c-indentation-style: "K&R"
42047+ mode-name: "LC"
42048+ c-basic-offset: 8
42049+ tab-width: 8
42050+ fill-column: 120
42051+ End:
42052+*/
42053diff -urN linux-2.6.22.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.22/fs/reiser4/plugin/inode_ops.c
42054--- linux-2.6.22.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 03:00:00.000000000 +0300
42055+++ linux-2.6.22/fs/reiser4/plugin/inode_ops.c 2007-07-29 00:25:34.936712007 +0400
42056@@ -0,0 +1,897 @@
42057+/*
42058+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42059+ */
42060+
42061+/*
42062+ * this file contains typical implementations for most of methods of struct
42063+ * inode_operations
42064+ */
42065+
42066+#include "../inode.h"
42067+#include "../safe_link.h"
42068+
42069+#include <linux/quotaops.h>
42070+#include <linux/namei.h>
42071+
42072+static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42073+ reiser4_object_create_data *data);
42074+
42075+/**
42076+ * reiser4_create_common - create of inode operations
42077+ * @parent: inode of parent directory
42078+ * @dentry: dentry of new object to create
42079+ * @mode: the permissions to use
42080+ * @nameidata:
42081+ *
42082+ * This is common implementation of vfs's create method of struct
42083+ * inode_operations.
42084+ * Creates regular file using file plugin from parent directory plugin set.
42085+ */
42086+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
42087+ int mode, struct nameidata *nameidata)
42088+{
42089+ reiser4_object_create_data data;
42090+	file_plugin *fplug;
42091+
42092+ memset(&data, 0, sizeof data);
42093+ data.mode = S_IFREG | mode;
42094+ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
42095+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
42096+ warning("vpf-1900", "'%s' is not a regular file plugin.",
42097+ fplug->h.label);
42098+ return RETERR(-EIO);
42099+ }
42100+ data.id = fplug->h.id;
42101+ return create_vfs_object(parent, dentry, &data);
42102+}
42103+
42104+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42105+void check_light_weight(struct inode *inode, struct inode *parent);
42106+
42107+/**
42108+ * reiser4_lookup_common - lookup of inode operations
42109+ * @parent: inode of directory to lookup into
42110+ * @dentry: name to look for
42111+ * @nameidata:
42112+ *
42113+ * This is common implementation of vfs's lookup method of struct
42114+ * inode_operations.
42115+ */
42116+struct dentry *reiser4_lookup_common(struct inode *parent,
42117+ struct dentry *dentry,
42118+ struct nameidata *nameidata)
42119+{
42120+ reiser4_context *ctx;
42121+ int result;
42122+ struct dentry *new;
42123+ struct inode *inode;
42124+ reiser4_dir_entry_desc entry;
42125+
42126+	ctx = reiser4_init_context(parent->i_sb);
42127+ if (IS_ERR(ctx))
42128+ return (struct dentry *)ctx;
42129+
42130+ /* set up operations on dentry. */
42131+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42132+
42133+	result = reiser4_lookup_name(parent, dentry, &entry.key);
42134+ if (result) {
42135+ context_set_commit_async(ctx);
42136+ reiser4_exit_context(ctx);
42137+ if (result == -ENOENT) {
42138+ /* object not found */
42139+ if (!IS_DEADDIR(parent))
42140+ d_add(dentry, NULL);
42141+ return NULL;
42142+ }
42143+ return ERR_PTR(result);
42144+ }
42145+
42146+ inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42147+ if (IS_ERR(inode)) {
42148+ context_set_commit_async(ctx);
42149+ reiser4_exit_context(ctx);
42150+ return ERR_PTR(PTR_ERR(inode));
42151+ }
42152+
42153+ /* success */
42154+ check_light_weight(inode, parent);
42155+ new = d_splice_alias(inode, dentry);
42156+ reiser4_iget_complete(inode);
42157+
42158+ /* prevent balance_dirty_pages() from being called: we don't want to
42159+ * do this under directory i_mutex. */
42160+ context_set_commit_async(ctx);
42161+ reiser4_exit_context(ctx);
42162+ return new;
42163+}
42164+
42165+static reiser4_block_nr common_estimate_link(struct inode *parent,
42166+ struct inode *object);
42167+int reiser4_update_dir(struct inode *);
42168+
42169+/**
42170+ * reiser4_link_common - link of inode operations
42171+ * @existing: dentry of object which is to get new name
42172+ * @parent: directory where new name is to be created
42173+ * @newname: new name
42174+ *
42175+ * This is common implementation of vfs's link method of struct
42176+ * inode_operations.
42177+ */
42178+int reiser4_link_common(struct dentry *existing, struct inode *parent,
42179+ struct dentry *newname)
42180+{
42181+ reiser4_context *ctx;
42182+ int result;
42183+ struct inode *object;
42184+ dir_plugin *parent_dplug;
42185+ reiser4_dir_entry_desc entry;
42186+ reiser4_object_create_data data;
42187+ reiser4_block_nr reserve;
42188+
42189+	ctx = reiser4_init_context(parent->i_sb);
42190+ if (IS_ERR(ctx))
42191+ return PTR_ERR(ctx);
42192+
42193+ assert("nikita-1431", existing != NULL);
42194+ assert("nikita-1432", parent != NULL);
42195+ assert("nikita-1433", newname != NULL);
42196+
42197+ object = existing->d_inode;
42198+ assert("nikita-1434", object != NULL);
42199+
42200+ /* check for race with create_object() */
42201+	if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
42202+ context_set_commit_async(ctx);
42203+ reiser4_exit_context(ctx);
42204+ return RETERR(-E_REPEAT);
42205+ }
42206+
42207+ parent_dplug = inode_dir_plugin(parent);
42208+
42209+ memset(&entry, 0, sizeof entry);
42210+ entry.obj = object;
42211+
42212+ data.mode = object->i_mode;
42213+ data.id = inode_file_plugin(object)->h.id;
42214+
42215+ reserve = common_estimate_link(parent, existing->d_inode);
42216+ if ((__s64) reserve < 0) {
42217+ context_set_commit_async(ctx);
42218+ reiser4_exit_context(ctx);
42219+ return reserve;
42220+ }
42221+
42222+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42223+ context_set_commit_async(ctx);
42224+ reiser4_exit_context(ctx);
42225+ return RETERR(-ENOSPC);
42226+ }
42227+
42228+ /*
42229+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
42230+ * means that link(2) can race against unlink(2) or rename(2), and
42231+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
42232+ *
42233+ * For such inode we have to undo special processing done in
42234+ * reiser4_unlink() viz. creation of safe-link.
42235+ */
42236+ if (unlikely(object->i_nlink == 0)) {
42237+		result = safe_link_del(reiser4_tree_by_inode(object),
42238+ get_inode_oid(object), SAFE_UNLINK);
42239+ if (result != 0) {
42240+ context_set_commit_async(ctx);
42241+ reiser4_exit_context(ctx);
42242+ return result;
42243+ }
42244+ }
42245+
42246+ /* increment nlink of @existing and update its stat data */
42247+ result = reiser4_add_nlink(object, parent, 1);
42248+ if (result == 0) {
42249+ /* add entry to the parent */
42250+ result =
42251+ parent_dplug->add_entry(parent, newname, &data, &entry);
42252+ if (result != 0) {
42253+ /* failed to add entry to the parent, decrement nlink
42254+ of @existing */
42255+ reiser4_del_nlink(object, parent, 1);
42256+ /*
42257+ * now, if that failed, we have a file with too big
42258+ * nlink---space leak, much better than directory
42259+ * entry pointing to nowhere
42260+ */
42261+ }
42262+ }
42263+ if (result == 0) {
42264+ atomic_inc(&object->i_count);
42265+ /*
42266+ * Upon successful completion, link() shall mark for update
42267+ * the st_ctime field of the file. Also, the st_ctime and
42268+ * st_mtime fields of the directory that contains the new
42269+ * entry shall be marked for update. --SUS
42270+ */
42271+ result = reiser4_update_dir(parent);
42272+ }
42273+ if (result == 0)
42274+ d_instantiate(newname, existing->d_inode);
42275+
42276+ context_set_commit_async(ctx);
42277+ reiser4_exit_context(ctx);
42278+ return result;
42279+}
42280+
42281+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
42282+
42283+/**
42284+ * reiser4_unlink_common - unlink of inode operations
42285+ * @parent: inode of directory to remove name from
42286+ * @victim: name to be removed
42287+ *
42288+ * This is common implementation of vfs's unlink method of struct
42289+ * inode_operations.
42290+ */
42291+int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
42292+{
42293+ reiser4_context *ctx;
42294+ int result;
42295+ struct inode *object;
42296+ file_plugin *fplug;
42297+
42298+	ctx = reiser4_init_context(parent->i_sb);
42299+ if (IS_ERR(ctx))
42300+ return PTR_ERR(ctx);
42301+
42302+ object = victim->d_inode;
42303+ fplug = inode_file_plugin(object);
42304+ assert("nikita-2882", fplug->detach != NULL);
42305+
42306+ result = unlink_check_and_grab(parent, victim);
42307+ if (result != 0) {
42308+ context_set_commit_async(ctx);
42309+ reiser4_exit_context(ctx);
42310+ return result;
42311+ }
42312+
42313+ result = fplug->detach(object, parent);
42314+ if (result == 0) {
42315+ dir_plugin *parent_dplug;
42316+ reiser4_dir_entry_desc entry;
42317+
42318+ parent_dplug = inode_dir_plugin(parent);
42319+ memset(&entry, 0, sizeof entry);
42320+
42321+ /* first, delete directory entry */
42322+ result = parent_dplug->rem_entry(parent, victim, &entry);
42323+ if (result == 0) {
42324+ /*
42325+ * if name was removed successfully, we _have_ to
42326+ * return 0 from this function, because upper level
42327+ * caller (vfs_{rmdir,unlink}) expect this.
42328+ *
42329+ * now that directory entry is removed, update
42330+ * stat-data
42331+ */
42332+ reiser4_del_nlink(object, parent, 1);
42333+ /*
42334+ * Upon successful completion, unlink() shall mark for
42335+ * update the st_ctime and st_mtime fields of the
42336+ * parent directory. Also, if the file's link count is
42337+ * not 0, the st_ctime field of the file shall be
42338+ * marked for update. --SUS
42339+ */
42340+ reiser4_update_dir(parent);
42341+ /* add safe-link for this file */
42342+ if (object->i_nlink == 0)
42343+ safe_link_add(object, SAFE_UNLINK);
42344+ }
42345+ }
42346+
42347+ if (unlikely(result != 0)) {
42348+ if (result != -ENOMEM)
42349+ warning("nikita-3398", "Cannot unlink %llu (%i)",
42350+ (unsigned long long)get_inode_oid(object),
42351+ result);
42352+ /* if operation failed commit pending inode modifications to
42353+ * the stat-data */
42354+ reiser4_update_sd(object);
42355+ reiser4_update_sd(parent);
42356+ }
42357+
42358+ reiser4_release_reserved(object->i_sb);
42359+
42360+ /* @object's i_ctime was updated by ->rem_link() method(). */
42361+
42362+ /* @victim can be already removed from the disk by this time. Inode is
42363+ then marked so that iput() wouldn't try to remove stat data. But
42364+ inode itself is still there.
42365+ */
42366+
42367+ /*
42368+ * we cannot release directory semaphore here, because name has
42369+ * already been deleted, but dentry (@victim) still exists. Prevent
42370+ * balance_dirty_pages() from being called on exiting this context: we
42371+ * don't want to do this under directory i_mutex.
42372+ */
42373+ context_set_commit_async(ctx);
42374+ reiser4_exit_context(ctx);
42375+ return result;
42376+}
42377+
42378+/**
42379+ * reiser4_symlink_common - symlink of inode operations
42380+ * @parent: inode of parent directory
42381+ * @dentry: dentry of object to be created
42382+ * @linkname: string symlink is to contain
42383+ *
42384+ * This is common implementation of vfs's symlink method of struct
42385+ * inode_operations.
42386+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
42387+ */
42388+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
42389+ const char *linkname)
42390+{
42391+ reiser4_object_create_data data;
42392+
42393+ memset(&data, 0, sizeof data);
42394+ data.name = linkname;
42395+ data.id = SYMLINK_FILE_PLUGIN_ID;
42396+ data.mode = S_IFLNK | S_IRWXUGO;
42397+ return create_vfs_object(parent, dentry, &data);
42398+}
42399+
42400+/**
42401+ * reiser4_mkdir_common - mkdir of inode operations
42402+ * @parent: inode of parent directory
42403+ * @dentry: dentry of object to be created
42404+ * @mode: the permissions to use
42405+ *
42406+ * This is common implementation of vfs's mkdir method of struct
42407+ * inode_operations.
42408+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
42409+ */
42410+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
42411+{
42412+ reiser4_object_create_data data;
42413+
42414+ memset(&data, 0, sizeof data);
42415+ data.mode = S_IFDIR | mode;
42416+ data.id = DIRECTORY_FILE_PLUGIN_ID;
42417+ return create_vfs_object(parent, dentry, &data);
42418+}
42419+
42420+/**
42421+ * reiser4_mknod_common - mknod of inode operations
42422+ * @parent: inode of parent directory
42423+ * @dentry: dentry of object to be created
42424+ * @mode: the permissions to use and file type
42425+ * @rdev: minor and major of new device file
42426+ *
42427+ * This is common implementation of vfs's mknod method of struct
42428+ * inode_operations.
42429+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
42430+ */
42431+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
42432+ int mode, dev_t rdev)
42433+{
42434+ reiser4_object_create_data data;
42435+
42436+ memset(&data, 0, sizeof data);
42437+ data.mode = mode;
42438+ data.rdev = rdev;
42439+ data.id = SPECIAL_FILE_PLUGIN_ID;
42440+ return create_vfs_object(parent, dentry, &data);
42441+}
42442+
42443+/*
42444+ * implementation of vfs's rename method of struct inode_operations for typical
42445+ * directory is in inode_ops_rename.c
42446+ */
42447+
42448+/**
42449+ * reiser4_follow_link_common - follow_link of inode operations
42450+ * @dentry: dentry of symlink
42451+ * @data:
42452+ *
42453+ * This is common implementation of vfs's followlink method of struct
42454+ * inode_operations.
42455+ * Assumes that inode's i_private points to the content of symbolic link.
42456+ */
42457+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
42458+{
42459+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
42460+
71430cf6
MT
42461+ if (!dentry->d_inode->i_private
42462+ || !reiser4_inode_get_flag(dentry->d_inode,
42463+ REISER4_GENERIC_PTR_USED))
42464+ return ERR_PTR(RETERR(-EINVAL));
42465+ nd_set_link(nd, dentry->d_inode->i_private);
42466+ return NULL;
42467+}
42468+
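
The NULL return value above matters under the 2.6.22 follow_link()/put_link()
contract: whatever ->follow_link() returns is later handed back to
->put_link() as a cookie. A minimal sketch (hypothetical, not part of the
patch) of a paired put_link for a plugin that duplicated the link body
instead of keeping it in ->i_private:

	static void example_put_link(struct dentry *dentry,
				     struct nameidata *nd, void *cookie)
	{
		/* reiser4_follow_link_common() returns NULL, so no cleanup
		 * is needed; a plugin that kmalloc'ed the link body in its
		 * follow_link() would free it here (kfree(NULL) is safe) */
		kfree(cookie);
	}
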
42469+/**
42470+ * reiser4_permission_common - permission of inode operations
42471+ * @inode: inode to check permissions for
42472+ * @mask: mode bits to check permissions for
42473+ * @nameidata: unused
42474+ *
42475+ * Uses generic function to check for rwx permissions.
42476+ */
42477+int reiser4_permission_common(struct inode *inode, int mask,
42478+ struct nameidata *nameidata)
42479+{
42480+ return generic_permission(inode, mask, NULL);
42481+}
42482+
42483+static int setattr_reserve(reiser4_tree *);
42484+
42485+/* this is common implementation of vfs's setattr method of struct
42486+ inode_operations
42487+*/
42488+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
42489+{
42490+ reiser4_context *ctx;
42491+ struct inode *inode;
42492+ int result;
42493+
42494+ inode = dentry->d_inode;
42495+ result = inode_change_ok(inode, attr);
42496+ if (result)
42497+ return result;
42498+
42499+ ctx = reiser4_init_context(inode->i_sb);
42500+ if (IS_ERR(ctx))
42501+ return PTR_ERR(ctx);
42502+
42503+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
42504+
42505+ /*
42506+ * grab disk space and call standard inode_setattr().
42507+ */
42508+ result = setattr_reserve(reiser4_tree_by_inode(inode));
42509+ if (!result) {
42510+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
42511+ || (attr->ia_valid & ATTR_GID
42512+ && attr->ia_gid != inode->i_gid)) {
42513+ result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
42514+ if (result) {
42515+ context_set_commit_async(ctx);
42516+ reiser4_exit_context(ctx);
42517+ return result;
42518+ }
42519+ }
42520+ result = inode_setattr(inode, attr);
42521+ if (!result)
42522+ reiser4_update_sd(inode);
42523+ }
42524+
42525+ context_set_commit_async(ctx);
42526+ reiser4_exit_context(ctx);
42527+ return result;
42528+}
42529+
42530+/* this is common implementation of vfs's getattr method of struct
42531+ inode_operations
42532+*/
42533+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
42534+ struct dentry *dentry, struct kstat *stat)
42535+{
42536+ struct inode *obj;
42537+
42538+ assert("nikita-2298", dentry != NULL);
42539+ assert("nikita-2299", stat != NULL);
42540+ assert("nikita-2300", dentry->d_inode != NULL);
42541+
42542+ obj = dentry->d_inode;
42543+
42544+ stat->dev = obj->i_sb->s_dev;
42545+ stat->ino = oid_to_uino(get_inode_oid(obj));
42546+ stat->mode = obj->i_mode;
42547+ /* don't confuse userland with huge nlink. This is not entirely
42548+ * correct, because nlink_t is not necessarily a 16 bit signed type. */
42549+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
42550+ stat->uid = obj->i_uid;
42551+ stat->gid = obj->i_gid;
42552+ stat->rdev = obj->i_rdev;
42553+ stat->atime = obj->i_atime;
42554+ stat->mtime = obj->i_mtime;
42555+ stat->ctime = obj->i_ctime;
42556+ stat->size = obj->i_size;
42557+ stat->blocks =
42558+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
42559+ /* "preferred" blocksize for efficient file system I/O */
42560+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
42561+
42562+ return 0;
42563+}
42564+
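
Taken together, these *_common methods are meant to be slotted into an
inode_operations table. A minimal sketch of such a table under the 2.6.22
prototypes (the table name and member selection here are assumed for
illustration; the actual tables are defined elsewhere in the patch):

	static const struct inode_operations reiser4_iops_sketch = {
		.symlink    = reiser4_symlink_common,
		.mkdir      = reiser4_mkdir_common,
		.mknod      = reiser4_mknod_common,
		.permission = reiser4_permission_common,
		.setattr    = reiser4_setattr_common,
		.getattr    = reiser4_getattr_common,
	};
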
42565+/* Estimate the maximum amount of nodes which might be allocated or changed on
42566+ typical new object creation. Typical creation consists of calling create
42567+ method of file plugin, adding directory entry to parent and update parent
42568+ directory's stat data.
42569+*/
42570+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
42571+ struct inode *object
42572+ /* object */ )
42573+{
42574+ assert("vpf-309", parent != NULL);
42575+ assert("vpf-307", object != NULL);
42576+
42577+ return
42578+ /* object creation estimation */
42579+ inode_file_plugin(object)->estimate.create(object) +
42580+ /* stat data of parent directory estimation */
42581+ inode_file_plugin(parent)->estimate.update(parent) +
42582+ /* adding entry estimation */
42583+ inode_dir_plugin(parent)->estimate.add_entry(parent) +
42584+ /* to undo in the case of failure */
42585+ inode_dir_plugin(parent)->estimate.rem_entry(parent);
42586+}
42587+
42588+/* Create child in directory.
42589+
42590+ . get object's plugin
42591+ . get fresh inode
42592+ . initialize inode
42593+ . add object's stat-data
42594+ . initialize object's directory
42595+ . add entry to the parent
42596+ . instantiate dentry
42597+
42598+*/
42599+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
42600+ object */
42601+ struct inode **retobj)
42602+{
42603+ int result;
42604+
42605+ struct dentry *dentry; /* new name */
42606+ struct inode *parent; /* parent object */
42607+
42608+ dir_plugin *par_dir; /* directory plugin on the parent */
42609+ dir_plugin *obj_dir; /* directory plugin on the new object */
42610+ file_plugin *obj_plug; /* object plugin on the new object */
42611+ struct inode *object; /* new object */
42612+ reiser4_block_nr reserve;
42613+
42614+ reiser4_dir_entry_desc entry; /* new directory entry */
42615+
42616+ assert("nikita-1420", data != NULL);
42617+ parent = data->parent;
42618+ dentry = data->dentry;
42619+
42620+ assert("nikita-1418", parent != NULL);
42621+ assert("nikita-1419", dentry != NULL);
42622+
42623+ /* check, that name is acceptable for parent */
42624+ par_dir = inode_dir_plugin(parent);
42625+ if (par_dir->is_name_acceptable &&
42626+ !par_dir->is_name_acceptable(parent,
42627+ dentry->d_name.name,
42628+ (int)dentry->d_name.len))
42629+ return RETERR(-ENAMETOOLONG);
42630+
42631+ result = 0;
42632+ obj_plug = file_plugin_by_id((int)data->id);
42633+ if (obj_plug == NULL) {
42634+ warning("nikita-430", "Cannot find plugin %i", data->id);
42635+ return RETERR(-ENOENT);
42636+ }
42637+ object = new_inode(parent->i_sb);
42638+ if (object == NULL)
42639+ return RETERR(-ENOMEM);
42640+ /* we'll update i_nlink below */
42641+ object->i_nlink = 0;
42642+ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
42643+ * to simplify error handling: if some error occurs before i_ino is
42644+ * initialized with oid, i_ino should already be set to some
42645+ * distinguished value. */
42646+ object->i_ino = 0;
42647+
42648+ /* So that on error iput will be called. */
42649+ *retobj = object;
42650+
42651+ if (DQUOT_ALLOC_INODE(object)) {
42652+ DQUOT_DROP(object);
42653+ object->i_flags |= S_NOQUOTA;
42654+ return RETERR(-EDQUOT);
42655+ }
42656+
42657+ memset(&entry, 0, sizeof entry);
42658+ entry.obj = object;
42659+
42660+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
42661+ file_plugin_to_plugin(obj_plug));
42662+ result = obj_plug->set_plug_in_inode(object, parent, data);
42663+ if (result) {
42664+ warning("nikita-431", "Cannot install plugin %i on %llx",
42665+ data->id, (unsigned long long)get_inode_oid(object));
42666+ DQUOT_FREE_INODE(object);
42667+ object->i_flags |= S_NOQUOTA;
42668+ return result;
42669+ }
42670+
42671+ /* reget plugin after installation */
42672+ obj_plug = inode_file_plugin(object);
42673+
42674+ if (obj_plug->create_object == NULL) {
42675+ DQUOT_FREE_INODE(object);
42676+ object->i_flags |= S_NOQUOTA;
42677+ return RETERR(-EPERM);
42678+ }
42679+
42680+ /* if any of hash, tail, sd or permission plugins for newly created
42681+ object are not set yet set them here inheriting them from parent
42682+ directory
42683+ */
42684+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
42685+ result = obj_plug->adjust_to_parent(object,
42686+ parent,
42687+ object->i_sb->s_root->d_inode);
42688+ if (result == 0)
42689+ result = finish_pset(object);
42690+ if (result != 0) {
42691+ warning("nikita-432", "Cannot inherit from %llx to %llx",
42692+ (unsigned long long)get_inode_oid(parent),
42693+ (unsigned long long)get_inode_oid(object));
42694+ DQUOT_FREE_INODE(object);
42695+ object->i_flags |= S_NOQUOTA;
42696+ return result;
42697+ }
42698+
42699+ /* setup inode and file-operations for this inode */
42700+ setup_inode_ops(object, data);
42701+
42702+ /* call file plugin's method to initialize plugin specific part of
42703+ * inode */
42704+ if (obj_plug->init_inode_data)
42705+ obj_plug->init_inode_data(object, data, 1 /*create */ );
42706+
42707+ /* obtain directory plugin (if any) for new object. */
42708+ obj_dir = inode_dir_plugin(object);
42709+ if (obj_dir != NULL && obj_dir->init == NULL) {
42710+ DQUOT_FREE_INODE(object);
42711+ object->i_flags |= S_NOQUOTA;
42712+ return RETERR(-EPERM);
42713+ }
42714+
42715+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
42716+
42717+ reserve = estimate_create_vfs_object(parent, object);
42718+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42719+ DQUOT_FREE_INODE(object);
42720+ object->i_flags |= S_NOQUOTA;
42721+ return RETERR(-ENOSPC);
42722+ }
42723+
42724+ /* mark inode `immutable'. We disable changes to the file being
42725+ created until a valid directory entry for it is inserted. Otherwise,
42726+ if the file were expanded and insertion of the directory entry failed,
42727+ we would have to remove the file, but we only allotted enough space
42728+ in the transaction to remove an _empty_ file. 3.x code used to remove
42729+ stat data in a different transaction, thus possibly leaking disk space
42730+ on crash. This all only matters if it's possible to access a file
42731+ without a name, for example, by inode number
42732+ */
42733+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
42734+
42735+ /* create empty object, this includes allocation of new objectid. For
42736+ directories this implies creation of dot and dotdot */
42737+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
42738+
42739+ /* mark inode as `loaded'. From this point onward
42740+ reiser4_delete_inode() will try to remove its stat-data. */
42741+ reiser4_inode_set_flag(object, REISER4_LOADED);
42742+
42743+ result = obj_plug->create_object(object, parent, data);
42744+ if (result != 0) {
42745+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
42746+ if (result != -ENAMETOOLONG && result != -ENOMEM)
42747+ warning("nikita-2219",
42748+ "Failed to create sd for %llu",
42749+ (unsigned long long)get_inode_oid(object));
42750+ DQUOT_FREE_INODE(object);
42751+ object->i_flags |= S_NOQUOTA;
42752+ return result;
42753+ }
42754+
42755+ if (obj_dir != NULL)
42756+ result = obj_dir->init(object, parent, data);
42757+ if (result == 0) {
42758+ assert("nikita-434", !reiser4_inode_get_flag(object,
42759+ REISER4_NO_SD));
42760+ /* insert inode into VFS hash table */
42761+ insert_inode_hash(object);
42762+ /* create entry */
42763+ result = par_dir->add_entry(parent, dentry, data, &entry);
42764+ if (result == 0) {
42765+ result = reiser4_add_nlink(object, parent, 0);
42766+ /* If O_CREAT is set and the file did not previously
42767+ exist, upon successful completion, open() shall
42768+ mark for update the st_atime, st_ctime, and
42769+ st_mtime fields of the file and the st_ctime and
42770+ st_mtime fields of the parent directory. --SUS
42771+ */
42772+ /* @object times are already updated by
42773+ reiser4_add_nlink() */
42774+ if (result == 0)
42775+ reiser4_update_dir(parent);
42776+ if (result != 0)
42777+ /* cleanup failure to add nlink */
42778+ par_dir->rem_entry(parent, dentry, &entry);
42779+ }
42780+ if (result != 0)
42781+ /* cleanup failure to add entry */
42782+ obj_plug->detach(object, parent);
42783+ } else if (result != -ENOMEM)
42784+ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
42785+ (unsigned long long)get_inode_oid(object), result);
42786+
42787+ /*
42788+ * update stat-data, committing all pending modifications to the inode
42789+ * fields.
42790+ */
42791+ reiser4_update_sd(object);
42792+ if (result != 0) {
42793+ DQUOT_FREE_INODE(object);
42794+ object->i_flags |= S_NOQUOTA;
42795+ /* if everything was ok (result == 0), parent stat-data is
42796+ * already updated above (update_parent_dir()) */
42797+ reiser4_update_sd(parent);
42798+ /* failure to create entry, remove object */
42799+ obj_plug->delete_object(object);
42800+ }
42801+
42802+ /* file has name now, clear immutable flag */
42803+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
42804+
42805+ /* on error, iput() will call ->delete_inode(). We should keep track
42806+ of the existence of stat-data for this inode and avoid attempting to
42807+ remove it in reiser4_delete_inode(). This is accomplished through
42808+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
42809+ */
42810+ return result;
42811+}
42812+
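
Every failure exit in do_create_vfs_child() above repeats the same
quota-release pattern. A hypothetical local helper (not in the patch) that
would centralize it:

	static inline int fail_vfs_child(struct inode *object, int error)
	{
		/* undo DQUOT_ALLOC_INODE() and keep further quota
		 * operations from being charged to the half-built inode */
		DQUOT_FREE_INODE(object);
		object->i_flags |= S_NOQUOTA;
		return error;
	}
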
42813+/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
42814+ reiser4_mknod and reiser4_symlink
42815+*/
42816+static int
42817+create_vfs_object(struct inode *parent,
42818+ struct dentry *dentry, reiser4_object_create_data * data)
42819+{
42820+ reiser4_context *ctx;
42821+ int result;
42822+ struct inode *child;
42823+
42824+ ctx = reiser4_init_context(parent->i_sb);
42825+ if (IS_ERR(ctx))
42826+ return PTR_ERR(ctx);
42827+ context_set_commit_async(ctx);
42828+
42829+ data->parent = parent;
42830+ data->dentry = dentry;
42831+ child = NULL;
42832+ result = do_create_vfs_child(data, &child);
42833+ if (unlikely(result != 0)) {
42834+ if (child != NULL) {
42835+ reiser4_make_bad_inode(child);
42836+ iput(child);
42837+ }
42838+ } else
42839+ d_instantiate(dentry, child);
42840+
42841+ reiser4_exit_context(ctx);
42842+ return result;
42843+}
42844+
42845+/* helper for link_common. Estimate disk space necessary to add a link
42846+ from @parent to @object
42847+*/
42848+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
42849+ struct inode *object
42850+ /* object to which new link is being created */
42851+ )
42852+{
42853+ reiser4_block_nr res = 0;
42854+ file_plugin *fplug;
42855+ dir_plugin *dplug;
42856+
42857+ assert("vpf-317", object != NULL);
42858+ assert("vpf-318", parent != NULL);
42859+
42860+ fplug = inode_file_plugin(object);
42861+ dplug = inode_dir_plugin(parent);
42862+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
42863+ /* reiser4_add_nlink(object) */
42864+ res += fplug->estimate.update(object);
42865+ /* add_entry(parent) */
42866+ res += dplug->estimate.add_entry(parent);
42867+ /* reiser4_del_nlink(object) */
42868+ res += fplug->estimate.update(object);
42869+ /* update_dir(parent) */
42870+ res += inode_file_plugin(parent)->estimate.update(parent);
42871+ /* safe-link */
42872+ res += estimate_one_item_removal(reiser4_tree_by_inode(object));
42873+
42874+ return res;
42875+}
42876+
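
Written out, the reservation computed above is a plain sum; the two
fplug->estimate.update(object) terms are presumably kept separate (rather
than multiplied by 2) only to mirror the call sequence they account for:

	reserve = update(object)          /* reiser4_add_nlink(object) */
	        + add_entry(parent)       /* new directory entry */
	        + update(object)          /* reiser4_del_nlink(object) on undo */
	        + update(parent)          /* parent ctime/mtime */
	        + one_item_removal(tree); /* safe-link */
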
42877+/* Estimate disk space necessary to remove a link between @parent and
42878+ @object.
42879+*/
42880+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
42881+ struct inode *object
42882+ /* object whose link is being removed */
42883+ )
42884+{
42885+ reiser4_block_nr res = 0;
42886+ file_plugin *fplug;
42887+ dir_plugin *dplug;
42888+
42889+ assert("vpf-317", object != NULL);
42890+ assert("vpf-318", parent != NULL);
42891+
42892+ fplug = inode_file_plugin(object);
42893+ dplug = inode_dir_plugin(parent);
42894+
42895+ /* rem_entry(parent) */
42896+ res += dplug->estimate.rem_entry(parent);
42897+ /* reiser4_del_nlink(object) */
42898+ res += fplug->estimate.update(object);
42899+ /* update_dir(parent) */
42900+ res += inode_file_plugin(parent)->estimate.update(parent);
42901+ /* fplug->unlink */
42902+ res += fplug->estimate.unlink(object, parent);
42903+ /* safe-link */
42904+ res += estimate_one_insert_item(reiser4_tree_by_inode(object));
42905+
42906+ return res;
42907+}
42908+
42909+/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
42910+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
42911+{
42912+ file_plugin *fplug;
42913+ struct inode *child;
42914+ int result;
42915+
42916+ result = 0;
42917+ child = victim->d_inode;
42918+ fplug = inode_file_plugin(child);
42919+
42920+ /* check for race with create_object() */
42921+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
42922+ return RETERR(-E_REPEAT);
42923+ /* object being deleted should have stat data */
42924+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
42925+
42926+ /* ask object plugin */
42927+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
42928+ return RETERR(-ENOTEMPTY);
42929+
42930+ result = (int)estimate_unlink(parent, child);
42931+ if (result < 0)
42932+ return result;
42933+
42934+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
42935+}
42936+
42937+/* helper for reiser4_setattr_common */
42938+static int setattr_reserve(reiser4_tree * tree)
42939+{
42940+ assert("vs-1096", is_grab_enabled(get_current_context()));
42941+ return reiser4_grab_space(estimate_one_insert_into_item(tree),
42942+ BA_CAN_COMMIT);
42943+}
42944+
42945+/* helper function. Standards require that for many file-system operations
42946+ on success the ctime and mtime of the parent directory are to be updated. */
42947+int reiser4_update_dir(struct inode *dir)
42948+{
42949+ assert("nikita-2525", dir != NULL);
42950+
42951+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
42952+ return reiser4_update_sd(dir);
42953+}
42954diff -urN linux-2.6.22.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.22/fs/reiser4/plugin/inode_ops_rename.c
42955--- linux-2.6.22.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 03:00:00.000000000 +0300
42956+++ linux-2.6.22/fs/reiser4/plugin/inode_ops_rename.c 2007-07-29 00:25:34.940713042 +0400
42957@@ -0,0 +1,914 @@
42958+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
42959+ * reiser4/README */
42960+
42961+#include "../inode.h"
42962+#include "../safe_link.h"
42963+
42964+static const char *possible_leak = "Possible disk space leak.";
42965+
42966+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
42967+
42968+ Helper function called from hashed_rename() */
42969+static int replace_name(struct inode *to_inode, /* inode where @from_coord is
42970+ * to be re-targeted at */
42971+ struct inode *from_dir, /* directory where @from_coord
42972+ * lives */
42973+ struct inode *from_inode, /* inode @from_coord
42974+ * originally points to */
42975+ coord_t * from_coord, /* where directory entry is in
42976+ * the tree */
42977+ lock_handle * from_lh /* lock handle on @from_coord */ )
42978+{
42979+ item_plugin *from_item;
42980+ int result;
42981+ znode *node;
42982+
42983+ coord_clear_iplug(from_coord);
42984+ node = from_coord->node;
42985+ result = zload(node);
42986+ if (result != 0)
42987+ return result;
42988+ from_item = item_plugin_by_coord(from_coord);
42989+ if (plugin_of_group(item_plugin_by_coord(from_coord),
42990+ DIR_ENTRY_ITEM_TYPE))
42991+ {
42992+ reiser4_key to_key;
42993+
42994+ build_sd_key(to_inode, &to_key);
42995+
42996+ /* everything is found and prepared to change directory entry
42997+ at @from_coord to point to @to_inode.
42998+
42999+ @to_inode is just about to get new name, so bump its link
43000+ counter.
43001+
43002+ */
43003+ result = reiser4_add_nlink(to_inode, from_dir, 0);
43004+ if (result != 0) {
43005+ /* Don't issue warning: this may be plain -EMLINK */
43006+ zrelse(node);
43007+ return result;
43008+ }
43009+
43010+ result =
43011+ from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43012+ if (result != 0) {
43013+ reiser4_del_nlink(to_inode, from_dir, 0);
43014+ zrelse(node);
43015+ return result;
43016+ }
43017+
43018+ /* @from_inode just lost its name, he-he.
43019+
43020+ If @from_inode was directory, it contained dotdot pointing
43021+ to @from_dir. @from_dir i_nlink will be decreased when
43022+ iput() will be called on @from_inode.
43023+
43024+ If file-system is not ADG (hard-links are
43025+ supported on directories), iput(from_inode) will not remove
43026+ @from_inode, and thus above is incorrect, but hard-links on
43027+ directories are problematic in many other respects.
43028+ */
43029+ result = reiser4_del_nlink(from_inode, from_dir, 0);
43030+ if (result != 0) {
43031+ warning("nikita-2330",
43032+ "Cannot remove link from source: %i. %s",
43033+ result, possible_leak);
43034+ }
43035+ /* Has to return success, because entry is already
43036+ * modified. */
43037+ result = 0;
43038+
43039+ /* NOTE-NIKITA consider calling plugin method instead of
43040+ accessing inode fields directly. */
43041+ from_dir->i_mtime = CURRENT_TIME;
43042+ } else {
43043+ warning("nikita-2326", "Unexpected item type");
43044+ result = RETERR(-EIO);
43045+ }
43046+ zrelse(node);
43047+ return result;
43048+}
43049+
43050+/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43051+
43052+ Helper function used by hashed_rename(). */
43053+static int add_name(struct inode *inode, /* inode where @coord is to be
43054+ * re-targeted at */
43055+ struct inode *dir, /* directory where @coord lives */
43056+ struct dentry *name, /* new name */
43057+ coord_t * coord, /* where directory entry is in the tree */
43058+ lock_handle * lh, /* lock handle on @coord */
43059+ int is_dir /* true, if @inode is directory */ )
43060+{
43061+ int result;
43062+ reiser4_dir_entry_desc entry;
43063+
43064+ assert("nikita-2333", lh->node == coord->node);
43065+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43066+
43067+ memset(&entry, 0, sizeof entry);
43068+ entry.obj = inode;
43069+ /* build key of directory entry description */
43070+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43071+
43072+ /* ext2 does this in different order: first inserts new entry,
43073+ then increases directory nlink. We don't want to do this,
43074+ because reiser4_add_nlink() calls ->add_link() plugin
43075+ method that can fail for whatever reason, leaving us with
43076+ cleanup problems.
43077+ */
43078+ /* @inode is getting new name */
43079+ reiser4_add_nlink(inode, dir, 0);
43080+ /* create @new_name in @new_dir pointing to
43081+ @old_inode */
43082+ result = WITH_COORD(coord,
43083+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43084+ coord,
43085+ lh,
43086+ name,
43087+ &entry));
43088+ if (result != 0) {
43089+ int result2;
43090+ result2 = reiser4_del_nlink(inode, dir, 0);
43091+ if (result2 != 0) {
43092+ warning("nikita-2327",
43093+ "Cannot drop link on %lli %i. %s",
43094+ (unsigned long long)get_inode_oid(inode),
43095+ result2, possible_leak);
43096+ }
43097+ } else
43098+ INODE_INC_FIELD(dir, i_size);
43099+ return result;
43100+}
43101+
43102+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43103+ struct dentry *old_name, /* old name */
43104+ struct inode *new_dir, /* directory where @new is located */
43105+ struct dentry *new_name /* new name */ )
43106+{
43107+ reiser4_block_nr res1, res2;
43108+ dir_plugin *p_parent_old, *p_parent_new;
43109+ file_plugin *p_child_old, *p_child_new;
43110+
43111+ assert("vpf-311", old_dir != NULL);
43112+ assert("vpf-312", new_dir != NULL);
43113+ assert("vpf-313", old_name != NULL);
43114+ assert("vpf-314", new_name != NULL);
43115+
43116+ p_parent_old = inode_dir_plugin(old_dir);
43117+ p_parent_new = inode_dir_plugin(new_dir);
43118+ p_child_old = inode_file_plugin(old_name->d_inode);
43119+ if (new_name->d_inode)
43120+ p_child_new = inode_file_plugin(new_name->d_inode);
43121+ else
43122+ p_child_new = NULL;
43123+
43124+ /* find_entry - can insert one leaf. */
43125+ res1 = res2 = 1;
43126+
43127+ /* replace_name */
43128+ {
43129+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43130+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43131+ /* update key */
43132+ res1 += 1;
43133+ /* reiser4_del_nlink(p_child_new) */
43134+ if (p_child_new)
43135+ res1 += p_child_new->estimate.update(new_name->d_inode);
43136+ }
43137+
43138+ /* else add_name */
43139+ {
43140+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43141+ res2 +=
43142+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43143+ /* reiser4_add_nlink(p_parent_old) */
43144+ res2 += p_child_old->estimate.update(old_name->d_inode);
43145+ /* add_entry(p_parent_new) */
43146+ res2 += p_parent_new->estimate.add_entry(new_dir);
43147+ /* reiser4_del_nlink(p_parent_old) */
43148+ res2 += p_child_old->estimate.update(old_name->d_inode);
43149+ }
43150+
43151+ res1 = res1 < res2 ? res2 : res1;
43152+
43153+ /* reiser4_write_sd(p_parent_new) */
43154+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43155+
43156+ /* reiser4_write_sd(p_child_new) */
43157+ if (p_child_new)
43158+ res1 += p_child_new->estimate.update(new_name->d_inode);
43159+
43160+ /* hashed_rem_entry(p_parent_old) */
43161+ res1 += p_parent_old->estimate.rem_entry(old_dir);
43162+
43163+ /* reiser4_del_nlink(p_child_old) */
43164+ res1 += p_child_old->estimate.update(old_name->d_inode);
43165+
43166+ /* replace_name */
43167+ {
43168+ /* reiser4_add_nlink(p_parent_dir_new) */
43169+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43170+ /* update_key */
43171+ res1 += 1;
43172+ /* reiser4_del_nlink(p_parent_new) */
43173+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43174+ /* reiser4_del_nlink(p_parent_old) */
43175+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43176+ }
43177+
43178+ /* reiser4_write_sd(p_parent_old) */
43179+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43180+
43181+ /* reiser4_write_sd(p_child_old) */
43182+ res1 += p_child_old->estimate.update(old_name->d_inode);
43183+
43184+ return res1;
43185+}
43186+
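
A reading of estimate_rename() above, not a separate formula from the patch:
only one of replace_name() and add_name() runs per rename, so the estimate
takes the worse of the two and then adds the costs common to both paths:

	reserve = max(replace_name_cost, add_name_cost)
	        + stat_data_updates
	        + old_entry_removal
	        + dotdot_fixup;	/* only when @old is a directory */
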
43187+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43188+ struct dentry *old_name, /* old name */
43189+ struct inode *new_dir, /* directory where @new is located */
43190+ struct dentry *new_name
43191+ /* new name */ )
43192+{
43193+ reiser4_block_nr reserve;
43194+
43195+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43196+
43197+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43198+ return RETERR(-ENOSPC);
43199+
43200+ return 0;
43201+}
43202+
43203+/* check whether @old_inode and @new_inode can be moved within file system
43204+ * tree. This singles out attempts to rename pseudo-files, for example. */
43205+static int can_rename(struct inode *old_dir, struct inode *old_inode,
43206+ struct inode *new_dir, struct inode *new_inode)
43207+{
43208+ file_plugin *fplug;
43209+ dir_plugin *dplug;
43210+
43211+ assert("nikita-3370", old_inode != NULL);
43212+
43213+ dplug = inode_dir_plugin(new_dir);
43214+ fplug = inode_file_plugin(old_inode);
43215+
43216+ if (dplug == NULL)
43217+ return RETERR(-ENOTDIR);
43218+ else if (new_dir->i_op->create == NULL)
43219+ return RETERR(-EPERM);
43220+ else if (!fplug->can_add_link(old_inode))
43221+ return RETERR(-EMLINK);
43222+ else if (new_inode != NULL) {
43223+ fplug = inode_file_plugin(new_inode);
43224+ if (fplug->can_rem_link != NULL &&
43225+ !fplug->can_rem_link(new_inode))
43226+ return RETERR(-EBUSY);
43227+ }
43228+ return 0;
43229+}
43230+
43231+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
43232+ znode_lock_mode, reiser4_dir_entry_desc *);
43233+int reiser4_update_dir(struct inode *);
43234+
43235+/* this is common implementation of vfs's rename method of struct
43236+ inode_operations
43237+ See comments in the body.
43238+
43239+ It is arguable that this function can be made generic so, that it
43240+ will be applicable to any kind of directory plugin that deals with
43241+ directories composed out of directory entries. The only obstacle
43242+ here is that we don't have any data-type to represent directory
43243+ entry. This should be re-considered when more than one different
43244+ directory plugin will be implemented.
43245+*/
43246+int reiser4_rename_common(struct inode *old_dir /* directory where @old
43247+ * is located */ ,
43248+ struct dentry *old_name /* old name */ ,
43249+ struct inode *new_dir /* directory where @new
43250+ * is located */ ,
43251+ struct dentry *new_name /* new name */ )
43252+{
43253+ /* From `The Open Group Base Specifications Issue 6'
43254+
43255+ If either the old or new argument names a symbolic link, rename()
43256+ shall operate on the symbolic link itself, and shall not resolve
43257+ the last component of the argument. If the old argument and the new
43258+ argument resolve to the same existing file, rename() shall return
43259+ successfully and perform no other action.
43260+
43261+ [this is done by VFS: vfs_rename()]
43262+
43263+ If the old argument points to the pathname of a file that is not a
43264+ directory, the new argument shall not point to the pathname of a
43265+ directory.
43266+
43267+ [checked by VFS: vfs_rename->may_delete()]
43268+
43269+ If the link named by the new argument exists, it shall
43270+ be removed and old renamed to new. In this case, a link named new
43271+ shall remain visible to other processes throughout the renaming
43272+ operation and refer either to the file referred to by new or old
43273+ before the operation began.
43274+
43275+ [we should assure this]
43276+
43277+ Write access permission is required for
43278+ both the directory containing old and the directory containing new.
43279+
43280+ [checked by VFS: vfs_rename->may_delete(), may_create()]
43281+
43282+ If the old argument points to the pathname of a directory, the new
43283+ argument shall not point to the pathname of a file that is not a
43284+ directory.
43285+
43286+ [checked by VFS: vfs_rename->may_delete()]
43287+
43288+ If the directory named by the new argument exists, it
43289+ shall be removed and old renamed to new. In this case, a link named
43290+ new shall exist throughout the renaming operation and shall refer
43291+ either to the directory referred to by new or old before the
43292+ operation began.
43293+
43294+ [we should assure this]
43295+
43296+ If new names an existing directory, it shall be
43297+ required to be an empty directory.
43298+
43299+ [we should check this]
43300+
43301+ If the old argument points to a pathname of a symbolic link, the
43302+ symbolic link shall be renamed. If the new argument points to a
43303+ pathname of a symbolic link, the symbolic link shall be removed.
43304+
43305+ The new pathname shall not contain a path prefix that names
43306+ old. Write access permission is required for the directory
43307+ containing old and the directory containing new. If the old
43308+ argument points to the pathname of a directory, write access
43309+ permission may be required for the directory named by old, and, if
43310+ it exists, the directory named by new.
43311+
43312+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
43313+
43314+ If the link named by the new argument exists and the file's link
43315+ count becomes 0 when it is removed and no process has the file
43316+ open, the space occupied by the file shall be freed and the file
43317+ shall no longer be accessible. If one or more processes have the
43318+ file open when the last link is removed, the link shall be removed
43319+ before rename() returns, but the removal of the file contents shall
43320+ be postponed until all references to the file are closed.
43321+
43322+ [iput() handles this, but we can do this manually, a la
43323+ reiser4_unlink()]
43324+
43325+ Upon successful completion, rename() shall mark for update the
43326+ st_ctime and st_mtime fields of the parent directory of each file.
43327+
43328+ [N/A]
43329+
43330+ */
43331+ reiser4_context *ctx;
43332+ int result;
43333+ int is_dir; /* is @old_name directory */
43334+
43335+ struct inode *old_inode;
43336+ struct inode *new_inode;
43337+ coord_t *new_coord;
43338+
43339+ struct reiser4_dentry_fsdata *new_fsdata;
43340+ dir_plugin *dplug;
43341+ file_plugin *fplug;
43342+
43343+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
43344+ lock_handle *new_lh, *dotdot_lh;
43345+ struct dentry *dotdot_name;
43346+ struct reiser4_dentry_fsdata *dataonstack;
43347+
43348+ ctx = reiser4_init_context(old_dir->i_sb);
43349+ if (IS_ERR(ctx))
43350+ return PTR_ERR(ctx);
43351+
43352+ old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43353+ sizeof(*dotdot_name) + sizeof(*dataonstack),
43354+ reiser4_ctx_gfp_mask_get());
43355+ if (old_entry == NULL) {
43356+ context_set_commit_async(ctx);
43357+ reiser4_exit_context(ctx);
43358+ return RETERR(-ENOMEM);
43359+ }
43360+ memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43361+ sizeof(*dotdot_name) + sizeof(*dataonstack));
43362+
43363+ new_entry = old_entry + 1;
43364+ dotdot_entry = old_entry + 2;
43365+ new_lh = (lock_handle *)(old_entry + 3);
43366+ dotdot_lh = new_lh + 1;
43367+ dotdot_name = (struct dentry *)(new_lh + 2);
43368+ dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
43369+
43370+ assert("nikita-2318", old_dir != NULL);
43371+ assert("nikita-2319", new_dir != NULL);
43372+ assert("nikita-2320", old_name != NULL);
43373+ assert("nikita-2321", new_name != NULL);
43374+
43375+ old_inode = old_name->d_inode;
43376+ new_inode = new_name->d_inode;
43377+
43378+ dplug = inode_dir_plugin(old_dir);
43379+ fplug = NULL;
43380+
43381+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
43382+ if (IS_ERR(new_fsdata)) {
43383+ kfree(old_entry);
43384+ context_set_commit_async(ctx);
43385+ reiser4_exit_context(ctx);
43386+ return PTR_ERR(new_fsdata);
43387+ }
43388+
43389+ new_coord = &new_fsdata->dec.entry_coord;
43390+ coord_clear_iplug(new_coord);
43391+
43392+ is_dir = S_ISDIR(old_inode->i_mode);
43393+
43394+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43395+
43396+ /* if target is existing directory and it's not empty---return error.
43397+
43398+ This check is done specifically, because is_dir_empty() requires
43399+ tree traversal and has to be done before locks are taken.
43400+ */
43401+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43402+ kfree(old_entry);
43403+ context_set_commit_async(ctx);
43404+ reiser4_exit_context(ctx);
43405+ return RETERR(-ENOTEMPTY);
43406+ }
43407+
43408+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
43409+ if (result != 0) {
43410+ kfree(old_entry);
43411+ context_set_commit_async(ctx);
43412+ reiser4_exit_context(ctx);
43413+ return result;
43414+ }
43415+
43416+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
43417+ new_dir, new_name);
43418+ if (result != 0) {
43419+ kfree(old_entry);
43420+ context_set_commit_async(ctx);
43421+ reiser4_exit_context(ctx);
43422+ return result;
43423+ }
43424+
43425+ init_lh(new_lh);
43426+
43427+ /* find entry for @new_name */
43428+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
43429+ new_entry);
43430+
43431+ if (IS_CBKERR(result)) {
43432+ done_lh(new_lh);
43433+ kfree(old_entry);
43434+ context_set_commit_async(ctx);
43435+ reiser4_exit_context(ctx);
43436+ return result;
43437+ }
43438+
43439+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
43440+
43441+ /* add or replace name for @old_inode as @new_name */
43442+ if (new_inode != NULL) {
43443+ /* target (@new_name) exists. */
43444+ /* Not clear what to do with objects that are
43445+ both directories and files at the same time. */
43446+ if (result == CBK_COORD_FOUND) {
43447+ result = replace_name(old_inode,
43448+ new_dir,
43449+ new_inode, new_coord, new_lh);
43450+ if (result == 0)
43451+ fplug = inode_file_plugin(new_inode);
43452+ } else if (result == CBK_COORD_NOTFOUND) {
43453+ /* VFS told us that @new_name is bound to existing
43454+ inode, but we failed to find directory entry. */
43455+ warning("nikita-2324", "Target not found");
43456+ result = RETERR(-ENOENT);
43457+ }
43458+ } else {
43459+ /* target (@new_name) doesn't exist. */
43460+ if (result == CBK_COORD_NOTFOUND)
43461+ result = add_name(old_inode,
43462+ new_dir,
43463+ new_name, new_coord, new_lh, is_dir);
43464+ else if (result == CBK_COORD_FOUND) {
43465+ /* VFS told us that @new_name is "negative" dentry,
43466+ but we found directory entry. */
43467+ warning("nikita-2331", "Target found unexpectedly");
43468+ result = RETERR(-EIO);
43469+ }
43470+ }
43471+
43472+ assert("nikita-3462", ergo(result == 0,
43473+ old_inode->i_nlink >= 2 + !!is_dir));
43474+
43475+ /* We are done with all modifications to the @new_dir, release lock on
43476+ node. */
43477+ done_lh(new_lh);
43478+
43479+ if (fplug != NULL) {
43480+ /* detach @new_inode from name-space */
43481+ result = fplug->detach(new_inode, new_dir);
43482+ if (result != 0)
43483+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
43484+ (unsigned long long)get_inode_oid(new_inode),
43485+ result, possible_leak);
43486+ }
43487+
43488+ if (new_inode != NULL)
43489+ reiser4_update_sd(new_inode);
43490+
43491+ if (result == 0) {
43492+ old_entry->obj = old_inode;
43493+
43494+ dplug->build_entry_key(old_dir,
43495+ &old_name->d_name, &old_entry->key);
43496+
43497+ /* At this stage new name was introduced for
43498+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43499+ counters were updated.
43500+
43501+ We want to remove @old_name now. If @old_inode wasn't
43502+ directory this is simple.
43503+ */
43504+ result = dplug->rem_entry(old_dir, old_name, old_entry);
43505+ if (result != 0 && result != -ENOMEM) {
43506+ warning("nikita-2335",
43507+ "Cannot remove old name: %i", result);
43508+ } else {
43509+ result = reiser4_del_nlink(old_inode, old_dir, 0);
43510+ if (result != 0 && result != -ENOMEM) {
43511+ warning("nikita-2337",
43512+ "Cannot drop link on old: %i", result);
43513+ }
43514+ }
43515+
43516+ if (result == 0 && is_dir) {
43517+ /* @old_inode is directory. We also have to update
43518+ dotdot entry. */
43519+ coord_t *dotdot_coord;
43520+
43521+ memset(dataonstack, 0, sizeof *dataonstack);
43522+ memset(dotdot_entry, 0, sizeof *dotdot_entry);
43523+ dotdot_entry->obj = old_dir;
43524+ memset(dotdot_name, 0, sizeof *dotdot_name);
43525+ dotdot_name->d_name.name = "..";
43526+ dotdot_name->d_name.len = 2;
43527+ /*
43528+ * allocate ->d_fsdata on the stack to avoid using
43529+ * reiser4_get_dentry_fsdata(). Locking is not needed,
43530+ * because dentry is private to the current thread.
43531+ */
43532+ dotdot_name->d_fsdata = dataonstack;
43533+ init_lh(dotdot_lh);
43534+
43535+ dotdot_coord = &dataonstack->dec.entry_coord;
43536+ coord_clear_iplug(dotdot_coord);
43537+
43538+ result = reiser4_find_entry(old_inode, dotdot_name,
43539+ dotdot_lh, ZNODE_WRITE_LOCK,
43540+ dotdot_entry);
43541+ if (result == 0) {
43542+ /* replace_name() decreases i_nlink on
43543+ * @old_dir */
43544+ result = replace_name(new_dir,
43545+ old_inode,
43546+ old_dir,
43547+ dotdot_coord, dotdot_lh);
43548+ } else
43549+ result = RETERR(-EIO);
43550+ done_lh(dotdot_lh);
43551+ }
43552+ }
43553+ reiser4_update_dir(new_dir);
43554+ reiser4_update_dir(old_dir);
43555+ reiser4_update_sd(old_inode);
43556+ if (result == 0) {
43557+ file_plugin *fplug;
43558+
43559+ if (new_inode != NULL) {
43560+ /* add safe-link for target file (in case we removed
43561+ * last reference to the poor fellow) */
43562+ fplug = inode_file_plugin(new_inode);
43563+ if (new_inode->i_nlink == 0)
43564+ result = safe_link_add(new_inode, SAFE_UNLINK);
43565+ }
43566+ }
43567+ kfree(old_entry);
43568+ context_set_commit_async(ctx);
43569+ reiser4_exit_context(ctx);
43570+ return result;
43571+}
43572+
43573+#if 0
43574+int reiser4_rename_common(struct inode *old_dir /* directory where @old
43575+ * is located */ ,
43576+ struct dentry *old_name /* old name */ ,
43577+ struct inode *new_dir /* directory where @new
43578+ * is located */ ,
43579+ struct dentry *new_name /* new name */ )
43580+{
43581+ /* From `The Open Group Base Specifications Issue 6'
43582+
43583+ If either the old or new argument names a symbolic link, rename()
43584+ shall operate on the symbolic link itself, and shall not resolve
43585+ the last component of the argument. If the old argument and the new
43586+ argument resolve to the same existing file, rename() shall return
43587+ successfully and perform no other action.
43588+
43589+ [this is done by VFS: vfs_rename()]
43590+
43591+ If the old argument points to the pathname of a file that is not a
43592+ directory, the new argument shall not point to the pathname of a
43593+ directory.
43594+
43595+ [checked by VFS: vfs_rename->may_delete()]
43596+
43597+ If the link named by the new argument exists, it shall
43598+ be removed and old renamed to new. In this case, a link named new
43599+ shall remain visible to other processes throughout the renaming
43600+ operation and refer either to the file referred to by new or old
43601+ before the operation began.
43602+
43603+ [we should assure this]
43604+
43605+ Write access permission is required for
43606+ both the directory containing old and the directory containing new.
43607+
43608+ [checked by VFS: vfs_rename->may_delete(), may_create()]
43609+
43610+ If the old argument points to the pathname of a directory, the new
43611+ argument shall not point to the pathname of a file that is not a
43612+ directory.
43613+
43614+ [checked by VFS: vfs_rename->may_delete()]
43615+
43616+ If the directory named by the new argument exists, it
43617+ shall be removed and old renamed to new. In this case, a link named
43618+ new shall exist throughout the renaming operation and shall refer
43619+ either to the directory referred to by new or old before the
43620+ operation began.
43621+
43622+ [we should assure this]
43623+
43624+ If new names an existing directory, it shall be
43625+ required to be an empty directory.
43626+
43627+ [we should check this]
43628+
43629+ If the old argument points to a pathname of a symbolic link, the
43630+ symbolic link shall be renamed. If the new argument points to a
43631+ pathname of a symbolic link, the symbolic link shall be removed.
43632+
43633+ The new pathname shall not contain a path prefix that names
43634+ old. Write access permission is required for the directory
43635+ containing old and the directory containing new. If the old
43636+ argument points to the pathname of a directory, write access
43637+ permission may be required for the directory named by old, and, if
43638+ it exists, the directory named by new.
43639+
43640+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
43641+
43642+ If the link named by the new argument exists and the file's link
43643+ count becomes 0 when it is removed and no process has the file
43644+ open, the space occupied by the file shall be freed and the file
43645+ shall no longer be accessible. If one or more processes have the
43646+ file open when the last link is removed, the link shall be removed
43647+ before rename() returns, but the removal of the file contents shall
43648+ be postponed until all references to the file are closed.
43649+
43650+ [iput() handles this, but we can do this manually, a la
43651+ reiser4_unlink()]
43652+
43653+ Upon successful completion, rename() shall mark for update the
43654+ st_ctime and st_mtime fields of the parent directory of each file.
43655+
43656+ [N/A]
43657+
43658+ */
43659+ reiser4_context *ctx;
43660+ int result;
43661+ int is_dir; /* is @old_name directory */
43662+ struct inode *old_inode;
43663+ struct inode *new_inode;
43664+ reiser4_dir_entry_desc old_entry;
43665+ reiser4_dir_entry_desc new_entry;
43666+ coord_t *new_coord;
43667+ struct reiser4_dentry_fsdata *new_fsdata;
43668+ lock_handle new_lh;
43669+ dir_plugin *dplug;
43670+ file_plugin *fplug;
43671+
43672+ ctx = reiser4_init_context(old_dir->i_sb);
43673+ if (IS_ERR(ctx))
43674+ return PTR_ERR(ctx);
43675+
43676+ assert("nikita-2318", old_dir != NULL);
43677+ assert("nikita-2319", new_dir != NULL);
43678+ assert("nikita-2320", old_name != NULL);
43679+ assert("nikita-2321", new_name != NULL);
43680+
43681+ old_inode = old_name->d_inode;
43682+ new_inode = new_name->d_inode;
43683+
43684+ dplug = inode_dir_plugin(old_dir);
43685+ fplug = NULL;
43686+
43687+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
43688+ if (IS_ERR(new_fsdata)) {
43689+ result = PTR_ERR(new_fsdata);
43690+ goto exit;
43691+ }
43692+
43693+ new_coord = &new_fsdata->dec.entry_coord;
43694+ coord_clear_iplug(new_coord);
43695+
43696+ is_dir = S_ISDIR(old_inode->i_mode);
43697+
43698+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43699+
43700+ /* if target is existing directory and it's not empty---return error.
43701+
43702+ This check is done specifically, because is_dir_empty() requires
43703+ tree traversal and has to be done before locks are taken.
43704+ */
43705+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
43706+ return RETERR(-ENOTEMPTY);
43707+
43708+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
43709+ if (result != 0)
43710+ goto exit;
43711+
43712+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
43713+ new_dir, new_name);
43714+ if (result != 0)
43715+ goto exit;
43716+
43717+ init_lh(&new_lh);
43718+
43719+ /* find entry for @new_name */
43720+ result = reiser4_find_entry(new_dir, new_name, &new_lh,
43721+ ZNODE_WRITE_LOCK, &new_entry);
43722+
43723+ if (IS_CBKERR(result)) {
43724+ done_lh(&new_lh);
43725+ goto exit;
43726+ }
43727+
43728+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
43729+
43730+ /* add or replace name for @old_inode as @new_name */
43731+ if (new_inode != NULL) {
43732+ /* target (@new_name) exists. */
43733+ /* Not clear what to do with objects that are
43734+ both directories and files at the same time. */
43735+ if (result == CBK_COORD_FOUND) {
43736+ result = replace_name(old_inode,
43737+ new_dir,
43738+ new_inode, new_coord, &new_lh);
43739+ if (result == 0)
43740+ fplug = inode_file_plugin(new_inode);
43741+ } else if (result == CBK_COORD_NOTFOUND) {
43742+ /* VFS told us that @new_name is bound to existing
43743+ inode, but we failed to find directory entry. */
43744+ warning("nikita-2324", "Target not found");
43745+ result = RETERR(-ENOENT);
43746+ }
43747+ } else {
43748+ /* target (@new_name) doesn't exist. */
43749+ if (result == CBK_COORD_NOTFOUND)
43750+ result = add_name(old_inode,
43751+ new_dir,
43752+ new_name, new_coord, &new_lh, is_dir);
43753+ else if (result == CBK_COORD_FOUND) {
43754+ /* VFS told us that @new_name is "negative" dentry,
43755+ but we found directory entry. */
43756+ warning("nikita-2331", "Target found unexpectedly");
43757+ result = RETERR(-EIO);
43758+ }
43759+ }
43760+
43761+ assert("nikita-3462", ergo(result == 0,
43762+ old_inode->i_nlink >= 2 + !!is_dir));
43763+
43764+ /* We are done with all modifications to the @new_dir, release lock on
43765+ node. */
43766+ done_lh(&new_lh);
43767+
43768+ if (fplug != NULL) {
43769+ /* detach @new_inode from name-space */
43770+ result = fplug->detach(new_inode, new_dir);
43771+ if (result != 0)
43772+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
43773+ (unsigned long long)get_inode_oid(new_inode),
43774+ result, possible_leak);
43775+ }
43776+
43777+ if (new_inode != NULL)
43778+ reiser4_update_sd(new_inode);
43779+
43780+ if (result == 0) {
43781+ memset(&old_entry, 0, sizeof old_entry);
43782+ old_entry.obj = old_inode;
43783+
43784+ dplug->build_entry_key(old_dir,
43785+ &old_name->d_name, &old_entry.key);
43786+
43787+ /* At this stage new name was introduced for
43788+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43789+ counters were updated.
43790+
43791+ We want to remove @old_name now. If @old_inode wasn't
43792+ directory this is simple.
43793+ */
43794+ result = dplug->rem_entry(old_dir, old_name, &old_entry);
43795+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
43796+ if (result != 0 && result != -ENOMEM) {
43797+ warning("nikita-2335",
43798+ "Cannot remove old name: %i", result);
43799+ } else {
43800+ result = reiser4_del_nlink(old_inode, old_dir, 0);
43801+ if (result != 0 && result != -ENOMEM) {
43802+ warning("nikita-2337",
43803+ "Cannot drop link on old: %i", result);
43804+ }
43805+ }
43806+
43807+ if (result == 0 && is_dir) {
43808+ /* @old_inode is directory. We also have to update
43809+ dotdot entry. */
43810+ coord_t *dotdot_coord;
43811+ lock_handle dotdot_lh;
43812+ struct dentry dotdot_name;
43813+ reiser4_dir_entry_desc dotdot_entry;
43814+ struct reiser4_dentry_fsdata dataonstack;
43815+ struct reiser4_dentry_fsdata *fsdata;
43816+
43817+ memset(&dataonstack, 0, sizeof dataonstack);
43818+ memset(&dotdot_entry, 0, sizeof dotdot_entry);
43819+ dotdot_entry.obj = old_dir;
43820+ memset(&dotdot_name, 0, sizeof dotdot_name);
43821+ dotdot_name.d_name.name = "..";
43822+ dotdot_name.d_name.len = 2;
43823+ /*
43824+ * allocate ->d_fsdata on the stack to avoid using
43825+ * reiser4_get_dentry_fsdata(). Locking is not needed,
43826+ * because dentry is private to the current thread.
43827+ */
43828+ dotdot_name.d_fsdata = &dataonstack;
43829+ init_lh(&dotdot_lh);
43830+
43831+ fsdata = &dataonstack;
43832+ dotdot_coord = &fsdata->dec.entry_coord;
43833+ coord_clear_iplug(dotdot_coord);
43834+
43835+ result = reiser4_find_entry(old_inode,
43836+ &dotdot_name,
43837+ &dotdot_lh,
43838+ ZNODE_WRITE_LOCK,
43839+ &dotdot_entry);
43840+ if (result == 0) {
43841+ /* replace_name() decreases i_nlink on
43842+ * @old_dir */
43843+ result = replace_name(new_dir,
43844+ old_inode,
43845+ old_dir,
43846+ dotdot_coord, &dotdot_lh);
43847+ } else
43848+ result = RETERR(-EIO);
43849+ done_lh(&dotdot_lh);
43850+ }
43851+ }
43852+ reiser4_update_dir(new_dir);
43853+ reiser4_update_dir(old_dir);
43854+ reiser4_update_sd(old_inode);
43855+ if (result == 0) {
43856+ file_plugin *fplug;
43857+
43858+ if (new_inode != NULL) {
43859+ /* add safe-link for target file (in case we removed
43860+ * last reference to the poor fellow) */
43861+ fplug = inode_file_plugin(new_inode);
43862+ if (new_inode->i_nlink == 0)
43863+ result = safe_link_add(new_inode, SAFE_UNLINK);
43864+ }
43865+ }
43866+ exit:
43867+ context_set_commit_async(ctx);
43868+ reiser4_exit_context(ctx);
43869+ return result;
43870+}
43871+#endif
43872diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/acl.h linux-2.6.22/fs/reiser4/plugin/item/acl.h
43873--- linux-2.6.22.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 03:00:00.000000000 +0300
43874+++ linux-2.6.22/fs/reiser4/plugin/item/acl.h 2007-07-29 00:25:34.940713042 +0400
43875@@ -0,0 +1,66 @@
43876+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43877+
43878+/* Directory entry. */
43879+
43880+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
43881+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
43882+
43883+#include "../../forward.h"
43884+#include "../../dformat.h"
43885+#include "../../kassign.h"
43886+#include "../../key.h"
43887+
43888+#include <linux/fs.h>
43889+#include <linux/dcache.h> /* for struct dentry */
43890+
43891+typedef struct directory_entry_format {
43892+ /* key of object stat-data. It's not necessary to store whole
43893+ key here, because it's always key of stat-data, so minor
43894+ packing locality and offset can be omitted here. But this
43895+ relies on particular key allocation scheme for stat-data, so,
43896+ for extensibility sake, whole key can be stored here.
43897+
43898+ We store key as array of bytes, because we don't want 8-byte
43899+ alignment of dir entries.
43900+ */
43901+ obj_key_id id;
43902+ /* file name. Null terminated string. */
43903+ d8 name[0];
43904+} directory_entry_format;
43905+
43906+void print_de(const char *prefix, coord_t * coord);
43907+int extract_key_de(const coord_t * coord, reiser4_key * key);
43908+int update_key_de(const coord_t * coord, const reiser4_key * key,
43909+ lock_handle * lh);
43910+char *extract_name_de(const coord_t * coord, char *buf);
43911+unsigned extract_file_type_de(const coord_t * coord);
43912+int add_entry_de(struct inode *dir, coord_t * coord,
43913+ lock_handle * lh, const struct dentry *name,
43914+ reiser4_dir_entry_desc * entry);
43915+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
43916+ lock_handle * lh, reiser4_dir_entry_desc * entry);
43917+int max_name_len_de(const struct inode *dir);
43918+
43919+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
43920+
43921+char *extract_dent_name(const coord_t * coord,
43922+ directory_entry_format * dent, char *buf);
43923+
43924+#if REISER4_LARGE_KEY
43925+#define DE_NAME_BUF_LEN (24)
43926+#else
43927+#define DE_NAME_BUF_LEN (16)
43928+#endif
43929+
43930+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
43931+#endif
43932+
43933+/* Make Linus happy.
43934+ Local variables:
43935+ c-indentation-style: "K&R"
43936+ mode-name: "LC"
43937+ c-basic-offset: 8
43938+ tab-width: 8
43939+ fill-column: 120
43940+ End:
43941+*/
43942diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.22/fs/reiser4/plugin/item/blackbox.c
43943--- linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
43944+++ linux-2.6.22/fs/reiser4/plugin/item/blackbox.c 2007-07-29 00:25:34.940713042 +0400
43945@@ -0,0 +1,142 @@
43946+/* Copyright 2003 by Hans Reiser, licensing governed by
43947+ * reiser4/README */
43948+
43949+/* Black box item implementation */
43950+
43951+#include "../../forward.h"
43952+#include "../../debug.h"
43953+#include "../../dformat.h"
43954+#include "../../kassign.h"
43955+#include "../../coord.h"
43956+#include "../../tree.h"
43957+#include "../../lock.h"
43958+
43959+#include "blackbox.h"
43960+#include "item.h"
43961+#include "../plugin.h"
43962+
43963+int
43964+store_black_box(reiser4_tree * tree,
43965+ const reiser4_key * key, void *data, int length)
43966+{
43967+ int result;
43968+ reiser4_item_data idata;
43969+ coord_t coord;
43970+ lock_handle lh;
43971+
43972+ memset(&idata, 0, sizeof idata);
43973+
43974+ idata.data = data;
43975+ idata.user = 0;
43976+ idata.length = length;
43977+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
43978+
43979+ init_lh(&lh);
43980+ result = insert_by_key(tree, key,
43981+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
43982+
43983+ assert("nikita-3413",
43984+ ergo(result == 0,
43985+ WITH_COORD(&coord,
43986+ item_length_by_coord(&coord) == length)));
43987+
43988+ done_lh(&lh);
43989+ return result;
43990+}
43991+
43992+int
43993+load_black_box(reiser4_tree * tree,
43994+ reiser4_key * key, void *data, int length, int exact)
43995+{
43996+ int result;
43997+ coord_t coord;
43998+ lock_handle lh;
43999+
44000+ init_lh(&lh);
44001+ result = coord_by_key(tree, key,
44002+ &coord, &lh, ZNODE_READ_LOCK,
44003+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44004+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44005+
44006+ if (result == 0) {
44007+ int ilen;
44008+
44009+ result = zload(coord.node);
44010+ if (result == 0) {
44011+ ilen = item_length_by_coord(&coord);
44012+ if (ilen <= length) {
44013+ memcpy(data, item_body_by_coord(&coord), ilen);
44014+ unit_key_by_coord(&coord, key);
44015+ } else if (exact) {
44016+ /*
44017+ * item is larger than buffer provided by the
44018+ * user. Only issue a warning if @exact is
44019+ * set. If @exact is false, we are iterating
44020+ * over all safe-links and here we are reaching
44021+ * the end of the iteration.
44022+ */
44023+ warning("nikita-3415",
44024+ "Wrong black box length: %i > %i",
44025+ ilen, length);
44026+ result = RETERR(-EIO);
44027+ }
44028+ zrelse(coord.node);
44029+ }
44030+ }
44031+
44032+ done_lh(&lh);
44033+ return result;
44034+
44035+}
44036+
44037+int
44038+update_black_box(reiser4_tree * tree,
44039+ const reiser4_key * key, void *data, int length)
44040+{
44041+ int result;
44042+ coord_t coord;
44043+ lock_handle lh;
44044+
44045+ init_lh(&lh);
44046+ result = coord_by_key(tree, key,
44047+ &coord, &lh, ZNODE_READ_LOCK,
44048+ FIND_EXACT,
44049+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44050+ if (result == 0) {
44051+ int ilen;
44052+
44053+ result = zload(coord.node);
44054+ if (result == 0) {
44055+ ilen = item_length_by_coord(&coord);
44056+ if (length <= ilen) {
44057+ memcpy(item_body_by_coord(&coord), data,
44058+ length);
44059+ } else {
44060+ warning("nikita-3437",
44061+ "Wrong black box length: %i < %i",
44062+ ilen, length);
44063+ result = RETERR(-EIO);
44064+ }
44065+ zrelse(coord.node);
44066+ }
44067+ }
44068+
44069+ done_lh(&lh);
44070+ return result;
44071+
44072+}
44073+
44074+int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44075+{
44076+	return reiser4_cut_tree(tree, key, key, NULL, 1);
44077+}
44078+
44079+/* Make Linus happy.
44080+ Local variables:
44081+ c-indentation-style: "K&R"
44082+ mode-name: "LC"
44083+ c-basic-offset: 8
44084+ tab-width: 8
44085+ fill-column: 120
44086+ End:
44087+*/
44088diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.22/fs/reiser4/plugin/item/blackbox.h
44089--- linux-2.6.22.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
44090+++ linux-2.6.22/fs/reiser4/plugin/item/blackbox.h 2007-07-29 00:25:34.940713042 +0400
44091@@ -0,0 +1,33 @@
44092+/* Copyright 2003 by Hans Reiser, licensing governed by
44093+ * reiser4/README */
44094+
44095+/* "Black box" entry to fixed-width contain user supplied data */
44096+
44097+#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44098+#define __FS_REISER4_BLACK_BOX_H__
44099+
44100+#include "../../forward.h"
44101+#include "../../dformat.h"
44102+#include "../../kassign.h"
44103+#include "../../key.h"
44104+
44105+extern int store_black_box(reiser4_tree * tree,
44106+ const reiser4_key * key, void *data, int length);
44107+extern int load_black_box(reiser4_tree * tree,
44108+ reiser4_key * key, void *data, int length, int exact);
44109+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44110+extern int update_black_box(reiser4_tree * tree,
44111+ const reiser4_key * key, void *data, int length);
44112+
44113+/* __FS_REISER4_BLACK_BOX_H__ */
44114+#endif
44115+
44116+/* Make Linus happy.
44117+ Local variables:
44118+ c-indentation-style: "K&R"
44119+ mode-name: "LC"
44120+ c-basic-offset: 8
44121+ tab-width: 8
44122+ fill-column: 120
44123+ End:
44124+*/
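/* For illustration only, not part of the patch: a minimal round trip
 * through the black box API declared above. make_safelink_key() is a
 * hypothetical key builder; real callers derive the key from the
 * kassign.h machinery. */
static int blackbox_roundtrip_sketch(reiser4_tree *tree)
{
	reiser4_key key;
	__u64 payload = 42ULL;
	__u64 readback;
	int result;

	make_safelink_key(&key);	/* hypothetical helper */
	result = store_black_box(tree, &key, &payload, sizeof payload);
	if (result != 0)
		return result;
	/* exact load: fails with -EIO if the stored item is larger than
	   the buffer */
	result = load_black_box(tree, &key, &readback, sizeof readback, 1);
	if (result == 0)
		result = kill_black_box(tree, &key);
	return result;
}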
44125diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/cde.c linux-2.6.22/fs/reiser4/plugin/item/cde.c
44126--- linux-2.6.22.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 03:00:00.000000000 +0300
44127+++ linux-2.6.22/fs/reiser4/plugin/item/cde.c 2007-07-29 00:25:34.944714077 +0400
44128@@ -0,0 +1,1008 @@
44129+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44130+
44131+/* Directory entry implementation */
44132+
44133+/* DESCRIPTION:
44134+
44135+ This is the "compound" directory item plugin implementation. This directory
44136+ item type is compound (as opposed to the "simple directory item" in
44137+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44138+ entries.
44139+
44140+ The reason behind this decision is disk space efficiency: all directory
44141+ entries inside the same directory have an identical fragment in their
44142+ keys. This, of course, depends on key assignment policy. In our default key
44143+ assignment policy, all directory entries have the same locality which is
44144+ equal to the object id of their directory.
44145+
44146+ Composing a directory item out of several directory entries for the same
44147+ directory allows us to store said key fragment only once. That is, this is
44148+ some ad hoc form of key compression (stem compression) that is implemented
44149+ here, because general key compression is not supposed to be implemented in
44150+ v4.0.
44151+
44152+ Another decision that was made regarding all directory item plugins is
44153+ that they will store entry keys unaligned. This, again, is for the sake
44154+ of disk space efficiency.
44155+
44156+ It should be noted that storing keys unaligned increases CPU consumption,
44157+ at least on some architectures.
44158+
44159+ Internal on-disk structure of the compound directory item is as follows:
44160+
44161+ HEADER cde_item_format. Here number of entries is stored.
44162+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44163+ ENTRY_HEADER_1 offset of entry body are stored.
44164+ ENTRY_HEADER_2 (basically two last parts of key)
44165+ ...
44166+ ENTRY_HEADER_N
44167+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
44168+ ENTRY_BODY_1 NUL-terminated name are stored.
44169+ ENTRY_BODY_2 (part of the stat data key in the
44170+ sense that since all SDs have
44171+ zero offset, this offset is not
44172+ stored on disk).
44173+ ...
44174+ ENTRY_BODY_N
44175+
44176+ When it comes to the balancing, each directory entry in a compound
44177+ directory item is a unit, that is, something that can be cut from one item
44178+ and pasted into another item of the same type. Handling of unit cut and
44179+ paste is the major reason for the complexity of the code below.
44180+
44181+*/
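/* A standalone sketch of the layout described above (illustrative, not
 * part of the patch): the real item stores unaligned little-endian
 * fields (d16), here reduced to host integers for clarity. */
#include <stdint.h>

struct sketch_unit_header {
	uint64_t hash;		/* stands in for de_id */
	uint16_t offset;	/* offset of the entry body from item start */
};

struct sketch_item {
	uint16_t num_of_entries;		/* HEADER */
	struct sketch_unit_header entry[];	/* ENTRY_HEADER_0 .. N */
};

/* ENTRY_BODY_i starts at the offset recorded in ENTRY_HEADER_i */
static inline char *sketch_entry_body(char *item_start, int idx)
{
	const struct sketch_item *it = (const struct sketch_item *)item_start;
	return item_start + it->entry[idx].offset;
}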
44182+
44183+#include "../../forward.h"
44184+#include "../../debug.h"
44185+#include "../../dformat.h"
44186+#include "../../kassign.h"
44187+#include "../../key.h"
44188+#include "../../coord.h"
44189+#include "sde.h"
44190+#include "cde.h"
44191+#include "item.h"
44192+#include "../node/node.h"
44193+#include "../plugin.h"
44194+#include "../../znode.h"
44195+#include "../../carry.h"
44196+#include "../../tree.h"
44197+#include "../../inode.h"
44198+
44199+#include <linux/fs.h> /* for struct inode */
44200+#include <linux/dcache.h> /* for struct dentry */
44201+#include <linux/quotaops.h>
44202+
44203+#if 0
44204+#define CHECKME(coord) \
44205+({ \
44206+ const char *message; \
44207+ coord_t dup; \
44208+ \
44209+ coord_dup_nocheck(&dup, (coord)); \
44210+ dup.unit_pos = 0; \
44211+ assert("nikita-2871", cde_check(&dup, &message) == 0); \
44212+})
44213+#else
44214+#define CHECKME(coord) noop
44215+#endif
44216+
44217+/* return body of compound directory item at @coord */
44218+static inline cde_item_format *formatted_at(const coord_t * coord)
44219+{
44220+ assert("nikita-1282", coord != NULL);
44221+ return item_body_by_coord(coord);
44222+}
44223+
44224+/* return entry header at @coord */
44225+static inline cde_unit_header *header_at(const coord_t *
44226+ coord /* coord of item */ ,
44227+ int idx /* index of unit */ )
44228+{
44229+ assert("nikita-1283", coord != NULL);
44230+ return &formatted_at(coord)->entry[idx];
44231+}
44232+
44233+/* return number of units in compound directory item at @coord */
44234+static int units(const coord_t * coord /* coord of item */ )
44235+{
44236+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
44237+}
44238+
44239+/* return offset of the body of @idx-th entry in @coord */
44240+static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
44241+ int idx /* index of unit */ )
44242+{
44243+ if (idx < units(coord))
44244+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
44245+ else if (idx == units(coord))
44246+ return item_length_by_coord(coord);
44247+ else
44248+ impossible("nikita-1308", "Wrong idx");
44249+ return 0;
44250+}
44251+
44252+/* set offset of the body of @idx-th entry in @coord */
44253+static void set_offset(const coord_t * coord /* coord of item */ ,
44254+ int idx /* index of unit */ ,
44255+ unsigned int offset /* new offset */ )
44256+{
44257+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
44258+}
44259+
44260+static void adj_offset(const coord_t * coord /* coord of item */ ,
44261+ int idx /* index of unit */ ,
44262+ int delta /* offset change */ )
44263+{
44264+ d16 *doffset;
44265+ __u16 offset;
44266+
44267+ doffset = &header_at(coord, idx)->offset;
44268+ offset = le16_to_cpu(get_unaligned(doffset));
44269+ offset += delta;
44270+ put_unaligned(cpu_to_le16((__u16) offset), doffset);
44271+}
44272+
44273+/* return pointer to @offset-th byte from the beginning of @coord */
44274+static char *address(const coord_t * coord /* coord of item */ ,
44275+ int offset)
44276+{
44277+ return ((char *)item_body_by_coord(coord)) + offset;
44278+}
44279+
44280+/* return pointer to the body of @idx-th entry in @coord */
44281+static directory_entry_format *entry_at(const coord_t * coord /* coord of
44282+ * item */ ,
44283+ int idx /* index of unit */ )
44284+{
44285+ return (directory_entry_format *) address(coord,
44286+ (int)offset_of(coord, idx));
44287+}
44288+
44289+/* return index of the unit referenced by @coord */
44290+static int idx_of(const coord_t * coord /* coord of item */ )
44291+{
44292+ assert("nikita-1285", coord != NULL);
44293+ return coord->unit_pos;
44294+}
44295+
44296+/* find position where entry with @entry_key would be inserted into @coord */
44297+static int find(const coord_t * coord /* coord of item */ ,
44298+ const reiser4_key * entry_key /* key to look for */ ,
44299+ cmp_t * last /* result of last comparison */ )
44300+{
44301+ int entries;
44302+
44303+ int left;
44304+ int right;
44305+
44306+ cde_unit_header *header;
44307+
44308+ assert("nikita-1295", coord != NULL);
44309+ assert("nikita-1296", entry_key != NULL);
44310+ assert("nikita-1297", last != NULL);
44311+
44312+ entries = units(coord);
44313+ left = 0;
44314+ right = entries - 1;
44315+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
44316+ int median;
44317+
44318+ median = (left + right) >> 1;
44319+
44320+ header = header_at(coord, median);
44321+ *last = de_id_key_cmp(&header->hash, entry_key);
44322+ switch (*last) {
44323+ case LESS_THAN:
44324+ left = median;
44325+ break;
44326+ case GREATER_THAN:
44327+ right = median;
44328+ break;
44329+ case EQUAL_TO:{
44330+ do {
44331+ median--;
44332+ header--;
44333+ } while (median >= 0 &&
44334+ de_id_key_cmp(&header->hash,
44335+ entry_key) == EQUAL_TO);
44336+ return median + 1;
44337+ }
44338+ }
44339+ }
44340+ header = header_at(coord, left);
44341+ for (; left < entries; ++left, ++header) {
44342+ prefetch(header + 1);
44343+ *last = de_id_key_cmp(&header->hash, entry_key);
44344+ if (*last != LESS_THAN)
44345+ break;
44346+ }
44347+ if (left < entries)
44348+ return left;
44349+ else
44350+ return RETERR(-ENOENT);
44351+
44352+}
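/* The search above bisects while the window is wide and falls back to a
 * prefetching linear scan once it narrows below REISER4_SEQ_SEARCH_BREAK.
 * A standalone rendering of that pattern over a sorted int array follows;
 * the threshold 16 and the omission of the duplicate-key rewind are
 * simplifications, not part of the patch. Returns the first index whose
 * element is >= goal, or -1 when every element is smaller. */
static int hybrid_find_sketch(const int *a, int nr, int goal)
{
	int left = 0;
	int right = nr - 1;

	/* binary phase: shrink [left, right] while it is large */
	while (right - left >= 16) {
		int median = (left + right) >> 1;

		if (a[median] < goal)
			left = median;
		else
			right = median;
	}
	/* sequential phase: cache-friendly scan of the small remainder */
	for (; left < nr; ++left)
		if (a[left] >= goal)
			return left;
	return -1;
}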
44353+
44354+/* expand @coord so as to accommodate insertion of @no new entries starting
44355+ from @pos, with total body size @size. */
44356+static int expand_item(const coord_t * coord /* coord of item */ ,
44357+ int pos /* unit position */ , int no /* number of new
44358+ * units*/ ,
44359+ int size /* total size of new units' data */ ,
44360+ unsigned int data_size /* free space already reserved
44361+ * in the item for insertion */ )
44362+{
44363+ int entries;
44364+ cde_unit_header *header;
44365+ char *dent;
44366+ int i;
44367+
44368+ assert("nikita-1310", coord != NULL);
44369+ assert("nikita-1311", pos >= 0);
44370+ assert("nikita-1312", no > 0);
44371+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
44372+ assert("nikita-1343",
44373+ item_length_by_coord(coord) >=
44374+ (int)(size + data_size + no * sizeof *header));
44375+
44376+ entries = units(coord);
44377+
44378+ if (pos == entries)
44379+ dent = address(coord, size);
44380+ else
44381+ dent = (char *)entry_at(coord, pos);
44382+ /* place where new header will be in */
44383+ header = header_at(coord, pos);
44384+ /* free space for new entry headers */
44385+ memmove(header + no, header,
44386+ (unsigned)(address(coord, size) - (char *)header));
44387+ /* if adding to the end initialise first new header */
44388+ if (pos == entries) {
44389+ set_offset(coord, pos, (unsigned)size);
44390+ }
44391+
44392+ /* adjust entry pointer and size */
44393+ dent = dent + no * sizeof *header;
44394+ size += no * sizeof *header;
44395+ /* free space for new entries */
44396+ memmove(dent + data_size, dent,
44397+ (unsigned)(address(coord, size) - dent));
44398+
44399+ /* increase counter */
44400+ entries += no;
44401+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
44402+
44403+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
44404+ bytes. */
44405+ for (i = 0; i <= pos; ++i)
44406+ adj_offset(coord, i, no * sizeof *header);
44407+ /* [ pos + no ... +\infty ) entries were shifted by ( no *
44408+ sizeof *header + data_size ) bytes */
44409+ for (i = pos + no; i < entries; ++i)
44410+ adj_offset(coord, i, no * sizeof *header + data_size);
44411+ return 0;
44412+}
44413+
44414+/* insert new @entry into item */
44415+static int expand(const coord_t * coord /* coord of item */ ,
44416+ struct cde_entry * entry /* entry to insert */ ,
44417+ int len /* length of @entry data */ ,
44418+ int *pos /* position to insert */ ,
44419+ reiser4_dir_entry_desc * dir_entry /* parameters for new
44420+ * entry */ )
44421+{
44422+ cmp_t cmp_res;
44423+ int datasize;
44424+
44425+ *pos = find(coord, &dir_entry->key, &cmp_res);
44426+ if (*pos < 0)
44427+ *pos = units(coord);
44428+
44429+ datasize = sizeof(directory_entry_format);
44430+ if (is_longname(entry->name->name, entry->name->len))
44431+ datasize += entry->name->len + 1;
44432+
44433+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
44434+ datasize);
44435+ return 0;
44436+}
44437+
44438+/* paste body of @entry into item */
44439+static int paste_entry(const coord_t * coord /* coord of item */ ,
44440+ struct cde_entry * entry /* new entry */ ,
44441+ int pos /* position to insert */ ,
44442+ reiser4_dir_entry_desc * dir_entry /* parameters for
44443+ * new entry */ )
44444+{
44445+ cde_unit_header *header;
44446+ directory_entry_format *dent;
44447+ const char *name;
44448+ int len;
44449+
44450+ header = header_at(coord, pos);
44451+ dent = entry_at(coord, pos);
44452+
44453+ build_de_id_by_key(&dir_entry->key, &header->hash);
44454+ build_inode_key_id(entry->obj, &dent->id);
44455+ /* AUDIT unsafe strcpy() operation! It should be replaced with
44456+ the much less CPU-hungry
44457+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
44458+
44459+ Also, more importantly, there should be a way to figure out the
44460+ amount of space in dent -> name and be able to check that we are
44461+ not going to overwrite more than we are supposed to */
44462+ name = entry->name->name;
44463+ len = entry->name->len;
44464+ if (is_longname(name, len)) {
44465+ strcpy((unsigned char *)dent->name, name);
44466+ put_unaligned(0, &dent->name[len]);
44467+ }
44468+ return 0;
44469+}
44470+
44471+/* estimate how much space is necessary in item to insert/paste set of entries
44472+ described in @data. */
44473+int estimate_cde(const coord_t * coord /* coord of item */ ,
44474+ const reiser4_item_data * data /* parameters for new item */ )
44475+{
44476+ struct cde_entry_data *e;
44477+ int result;
44478+ int i;
44479+
44480+ e = (struct cde_entry_data *) data->data;
44481+
44482+ assert("nikita-1288", e != NULL);
44483+ assert("nikita-1289", e->num_of_entries >= 0);
44484+
44485+ if (coord == NULL)
44486+ /* insert */
44487+ result = sizeof(cde_item_format);
44488+ else
44489+ /* paste */
44490+ result = 0;
44491+
44492+ result += e->num_of_entries *
44493+ (sizeof(cde_unit_header) + sizeof(directory_entry_format));
44494+ for (i = 0; i < e->num_of_entries; ++i) {
44495+ const char *name;
44496+ int len;
44497+
44498+ name = e->entry[i].name->name;
44499+ len = e->entry[i].name->len;
44500+ assert("nikita-2054", strlen(name) == len);
44501+ if (is_longname(name, len))
44502+ result += len + 1;
44503+ }
44504+ ((reiser4_item_data *) data)->length = result;
44505+ return result;
44506+}
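/* Worked example for the estimate above (sizes symbolic, since the
 * on-disk widths are defined by the structures in cde.h): with
 * F = sizeof(cde_item_format), H = sizeof(cde_unit_header) and
 * D = sizeof(directory_entry_format), inserting (coord == NULL) two
 * entries, one short name and one long name of length 40, costs
 *
 *	F + 2 * (H + D) + (40 + 1)
 *
 * bytes: only long names contribute body bytes (len + 1 for the NUL);
 * short names are recovered from the entry key itself, so they add
 * nothing beyond the fixed per-entry overhead. */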
44507+
44508+/* ->nr_units() method for this item plugin. */
44509+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
44510+{
44511+ return units(coord);
44512+}
44513+
44514+/* ->unit_key() method for this item plugin. */
44515+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
44516+ reiser4_key * key /* resulting key */ )
44517+{
44518+ assert("nikita-1452", coord != NULL);
44519+ assert("nikita-1345", idx_of(coord) < units(coord));
44520+ assert("nikita-1346", key != NULL);
44521+
44522+ item_key_by_coord(coord, key);
44523+ extract_key_from_de_id(extract_dir_id_from_key(key),
44524+ &header_at(coord, idx_of(coord))->hash, key);
44525+ return key;
44526+}
44527+
44528+/* mergeable_cde(): implementation of ->mergeable() item method.
44529+
44530+ Two directory items are mergeable iff they are from the same
44531+ directory. That simple.
44532+
44533+*/
44534+int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
44535+ const coord_t * p2 /* coord of second item */ )
44536+{
44537+ reiser4_key k1;
44538+ reiser4_key k2;
44539+
44540+ assert("nikita-1339", p1 != NULL);
44541+ assert("nikita-1340", p2 != NULL);
44542+
44543+ return
44544+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
44545+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
44546+ extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
44547+
44548+}
44549+
44550+/* ->max_key_inside() method for this item plugin. */
44551+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
44552+ reiser4_key * result /* resulting key */ )
44553+{
44554+ assert("nikita-1342", coord != NULL);
44555+
44556+ item_key_by_coord(coord, result);
44557+ set_key_ordering(result, get_key_ordering(reiser4_max_key()));
44558+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
44559+ set_key_offset(result, get_key_offset(reiser4_max_key()));
44560+ return result;
44561+}
44562+
44563+/* @data contains data which are to be put into tree */
44564+int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
44565+ const reiser4_key * key /* key to check */ ,
44566+ const reiser4_item_data * data /* parameters of new
44567+ * item/unit being
44568+ * created */ )
44569+{
44570+ reiser4_key item_key;
44571+
44572+ /* FIXME-VS: do not rely on anything but iplug field of @data. Only
44573+ data->iplug is initialized */
44574+ assert("vs-457", data && data->iplug);
44575+/* assert( "vs-553", data -> user == 0 );*/
44576+ item_key_by_coord(coord, &item_key);
44577+
44578+ return (item_plugin_by_coord(coord) == data->iplug) &&
44579+ (extract_dir_id_from_key(&item_key) ==
44580+ extract_dir_id_from_key(key));
44581+}
44582+
44583+#if REISER4_DEBUG
44584+/* cde_check ->check() method for compressed directory items
44585+
44586+ used for debugging; every item should have here the most complete
44587+ possible consistency check of the item that its inventor can
44588+ construct
44589+*/
44590+int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
44591+ const char **error /* where to store error message */)
44592+{
44593+ int i;
44594+ int result;
44595+ char *item_start;
44596+ char *item_end;
44597+ reiser4_key key;
44598+
44599+ coord_t c;
44600+
44601+ assert("nikita-1357", coord != NULL);
44602+ assert("nikita-1358", error != NULL);
44603+
44604+ if (!ergo(coord->item_pos != 0,
44605+ is_dot_key(item_key_by_coord(coord, &key)))) {
44606+ *error = "CDE doesn't start with dot";
44607+ return -1;
44608+ }
44609+ item_start = item_body_by_coord(coord);
44610+ item_end = item_start + item_length_by_coord(coord);
44611+
44612+ coord_dup(&c, coord);
44613+ result = 0;
44614+ for (i = 0; i < units(coord); ++i) {
44615+ directory_entry_format *entry;
44616+
44617+ if ((char *)(header_at(coord, i) + 1) >
44618+ item_end - units(coord) * sizeof *entry) {
44619+ *error = "CDE header is out of bounds";
44620+ result = -1;
44621+ break;
44622+ }
44623+ entry = entry_at(coord, i);
44624+ if ((char *)entry < item_start + sizeof(cde_item_format)) {
44625+ *error = "CDE header is too low";
44626+ result = -1;
44627+ break;
44628+ }
44629+ if ((char *)(entry + 1) > item_end) {
44630+ *error = "CDE header is too high";
44631+ result = -1;
44632+ break;
44633+ }
44634+ }
44635+
44636+ return result;
44637+}
44638+#endif
44639+
44640+/* ->init() method for this item plugin. */
44641+int init_cde(coord_t * coord /* coord of item */ ,
44642+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
44643+ UNUSED_ARG)
44644+{
44645+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
44646+ return 0;
44647+}
44648+
44649+/* ->lookup() method for this item plugin. */
44650+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
44651+ lookup_bias bias /* search bias */ ,
44652+ coord_t * coord /* coord of item to lookup in */ )
44653+{
44654+ cmp_t last_comp;
44655+ int pos;
44656+
44657+ reiser4_key utmost_key;
44658+
44659+ assert("nikita-1293", coord != NULL);
44660+ assert("nikita-1294", key != NULL);
44661+
44662+ CHECKME(coord);
44663+
44664+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
44665+ coord->unit_pos = 0;
44666+ coord->between = BEFORE_UNIT;
44667+ return CBK_COORD_NOTFOUND;
44668+ }
44669+ pos = find(coord, key, &last_comp);
44670+ if (pos >= 0) {
44671+ coord->unit_pos = (int)pos;
44672+ switch (last_comp) {
44673+ case EQUAL_TO:
44674+ coord->between = AT_UNIT;
44675+ return CBK_COORD_FOUND;
44676+ case GREATER_THAN:
44677+ coord->between = BEFORE_UNIT;
44678+ return RETERR(-ENOENT);
44679+ case LESS_THAN:
44680+ default:
44681+ impossible("nikita-1298", "Broken find");
44682+ return RETERR(-EIO);
44683+ }
44684+ } else {
44685+ coord->unit_pos = units(coord) - 1;
44686+ coord->between = AFTER_UNIT;
44687+ return (bias ==
44688+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
44689+ CBK_COORD_NOTFOUND;
44690+ }
44691+}
44692+
44693+/* ->paste() method for this item plugin. */
44694+int paste_cde(coord_t * coord /* coord of item */ ,
44695+ reiser4_item_data * data /* parameters of new unit being
44696+ * inserted */ ,
44697+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
44698+{
44699+ struct cde_entry_data *e;
44700+ int result;
44701+ int i;
44702+
44703+ CHECKME(coord);
44704+ e = (struct cde_entry_data *) data->data;
44705+
44706+ result = 0;
44707+ for (i = 0; i < e->num_of_entries; ++i) {
44708+ int pos;
44709+ int phantom_size;
44710+
44711+ phantom_size = data->length;
44712+ if (units(coord) == 0)
44713+ phantom_size -= sizeof(cde_item_format);
44714+
44715+ result =
44716+ expand(coord, e->entry + i, phantom_size, &pos, data->arg);
44717+ if (result != 0)
44718+ break;
44719+ result = paste_entry(coord, e->entry + i, pos, data->arg);
44720+ if (result != 0)
44721+ break;
44722+ }
44723+ CHECKME(coord);
44724+ return result;
44725+}
44726+
44727+/* amount of space occupied by all entries up to and including @idx, both
44728+ headers and bodies. */
44729+static unsigned int part_size(const coord_t * coord /* coord of item */ ,
44730+ int idx /* index of unit */ )
44731+{
44732+ assert("nikita-1299", coord != NULL);
44733+ assert("nikita-1300", idx < (int)units(coord));
44734+
44735+ return sizeof(cde_item_format) +
44736+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
44737+ idx + 1) -
44738+ offset_of(coord, 0);
44739+}
44740+
44741+/* how many units of @source, but not more than @want, can be merged with the
44742+ item in @target node. If pend == append, we try to append the last item
44743+ of @target with the first units of @source. If pend == prepend, we try to
44744+ "prepend" the first item in @target with the last units of @source. @target
44745+ node has @free_space bytes of free space. The total size of those units
44746+ is returned via @size */
44747+int can_shift_cde(unsigned free_space /* free space in item */ ,
44748+ coord_t * coord /* coord of source item */ ,
44749+ znode * target /* target node */ ,
44750+ shift_direction pend /* shift direction */ ,
44751+ unsigned *size /* resulting number of shifted bytes */ ,
44752+ unsigned want /* maximal number of bytes to shift */ )
44753+{
44754+ int shift;
44755+
44756+ CHECKME(coord);
44757+ if (want == 0) {
44758+ *size = 0;
44759+ return 0;
44760+ }
44761+
44762+ /* pend == SHIFT_LEFT <==> shifting to the left */
44763+ if (pend == SHIFT_LEFT) {
44764+ for (shift = min((int)want - 1, units(coord)); shift >= 0;
44765+ --shift) {
44766+ *size = part_size(coord, shift);
44767+ if (target != NULL)
44768+ *size -= sizeof(cde_item_format);
44769+ if (*size <= free_space)
44770+ break;
44771+ }
44772+ shift = shift + 1;
44773+ } else {
44774+ int total_size;
44775+
44776+ assert("nikita-1301", pend == SHIFT_RIGHT);
44777+
44778+ total_size = item_length_by_coord(coord);
44779+ for (shift = units(coord) - want - 1; shift < units(coord) - 1;
44780+ ++shift) {
44781+ *size = total_size - part_size(coord, shift);
44782+ if (target == NULL)
44783+ *size += sizeof(cde_item_format);
44784+ if (*size <= free_space)
44785+ break;
44786+ }
44787+ shift = units(coord) - shift - 1;
44788+ }
44789+ if (shift == 0)
44790+ *size = 0;
44791+ CHECKME(coord);
44792+ return shift;
44793+}
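/* Worked example for can_shift_cde() (illustrative numbers): with
 * want = 3, units(coord) = 3 and, after subtracting the item header,
 * cumulative sizes of 30, 50 and 90 bytes for the first 1, 2 and 3
 * units, free_space = 60 admits two units: the loop rejects 90,
 * accepts 50, and returns shift = 2 with *size = 50. When target is
 * NULL, sizeof(cde_item_format) stays included in *size, since a new
 * item must also carry its own header. */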
44794+
44795+/* ->copy_units() method for this item plugin. */
44796+void copy_units_cde(coord_t * target /* coord of target item */ ,
44797+ coord_t * source /* coord of source item */ ,
44798+ unsigned from /* starting unit */ ,
44799+ unsigned count /* how many units to copy */ ,
44800+ shift_direction where_is_free_space /* shift direction */ ,
44801+ unsigned free_space /* free space in item */ )
44802+{
44803+ char *header_from;
44804+ char *header_to;
44805+
44806+ char *entry_from;
44807+ char *entry_to;
44808+
44809+ int pos_in_target;
44810+ int data_size;
44811+ int data_delta;
44812+ int i;
44813+
44814+ assert("nikita-1303", target != NULL);
44815+ assert("nikita-1304", source != NULL);
44816+ assert("nikita-1305", (int)from < units(source));
44817+ assert("nikita-1307", (int)(from + count) <= units(source));
44818+
44819+ if (where_is_free_space == SHIFT_LEFT) {
44820+ assert("nikita-1453", from == 0);
44821+ pos_in_target = units(target);
44822+ } else {
44823+ assert("nikita-1309", (int)(from + count) == units(source));
44824+ pos_in_target = 0;
44825+ memmove(item_body_by_coord(target),
44826+ (char *)item_body_by_coord(target) + free_space,
44827+ item_length_by_coord(target) - free_space);
44828+ }
44829+
44830+ CHECKME(target);
44831+ CHECKME(source);
44832+
44833+ /* expand @target */
44834+ data_size =
44835+ offset_of(source, (int)(from + count)) - offset_of(source,
44836+ (int)from);
44837+
44838+ if (units(target) == 0)
44839+ free_space -= sizeof(cde_item_format);
44840+
44841+ expand_item(target, pos_in_target, (int)count,
44842+ (int)(item_length_by_coord(target) - free_space),
44843+ (unsigned)data_size);
44844+
44845+ /* copy first @count units of @source into @target */
44846+ data_delta =
44847+ offset_of(target, pos_in_target) - offset_of(source, (int)from);
44848+
44849+ /* copy entries */
44850+ entry_from = (char *)entry_at(source, (int)from);
44851+ entry_to = (char *)entry_at(source, (int)(from + count));
44852+ memmove(entry_at(target, pos_in_target), entry_from,
44853+ (unsigned)(entry_to - entry_from));
44854+
44855+ /* copy headers */
44856+ header_from = (char *)header_at(source, (int)from);
44857+ header_to = (char *)header_at(source, (int)(from + count));
44858+ memmove(header_at(target, pos_in_target), header_from,
44859+ (unsigned)(header_to - header_from));
44860+
44861+ /* update offsets */
44862+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
44863+ adj_offset(target, i, data_delta);
44864+ CHECKME(target);
44865+ CHECKME(source);
44866+}
44867+
44868+/* ->cut_units() method for this item plugin. */
44869+int cut_units_cde(coord_t * coord /* coord of item */ ,
44870+ pos_in_node_t from /* start unit pos */ ,
44871+ pos_in_node_t to /* stop unit pos */ ,
44872+ struct carry_cut_data *cdata UNUSED_ARG,
44873+ reiser4_key * smallest_removed, reiser4_key * new_first)
44874+{
44875+ char *header_from;
44876+ char *header_to;
44877+
44878+ char *entry_from;
44879+ char *entry_to;
44880+
44881+ int size;
44882+ int entry_delta;
44883+ int header_delta;
44884+ int i;
44885+
44886+ unsigned count;
44887+
44888+ CHECKME(coord);
44889+
44890+ count = to - from + 1;
44891+
44892+ assert("nikita-1454", coord != NULL);
44893+ assert("nikita-1455", (int)(from + count) <= units(coord));
44894+
44895+ if (smallest_removed)
44896+ unit_key_by_coord(coord, smallest_removed);
44897+
44898+ if (new_first) {
44899+ coord_t next;
44900+
44901+ /* cut is from the item head, but not everything is removed */
44902+ assert("vs-1527", from == 0);
44903+ assert("vs-1528", to < units(coord) - 1);
44904+
44905+ coord_dup(&next, coord);
44906+ next.unit_pos++;
44907+ unit_key_by_coord(&next, new_first);
44908+ }
44909+
44910+ size = item_length_by_coord(coord);
44911+ if (count == (unsigned)units(coord)) {
44912+ return size;
44913+ }
44914+
44915+ header_from = (char *)header_at(coord, (int)from);
44916+ header_to = (char *)header_at(coord, (int)(from + count));
44917+
44918+ entry_from = (char *)entry_at(coord, (int)from);
44919+ entry_to = (char *)entry_at(coord, (int)(from + count));
44920+
44921+ /* move headers */
44922+ memmove(header_from, header_to,
44923+ (unsigned)(address(coord, size) - header_to));
44924+
44925+ header_delta = header_to - header_from;
44926+
44927+ entry_from -= header_delta;
44928+ entry_to -= header_delta;
44929+ size -= header_delta;
44930+
44931+ /* copy entries */
44932+ memmove(entry_from, entry_to,
44933+ (unsigned)(address(coord, size) - entry_to));
44934+
44935+ entry_delta = entry_to - entry_from;
44936+ size -= entry_delta;
44937+
44938+ /* update offsets */
44939+
44940+ for (i = 0; i < (int)from; ++i)
44941+ adj_offset(coord, i, -header_delta);
44942+
44943+ for (i = from; i < units(coord) - (int)count; ++i)
44944+ adj_offset(coord, i, -header_delta - entry_delta);
44945+
44946+ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
44947+ &formatted_at(coord)->num_of_entries);
44948+
44949+ if (from == 0) {
44950+ /* entries from the head were removed - move remaining to the right */
44951+ memmove((char *)item_body_by_coord(coord) +
44952+ header_delta + entry_delta, item_body_by_coord(coord),
44953+ (unsigned)size);
44954+ if (REISER4_DEBUG)
44955+ memset(item_body_by_coord(coord), 0,
44956+ (unsigned)header_delta + entry_delta);
44957+ } else {
44958+ /* freed space is already at the end of item */
44959+ if (REISER4_DEBUG)
44960+ memset((char *)item_body_by_coord(coord) + size, 0,
44961+ (unsigned)header_delta + entry_delta);
44962+ }
44963+
44964+ return header_delta + entry_delta;
44965+}
44966+
44967+int kill_units_cde(coord_t * coord /* coord of item */ ,
44968+ pos_in_node_t from /* start unit pos */ ,
44969+ pos_in_node_t to /* stop unit pos */ ,
44970+ struct carry_kill_data *kdata UNUSED_ARG,
44971+ reiser4_key * smallest_removed, reiser4_key * new_first)
44972+{
44973+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
44974+}
44975+
44976+/* ->s.dir.extract_key() method for this item plugin. */
44977+int extract_key_cde(const coord_t * coord /* coord of item */ ,
44978+ reiser4_key * key /* resulting key */ )
44979+{
44980+ directory_entry_format *dent;
44981+
44982+ assert("nikita-1155", coord != NULL);
44983+ assert("nikita-1156", key != NULL);
44984+
44985+ dent = entry_at(coord, idx_of(coord));
44986+ return extract_key_from_id(&dent->id, key);
44987+}
44988+
44989+int
44990+update_key_cde(const coord_t * coord, const reiser4_key * key,
44991+ lock_handle * lh UNUSED_ARG)
44992+{
44993+ directory_entry_format *dent;
44994+ obj_key_id obj_id;
44995+ int result;
44996+
44997+ assert("nikita-2344", coord != NULL);
44998+ assert("nikita-2345", key != NULL);
44999+
45000+ dent = entry_at(coord, idx_of(coord));
45001+ result = build_obj_key_id(key, &obj_id);
45002+ if (result == 0) {
45003+ dent->id = obj_id;
45004+ znode_make_dirty(coord->node);
45005+ }
45006+ return 0;
45007+}
45008+
45009+/* ->s.dir.extract_name() method for this item plugin. */
45010+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45011+{
45012+ directory_entry_format *dent;
45013+
45014+ assert("nikita-1157", coord != NULL);
45015+
45016+ dent = entry_at(coord, idx_of(coord));
45017+ return extract_dent_name(coord, dent, buf);
45018+}
45019+
45020+static int cde_bytes(int pasting, const reiser4_item_data * data)
45021+{
45022+ int result;
45023+
45024+ result = data->length;
45025+ if (!pasting)
45026+ result -= sizeof(cde_item_format);
45027+ return result;
45028+}
45029+
45030+/* ->s.dir.add_entry() method for this item plugin */
45031+int add_entry_cde(struct inode *dir /* directory object */ ,
45032+ coord_t * coord /* coord of item */ ,
45033+ lock_handle * lh /* lock handle for insertion */ ,
45034+ const struct dentry *name /* name to insert */ ,
45035+ reiser4_dir_entry_desc * dir_entry /* parameters of new
45036+ * directory entry */ )
45037+{
45038+ reiser4_item_data data;
45039+ struct cde_entry entry;
45040+ struct cde_entry_data edata;
45041+ int result;
45042+
45043+ assert("nikita-1656", coord->node == lh->node);
45044+ assert("nikita-1657", znode_is_write_locked(coord->node));
45045+
45046+ edata.num_of_entries = 1;
45047+ edata.entry = &entry;
45048+
45049+ entry.dir = dir;
45050+ entry.obj = dir_entry->obj;
45051+ entry.name = &name->d_name;
45052+
45053+ data.data = (char *)&edata;
45054+ data.user = 0; /* &edata is not user space */
45055+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45056+ data.arg = dir_entry;
45057+ assert("nikita-1302", data.iplug != NULL);
45058+
45059+ result = is_dot_key(&dir_entry->key);
45060+ data.length = estimate_cde(result ? coord : NULL, &data);
45061+
45062+ /* NOTE-NIKITA quota plugin? */
45063+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45064+ return RETERR(-EDQUOT);
45065+
45066+ if (result)
45067+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45068+ else
45069+ result = reiser4_resize_item(coord, &data, &dir_entry->key,
45070+ lh, 0);
45071+ return result;
45072+}
45073+
45074+/* ->s.dir.rem_entry() */
45075+int rem_entry_cde(struct inode *dir /* directory of item */ ,
45076+ const struct qstr *name, coord_t * coord /* coord of item */ ,
45077+ lock_handle * lh UNUSED_ARG /* lock handle for
45078+ * removal */ ,
45079+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45080+ * directory entry
45081+ * being removed */ )
45082+{
45083+ coord_t shadow;
45084+ int result;
45085+ int length;
45086+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45087+
45088+ assert("nikita-2870", strlen(name->name) == name->len);
45089+ assert("nikita-2869",
45090+ !strcmp(name->name, extract_name_cde(coord, buf)));
45091+
45092+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45093+ if (is_longname(name->name, name->len))
45094+ length += name->len + 1;
45095+
45096+ if (inode_get_bytes(dir) < length) {
45097+ warning("nikita-2628", "Dir is broke: %llu: %llu",
45098+ (unsigned long long)get_inode_oid(dir),
45099+ inode_get_bytes(dir));
45100+
45101+ return RETERR(-EIO);
45102+ }
45103+
45104+ /* cut_node() is supposed to take pointers to _different_
45105+ coords, because it will modify them without respect to
45106+ possible aliasing. To work around this, create temporary copy
45107+ of @coord.
45108+ */
45109+ coord_dup(&shadow, coord);
45110+ result =
45111+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45112+ if (result == 0) {
45113+ /* NOTE-NIKITA quota plugin? */
45114+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
45115+ }
45116+ return result;
45117+}
45118+
45119+/* ->s.dir.max_name_len() method for this item plugin */
45120+int max_name_len_cde(const struct inode *dir /* directory */ )
45121+{
45122+ return
45123+ reiser4_tree_by_inode(dir)->nplug->max_item_size() -
45124+ sizeof(directory_entry_format) - sizeof(cde_item_format) -
45125+ sizeof(cde_unit_header) - 2;
45126+}
45127+
45128+/* Make Linus happy.
45129+ Local variables:
45130+ c-indentation-style: "K&R"
45131+ mode-name: "LC"
45132+ c-basic-offset: 8
45133+ tab-width: 8
45134+ fill-column: 120
45135+ End:
45136+*/
45137diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/cde.h linux-2.6.22/fs/reiser4/plugin/item/cde.h
45138--- linux-2.6.22.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 03:00:00.000000000 +0300
45139+++ linux-2.6.22/fs/reiser4/plugin/item/cde.h 2007-07-29 00:25:34.944714077 +0400
45140@@ -0,0 +1,87 @@
45141+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45142+
45143+/* Compound directory item. See cde.c for description. */
45144+
45145+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45146+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45147+
45148+#include "../../forward.h"
45149+#include "../../kassign.h"
45150+#include "../../dformat.h"
45151+
45152+#include <linux/fs.h> /* for struct inode */
45153+#include <linux/dcache.h> /* for struct dentry, etc */
45154+
45155+typedef struct cde_unit_header {
45156+ de_id hash;
45157+ d16 offset;
45158+} cde_unit_header;
45159+
45160+typedef struct cde_item_format {
45161+ d16 num_of_entries;
45162+ cde_unit_header entry[0];
45163+} cde_item_format;
45164+
45165+struct cde_entry {
45166+ const struct inode *dir;
45167+ const struct inode *obj;
45168+ const struct qstr *name;
45169+};
45170+
45171+struct cde_entry_data {
45172+ int num_of_entries;
45173+ struct cde_entry *entry;
45174+};
45175+
45176+/* plugin->item.b.* */
45177+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
45178+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
45179+ const reiser4_item_data *);
45180+int mergeable_cde(const coord_t * p1, const coord_t * p2);
45181+pos_in_node_t nr_units_cde(const coord_t * coord);
45182+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
45183+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
45184+void print_cde(const char *prefix, coord_t * coord);
45185+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
45186+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
45187+ coord_t * coord);
45188+int paste_cde(coord_t * coord, reiser4_item_data * data,
45189+ carry_plugin_info * info UNUSED_ARG);
45190+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
45191+ shift_direction pend, unsigned *size, unsigned want);
45192+void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
45193+ unsigned count, shift_direction where_is_free_space,
45194+ unsigned free_space);
45195+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45196+ struct carry_cut_data *, reiser4_key * smallest_removed,
45197+ reiser4_key * new_first);
45198+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45199+ struct carry_kill_data *, reiser4_key * smallest_removed,
45200+ reiser4_key * new_first);
45201+void print_cde(const char *prefix, coord_t * coord);
45202+int reiser4_check_cde(const coord_t * coord, const char **error);
45203+
45204+/* plugin->u.item.s.dir.* */
45205+int extract_key_cde(const coord_t * coord, reiser4_key * key);
45206+int update_key_cde(const coord_t * coord, const reiser4_key * key,
45207+ lock_handle * lh);
45208+char *extract_name_cde(const coord_t * coord, char *buf);
45209+int add_entry_cde(struct inode *dir, coord_t * coord,
45210+ lock_handle * lh, const struct dentry *name,
45211+ reiser4_dir_entry_desc * entry);
45212+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
45213+ lock_handle * lh, reiser4_dir_entry_desc * entry);
45214+int max_name_len_cde(const struct inode *dir);
45215+
45216+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
45217+#endif
45218+
45219+/* Make Linus happy.
45220+ Local variables:
45221+ c-indentation-style: "K&R"
45222+ mode-name: "LC"
45223+ c-basic-offset: 8
45224+ tab-width: 8
45225+ fill-column: 120
45226+ End:
45227+*/
45228diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.22/fs/reiser4/plugin/item/ctail.c
45229--- linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 03:00:00.000000000 +0300
45230+++ linux-2.6.22/fs/reiser4/plugin/item/ctail.c 2007-07-29 00:25:34.948715113 +0400
45231@@ -0,0 +1,1614 @@
45232+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45233+
45234+/* ctails (aka "clustered tails") are items for cryptcompress objects */
45235+
45236+/* DESCRIPTION:
45237+
45238+Each cryptcompress object is stored on disk as a set of clusters sliced
45239+into ctails.
45240+
45241+Internal on-disk structure:
45242+
45243+ HEADER (1) Here the disk cluster shift is stored
45244+ BODY
45245+*/
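/* Standalone sketch of the shift arithmetic this item relies on
 * (mirrors disk_cluster_size() and clust_by_coord() below; illustrative,
 * not part of the patch). */
#include <stdio.h>

int main(void)
{
	int cluster_shift = 16;		/* stored in the one-byte header */
	long long offset = 200000;	/* byte offset within the file */

	long long cluster_size = 1LL << cluster_shift;		/* 65536 */
	long long cluster_index = offset >> cluster_shift;	/* 3 */

	printf("size=%lld index=%lld\n", cluster_size, cluster_index);
	return 0;
}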
45246+
45247+#include "../../forward.h"
45248+#include "../../debug.h"
45249+#include "../../dformat.h"
45250+#include "../../kassign.h"
45251+#include "../../key.h"
45252+#include "../../coord.h"
45253+#include "item.h"
45254+#include "../node/node.h"
45255+#include "../plugin.h"
45256+#include "../object.h"
45257+#include "../../znode.h"
45258+#include "../../carry.h"
45259+#include "../../tree.h"
45260+#include "../../inode.h"
45261+#include "../../super.h"
45262+#include "../../context.h"
45263+#include "../../page_cache.h"
45264+#include "../cluster.h"
45265+#include "../../flush.h"
45266+#include "../../tree_walk.h"
45267+
45268+#include <linux/pagevec.h>
45269+#include <linux/swap.h>
45270+#include <linux/fs.h>
45271+
45272+/* return body of ctail item at @coord */
45273+static ctail_item_format *ctail_formatted_at(const coord_t * coord)
45274+{
45275+ assert("edward-60", coord != NULL);
45276+ return item_body_by_coord(coord);
45277+}
45278+
45279+static int cluster_shift_by_coord(const coord_t * coord)
45280+{
45281+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
45282+}
45283+
45284+static inline void dclust_set_extension_shift(hint_t * hint)
45285+{
45286+ assert("edward-1270",
45287+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
45288+ hint->ext_coord.extension.ctail.shift =
45289+ cluster_shift_by_coord(&hint->ext_coord.coord);
45290+}
45291+
45292+static loff_t off_by_coord(const coord_t * coord)
45293+{
45294+ reiser4_key key;
45295+ return get_key_offset(item_key_by_coord(coord, &key));
45296+}
45297+
45298+int coord_is_unprepped_ctail(const coord_t * coord)
45299+{
45300+ assert("edward-1233", coord != NULL);
45301+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
45302+ assert("edward-1235",
45303+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
45304+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
45305+
45306+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
45307+}
45308+
45309+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
45310+{
45311+ int shift;
45312+
45313+ if (inode != NULL) {
45314+ shift = inode_cluster_shift(inode);
45315+ assert("edward-1236",
45316+ ergo(!coord_is_unprepped_ctail(coord),
45317+ shift == cluster_shift_by_coord(coord)));
45318+ } else {
45319+ assert("edward-1237", !coord_is_unprepped_ctail(coord));
45320+ shift = cluster_shift_by_coord(coord);
45321+ }
45322+ return off_by_coord(coord) >> shift;
45323+}
45324+
45325+static int disk_cluster_size(const coord_t * coord)
45326+{
45327+ assert("edward-1156",
45328+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45329+ /* calculation of disk cluster size
45330+ is meaningless if ctail is unprepped */
45331+ assert("edward-1238", !coord_is_unprepped_ctail(coord));
45332+
45333+ return 1 << cluster_shift_by_coord(coord);
45334+}
45335+
45336+/* true if the key is of first disk cluster item */
45337+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
45338+{
45339+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
45340+
45341+ return coord_is_unprepped_ctail(coord) ||
45342+ ((get_key_offset(key) &
45343+ ((loff_t) disk_cluster_size(coord) - 1)) == 0);
45344+}
45345+
45346+static char *first_unit(coord_t * coord)
45347+{
45348+ /* FIXME: warning: pointer of type `void *' used in arithmetic */
45349+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
45350+}
45351+
45352+/* plugin->u.item.b.max_key_inside :
45353+ tail_max_key_inside */
45354+
45355+/* plugin->u.item.b.can_contain_key */
45356+int
45357+can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
45358+ const reiser4_item_data * data)
45359+{
45360+ reiser4_key item_key;
45361+
45362+ if (item_plugin_by_coord(coord) != data->iplug)
45363+ return 0;
45364+
45365+ item_key_by_coord(coord, &item_key);
45366+ if (get_key_locality(key) != get_key_locality(&item_key) ||
45367+ get_key_objectid(key) != get_key_objectid(&item_key))
45368+ return 0;
45369+ if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
45370+ get_key_offset(key))
45371+ return 0;
45372+ if (is_disk_cluster_key(key, coord))
45373+ return 0;
45374+ return 1;
45375+}
45376+
45377+/* plugin->u.item.b.mergeable */
45378+int mergeable_ctail(const coord_t * p1, const coord_t * p2)
45379+{
45380+ reiser4_key key1, key2;
45381+
45382+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
45383+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
45384+ UNIX_FILE_METADATA_ITEM_TYPE));
45385+
45386+ if (item_id_by_coord(p2) != CTAIL_ID) {
45387+ /* second item is of another type */
45388+ return 0;
45389+ }
45390+
45391+ item_key_by_coord(p1, &key1);
45392+ item_key_by_coord(p2, &key2);
45393+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
45394+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
45395+ get_key_type(&key1) != get_key_type(&key2)) {
45396+ /* items of different objects */
45397+ return 0;
45398+ }
45399+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
45400+ /* not adjacent items */
45401+ return 0;
45402+ if (is_disk_cluster_key(&key2, p2))
45403+ return 0;
45404+ return 1;
45405+}
45406+
45407+/* plugin->u.item.b.nr_units */
45408+pos_in_node_t nr_units_ctail(const coord_t * coord)
45409+{
45410+ return (item_length_by_coord(coord) -
45411+ sizeof(ctail_formatted_at(coord)->cluster_shift));
45412+}
45413+
45414+/* plugin->u.item.b.estimate:
45415+ estimate how much space is needed to insert/paste @data->length bytes
45416+ into ctail at @coord */
45417+int estimate_ctail(const coord_t * coord /* coord of item */ ,
45418+ const reiser4_item_data *
45419+ data /* parameters for new item */ )
45420+{
45421+ if (coord == NULL)
45422+ /* insert */
45423+ return (sizeof(ctail_item_format) + data->length);
45424+ else
45425+ /* paste */
45426+ return data->length;
45427+}
45428+
45429+/* ->init() method for this item plugin. */
45430+int init_ctail(coord_t * to /* coord of item */ ,
45431+ coord_t * from /* old_item */ ,
45432+ reiser4_item_data * data /* structure used for insertion */ )
45433+{
45434+ int cluster_shift; /* cpu value to convert */
45435+
45436+ if (data) {
45437+ assert("edward-463", data->length > sizeof(ctail_item_format));
45438+ cluster_shift = *((int *)(data->arg));
45439+ data->length -= sizeof(ctail_item_format);
45440+ } else {
45441+ assert("edward-464", from != NULL);
45442+ assert("edward-855", ctail_ok(from));
45443+ cluster_shift = (int)(cluster_shift_by_coord(from));
45444+ }
45445+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
45446+ assert("edward-856", ctail_ok(to));
45447+ return 0;
45448+}
45449+
45450+/* plugin->u.item.b.lookup:
45451+ NULL: We are looking for item keys only */
45452+
45453+#if REISER4_DEBUG
45454+int ctail_ok(const coord_t * coord)
45455+{
45456+ return coord_is_unprepped_ctail(coord) ||
45457+ cluster_shift_ok(cluster_shift_by_coord(coord));
45458+}
45459+
45460+/* plugin->u.item.b.check */
45461+int check_ctail(const coord_t * coord, const char **error)
45462+{
45463+ if (!ctail_ok(coord)) {
45464+ if (error)
45465+ *error = "bad cluster shift in ctail";
45466+ return 1;
45467+ }
45468+ return 0;
45469+}
45470+#endif
45471+
45472+/* plugin->u.item.b.paste */
45473+int
45474+paste_ctail(coord_t * coord, reiser4_item_data * data,
45475+ carry_plugin_info * info UNUSED_ARG)
45476+{
45477+ unsigned old_nr_units;
45478+
45479+ assert("edward-268", data->data != NULL);
45480+ /* copy only from kernel space */
45481+ assert("edward-66", data->user == 0);
45482+
45483+ old_nr_units =
45484+ item_length_by_coord(coord) - sizeof(ctail_item_format) -
45485+ data->length;
45486+
45487+ /* ctail items never get pasted in the middle */
45488+
45489+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
45490+
45491+ /* paste at the beginning when creating a new item */
45492+ assert("edward-450",
45493+ item_length_by_coord(coord) ==
45494+ data->length + sizeof(ctail_item_format));
45495+ assert("edward-451", old_nr_units == 0);
45496+ } else if (coord->unit_pos == old_nr_units - 1
45497+ && coord->between == AFTER_UNIT) {
45498+
45499+ /* paste at the end */
45500+ coord->unit_pos++;
45501+ } else
45502+ impossible("edward-453", "bad paste position");
45503+
45504+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
45505+
45506+ assert("edward-857", ctail_ok(coord));
45507+
45508+ return 0;
45509+}
45510+
45511+/* plugin->u.item.b.fast_paste */
45512+
45513+/* plugin->u.item.b.can_shift
45514+ number of units is returned via return value, number of bytes via @size. For
45515+ ctail items they coincide */
45516+int
45517+can_shift_ctail(unsigned free_space, coord_t * source,
45518+ znode * target, shift_direction direction UNUSED_ARG,
45519+ unsigned *size /* number of bytes */ , unsigned want)
45520+{
45521+ /* make sure that we do not want to shift more than we have */
45522+ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
45523+
45524+ *size = min(want, free_space);
45525+
45526+ if (!target) {
45527+ /* new item will be created */
45528+ if (*size <= sizeof(ctail_item_format)) {
45529+ *size = 0;
45530+ return 0;
45531+ }
45532+ return *size - sizeof(ctail_item_format);
45533+ }
45534+ return *size;
45535+}
45536+
45537+/* plugin->u.item.b.copy_units
45538+ cooperates with ->can_shift() */
45539+void
45540+copy_units_ctail(coord_t * target, coord_t * source,
45541+ unsigned from, unsigned count /* units */ ,
45542+ shift_direction where_is_free_space,
45543+ unsigned free_space /* bytes */ )
45544+{
45545+ /* make sure that item @target is expanded already */
45546+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
45547+ assert("edward-70", free_space == count || free_space == count + 1);
45548+
45549+ assert("edward-858", ctail_ok(source));
45550+
45551+ if (where_is_free_space == SHIFT_LEFT) {
45552+ /* append item @target with the first @count bytes of @source:
45553+ this restriction came from ordinary tails */
45554+ assert("edward-71", from == 0);
45555+ assert("edward-860", ctail_ok(target));
45556+
45557+ memcpy(first_unit(target) + nr_units_ctail(target) - count,
45558+ first_unit(source), count);
45559+ } else {
45560+ /* target item has already been moved to the right */
45561+ reiser4_key key;
45562+
45563+ assert("edward-72", nr_units_ctail(source) == from + count);
45564+
45565+ if (free_space == count) {
45566+ init_ctail(target, source, NULL);
45567+ } else {
45568+ /* new item has been created */
45569+ assert("edward-862", ctail_ok(target));
45570+ }
45571+ memcpy(first_unit(target), first_unit(source) + from, count);
45572+
45573+ assert("edward-863", ctail_ok(target));
45574+
45575+ /* new units are inserted before first unit in an item,
45576+ therefore, we have to update item key */
45577+ item_key_by_coord(source, &key);
45578+ set_key_offset(&key, get_key_offset(&key) + from);
45579+
45580+ node_plugin_by_node(target->node)->update_item_key(target, &key,
45581+ NULL /*info */);
45582+ }
45583+}
45584+
45585+/* plugin->u.item.b.create_hook */
45586+int create_hook_ctail(const coord_t * coord, void *arg)
45587+{
45588+ assert("edward-864", znode_is_loaded(coord->node));
45589+
45590+ znode_set_convertible(coord->node);
45591+ return 0;
45592+}
45593+
45594+/* plugin->u.item.b.kill_hook */
45595+int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
45596+ pos_in_node_t count, carry_kill_data * kdata)
45597+{
45598+ struct inode *inode;
45599+
45600+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
45601+ assert("edward-291", znode_is_write_locked(coord->node));
45602+
45603+ inode = kdata->inode;
45604+ if (inode) {
45605+ reiser4_key key;
45606+ struct cryptcompress_info * info;
45607+ cloff_t index;
45608+
45609+ item_key_by_coord(coord, &key);
45610+ info = cryptcompress_inode_data(inode);
45611+ index = off_to_clust(get_key_offset(&key), inode);
45612+
45613+ if (from == 0) {
45614+ info->trunc_index = index;
45615+ if (is_disk_cluster_key(&key, coord)) {
45616+ /*
45617+ * first item of disk cluster is to be killed
45618+ */
45619+ truncate_complete_page_cluster(
45620+ inode, index, kdata->params.truncate);
45621+ inode_sub_bytes(inode,
45622+ inode_cluster_size(inode));
45623+ }
45624+ }
45625+ }
45626+ return 0;
45627+}
45628+
45629+/* for shift_hook_ctail(),
45630+ return true if the first disk cluster item has dirty child
45631+*/
45632+static int ctail_convertible(const coord_t * coord)
45633+{
45634+ int result;
45635+ reiser4_key key;
45636+ jnode *child = NULL;
45637+
45638+ assert("edward-477", coord != NULL);
45639+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
45640+
45641+ if (coord_is_unprepped_ctail(coord))
45642+ /* unprepped ctail should be converted */
45643+ return 1;
45644+
45645+ item_key_by_coord(coord, &key);
45646+ child = jlookup(current_tree,
45647+ get_key_objectid(&key),
45648+ off_to_pg(off_by_coord(coord)));
45649+ if (!child)
45650+ return 0;
45651+ result = JF_ISSET(child, JNODE_DIRTY);
45652+ jput(child);
45653+ return result;
45654+}
45655+
45656+/* FIXME-EDWARD */
45657+/* plugin->u.item.b.shift_hook */
45658+int shift_hook_ctail(const coord_t * item /* coord of item */ ,
45659+ unsigned from UNUSED_ARG /* start unit */ ,
45660+ unsigned count UNUSED_ARG /* stop unit */ ,
45661+ znode * old_node /* old parent */ )
45662+{
45663+ assert("edward-479", item != NULL);
45664+ assert("edward-480", item->node != old_node);
45665+
45666+ if (!znode_convertible(old_node) || znode_convertible(item->node))
45667+ return 0;
45668+ if (ctail_convertible(item))
45669+ znode_set_convertible(item->node);
45670+ return 0;
45671+}
45672+
45673+static int
45674+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45675+ int cut, void *p, reiser4_key * smallest_removed,
45676+ reiser4_key * new_first)
45677+{
45678+ pos_in_node_t count; /* number of units to cut */
45679+ char *item;
45680+
45681+ count = to - from + 1;
45682+ item = item_body_by_coord(coord);
45683+
45684+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
45685+
45686+ if (smallest_removed) {
45687+ /* store smallest key removed */
45688+ item_key_by_coord(coord, smallest_removed);
45689+ set_key_offset(smallest_removed,
45690+ get_key_offset(smallest_removed) + from);
45691+ }
45692+
45693+ if (new_first) {
45694+ assert("vs-1531", from == 0);
45695+
45696+ item_key_by_coord(coord, new_first);
45697+ set_key_offset(new_first,
45698+ get_key_offset(new_first) + from + count);
45699+ }
45700+
45701+ if (!cut)
45702+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
45703+
45704+ if (from == 0) {
45705+ if (count != nr_units_ctail(coord)) {
45706+ /* part of the item is removed, so move free space to the beginning
45707+ of the item and update the item key */
45708+ reiser4_key key;
45709+ memcpy(item + to + 1, item, sizeof(ctail_item_format));
45710+ item_key_by_coord(coord, &key);
45711+ set_key_offset(&key, get_key_offset(&key) + count);
45712+ node_plugin_by_node(coord->node)->update_item_key(coord,
45713+ &key,
45714+ NULL);
45715+ } else {
45716+ /* cut_units should not be called to cut everything */
45717+ assert("vs-1532", ergo(cut, 0));
45718+ /* whole item is cut, so more than the amount of space occupied
45719+ by units gets freed */
45720+ count += sizeof(ctail_item_format);
45721+ }
45722+ if (REISER4_DEBUG)
45723+ memset(item, 0, count);
45724+ } else if (REISER4_DEBUG)
45725+ memset(item + sizeof(ctail_item_format) + from, 0, count);
45726+ return count;
45727+}
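/*
 * Hedged sketch (illustration only, not part of the patch): the head-cut
 * case of cut_or_kill_ctail_units() above. When units [0 .. to] are removed
 * from the head of a ctail item, the one-byte item header
 * (ctail_item_format) is copied to just past the removed units so the
 * surviving units stay contiguous behind it, and the item key offset grows
 * by the number of removed units. Plain user-space C modelling the byte
 * arithmetic only; names mirror the patch.
 */
#include <stdint.h>
#include <string.h>

static void demo_ctail_head_cut(uint8_t *item, unsigned to,
				uint64_t *key_offset)
{
	unsigned count = to + 1;	/* units removed from the head */

	/* relocate the header past the removed units */
	memcpy(item + count, item, 1 /* sizeof(ctail_item_format) */);
	/* the item key now names the first surviving unit */
	*key_offset += count;
}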
45728+
45729+/* plugin->u.item.b.cut_units */
45730+int
45731+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45732+ carry_cut_data * cdata, reiser4_key * smallest_removed,
45733+ reiser4_key * new_first)
45734+{
45735+ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
45736+ smallest_removed, new_first);
45737+}
45738+
45739+/* plugin->u.item.b.kill_units */
45740+int
45741+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
45742+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
45743+ reiser4_key * new_first)
45744+{
45745+ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
45746+ smallest_removed, new_first);
45747+}
45748+
45749+/* plugin->u.item.s.file.read */
45750+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
45751+{
45752+ uf_coord_t *uf_coord;
45753+ coord_t *coord;
45754+
45755+ uf_coord = &hint->ext_coord;
45756+ coord = &uf_coord->coord;
45757+ assert("edward-127", f->user == 0);
45758+ assert("edward-129", coord && coord->node);
45759+ assert("edward-130", coord_is_existing_unit(coord));
45760+ assert("edward-132", znode_is_loaded(coord->node));
45761+
45762+ /* start read only from the beginning of ctail */
45763+ assert("edward-133", coord->unit_pos == 0);
45764+ /* read only whole ctails */
45765+ assert("edward-135", nr_units_ctail(coord) <= f->length);
45766+
 45767+ assert("edward-136", reiser4_schedulable());
45768+ assert("edward-886", ctail_ok(coord));
45769+
45770+ if (f->data)
45771+ memcpy(f->data, (char *)first_unit(coord),
45772+ (size_t) nr_units_ctail(coord));
45773+
 45774+ dclust_set_extension_shift(hint);
45775+ mark_page_accessed(znode_page(coord->node));
45776+ move_flow_forward(f, nr_units_ctail(coord));
45777+
45778+ return 0;
45779+}
45780+
45781+/**
45782+ * Prepare transform stream with plain text for page
45783+ * @page taking into account synchronization issues.
45784+ */
45785+int ctail_read_disk_cluster(struct cluster_handle * clust, struct inode * inode,
45786+ struct page * page, znode_lock_mode mode)
45787+{
45788+ int result;
45789+
 45790+ assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
45791+ assert("edward-671", clust->hint != NULL);
45792+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
45793+ assert("edward-672", cryptcompress_inode_ok(inode));
45794+ assert("edward-1527", PageLocked(page));
45795+
45796+ unlock_page(page);
45797+
45798+ /* set input stream */
45799+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
45800+ if (result) {
45801+ lock_page(page);
 45802+ return result;
45803+ }
45804+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
45805+ lock_page(page);
45806+ if (result)
45807+ return result;
45808+ /*
45809+ * at this point we have locked position in the tree
45810+ */
45811+ assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
 45812+
45813+ if (page->mapping != inode->i_mapping) {
45814+ /* page was truncated */
45815+ reiser4_unset_hint(clust->hint);
45816+ reset_cluster_params(clust);
45817+ return AOP_TRUNCATED_PAGE;
45818+ }
45819+ if (PageUptodate(page)) {
45820+ /* disk cluster can be obsolete, don't use it! */
45821+ reiser4_unset_hint(clust->hint);
45822+ reset_cluster_params(clust);
45823+ return 0;
45824+ }
 45825+ if (clust->dstat == FAKE_DISK_CLUSTER ||
45826+ clust->dstat == UNPR_DISK_CLUSTER ||
45827+ clust->dstat == TRNC_DISK_CLUSTER) {
45828+ /*
45829+ * this information about disk cluster will be valid
45830+ * as long as we keep the position in the tree locked
45831+ */
45832+ tfm_cluster_set_uptodate(&clust->tc);
45833+ return 0;
45834+ }
 45835+ /* now prepare output stream.. */
45836+ result = grab_coa(&clust->tc, inode_compression_plugin(inode));
45837+ if (result)
45838+ return result;
45839+ /* ..and fill this with plain text */
45840+ result = reiser4_inflate_cluster(clust, inode);
45841+ if (result)
45842+ return result;
45843+ /*
45844+ * The stream is ready! It won't be obsolete as
45845+ * long as we keep last disk cluster item locked.
45846+ */
45847+ tfm_cluster_set_uptodate(&clust->tc);
45848+ return 0;
45849+}
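/*
 * Descriptive note on the return paths above: AOP_TRUNCATED_PAGE means the
 * page was truncated under us and the caller must retry; an uptodate page
 * means somebody else filled it and the disk cluster may be obsolete; FAKE,
 * UNPR and TRNC disk clusters carry no stream to inflate, so only the PREP
 * case reaches reiser4_inflate_cluster().
 */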
45850+
45851+/*
45852+ * fill one page with plain text.
45853+ */
45854+int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
45855+ struct page *page, znode_lock_mode mode)
45856+{
45857+ int ret;
45858+ unsigned cloff;
45859+ char *data;
45860+ size_t to_page;
45861+ struct tfm_cluster * tc = &clust->tc;
45862+
45863+ assert("edward-212", PageLocked(page));
45864+
45865+ if (unlikely(page->mapping != inode->i_mapping))
45866+ return AOP_TRUNCATED_PAGE;
45867+ if (PageUptodate(page))
45868+ goto exit;
45869+ to_page = pbytes(page_index(page), inode);
45870+ if (to_page == 0) {
45871+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
45872+ SetPageUptodate(page);
45873+ goto exit;
45874+ }
45875+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
45876+ clust->index = pg_to_clust(page->index, inode);
45877+
45878+ /* this will unlock/lock the page */
45879+ ret = ctail_read_disk_cluster(clust, inode, page, mode);
45880+
45881+ assert("edward-212", PageLocked(page));
45882+ if (ret)
45883+ return ret;
45884+
45885+ /* refresh bytes */
45886+ to_page = pbytes(page_index(page), inode);
45887+ if (to_page == 0) {
45888+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
45889+ SetPageUptodate(page);
45890+ goto exit;
45891+ }
45892+ }
45893+ if (PageUptodate(page))
 45894+ /* somebody else filled it already */
45895+ goto exit;
45896+
 45897+ assert("edward-119", tfm_cluster_is_uptodate(tc));
 45898+ assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
45899+
45900+ switch (clust->dstat) {
45901+ case UNPR_DISK_CLUSTER:
45902+ BUG_ON(1);
45903+ case TRNC_DISK_CLUSTER:
45904+ /*
45905+ * Race with truncate!
45906+ * We resolve it in favour of the last one (the only way,
45907+ * as in this case plain text is unrecoverable)
45908+ */
45909+ case FAKE_DISK_CLUSTER:
45910+ /* fill the page by zeroes */
 45911+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
45912+ SetPageUptodate(page);
45913+ break;
45914+ case PREP_DISK_CLUSTER:
 45915+ /* fill the page from the transform stream with plain text */
45916+ assert("edward-1058", !PageUptodate(page));
45917+ assert("edward-120", tc->len <= inode_cluster_size(inode));
45918+
 45919+ /* byte offset of the page in this logical cluster */
45920+ cloff = pg_to_off_to_cloff(page->index, inode);
45921+
45922+ data = kmap(page);
45923+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
45924+ memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
45925+ flush_dcache_page(page);
45926+ kunmap(page);
45927+ SetPageUptodate(page);
45928+ break;
45929+ default:
45930+ impossible("edward-1169", "bad disk cluster state");
45931+ }
45932+ exit:
45933+ return 0;
45934+}
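/*
 * Hedged sketch (an assumption, not taken from the patch): the offset
 * arithmetic pg_to_off_to_cloff() is presumed to perform above -- the byte
 * offset of a page inside its logical cluster, given the per-inode cluster
 * shift. 4K pages (PAGE_SHIFT == 12) are assumed for the example.
 */
static unsigned long demo_off_in_cluster(unsigned long page_index,
					 int cluster_shift)
{
	unsigned long off = page_index << 12;	   /* byte offset in file */

	return off & ((1UL << cluster_shift) - 1); /* offset in cluster */
}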
45935+
45936+/* plugin->u.item.s.file.readpage */
45937+int readpage_ctail(void *vp, struct page *page)
45938+{
45939+ int result;
45940+ hint_t * hint;
45941+ struct cluster_handle * clust = vp;
45942+
45943+ assert("edward-114", clust != NULL);
45944+ assert("edward-115", PageLocked(page));
45945+ assert("edward-116", !PageUptodate(page));
45946+ assert("edward-118", page->mapping && page->mapping->host);
45947+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
45948+
45949+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
45950+ if (hint == NULL) {
45951+ unlock_page(page);
 45952+ return RETERR(-ENOMEM);
 45953+ }
45954+ clust->hint = hint;
45955+ result = load_file_hint(clust->file, hint);
45956+ if (result) {
45957+ kfree(hint);
 45958+ unlock_page(page);
45959+ return result;
45960+ }
45961+ assert("vs-25", hint->ext_coord.lh == &hint->lh);
 45962+
45963+ result = do_readpage_ctail(page->mapping->host, clust, page,
45964+ ZNODE_READ_LOCK);
45965+ assert("edward-213", PageLocked(page));
45966+ assert("edward-1163", ergo(!result, PageUptodate(page)));
45967+
45968+ unlock_page(page);
45969+ done_lh(&hint->lh);
45970+ hint->ext_coord.valid = 0;
45971+ save_file_hint(clust->file, hint);
45972+ kfree(hint);
45973+ tfm_cluster_clr_uptodate(&clust->tc);
45974+
45975+ return result;
45976+}
45977+
45978+/* Helper function for ->readpages() */
45979+static int ctail_read_page_cluster(struct cluster_handle * clust,
45980+ struct inode *inode)
45981+{
45982+ int i;
45983+ int result;
45984+ assert("edward-779", clust != NULL);
45985+ assert("edward-1059", clust->win == NULL);
45986+ assert("edward-780", inode != NULL);
45987+
 45988+ result = prepare_page_cluster(inode, clust, READ_OP);
45989+ if (result)
45990+ return result;
45991+
45992+ assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
45993+
45994+ for (i = 0; i < clust->nr_pages; i++) {
45995+ struct page *page = clust->pages[i];
45996+ lock_page(page);
 45997+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
45998+ unlock_page(page);
45999+ if (result)
46000+ break;
46001+ }
46002+ tfm_cluster_clr_uptodate(&clust->tc);
 46003+ put_page_cluster(clust, inode, READ_OP);
46004+ return result;
46005+}
46006+
46007+/* filler for read_cache_pages() */
46008+static int ctail_readpages_filler(void * data, struct page * page)
46009+{
46010+ int ret = 0;
46011+ struct cluster_handle * clust = data;
46012+ struct inode * inode = clust->file->f_dentry->d_inode;
 46013+
 46014+ assert("edward-1525", page->mapping == inode->i_mapping);
 46015+
46016+ if (PageUptodate(page)) {
46017+ unlock_page(page);
46018+ return 0;
46019+ }
46020+ if (pbytes(page_index(page), inode) == 0) {
46021+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
46022+ SetPageUptodate(page);
46023+ unlock_page(page);
46024+ return 0;
46025+ }
46026+ move_cluster_forward(clust, inode, page->index);
46027+ unlock_page(page);
46028+ /*
46029+ * read the whole page cluster
46030+ */
46031+ ret = ctail_read_page_cluster(clust, inode);
46032+
46033+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
46034+ return ret;
46035+}
46036+
46037+/*
 46038+ * We populate a bit more than the upper readahead code suggests:
 46039+ * with each nominated page we read the whole page cluster
 46040+ * that page belongs to.
46041+ */
46042+int readpages_ctail(struct file *file, struct address_space *mapping,
46043+ struct list_head *pages)
46044+{
46045+ int ret = 0;
46046+ hint_t *hint;
 46047+ struct cluster_handle clust;
 46048+ struct inode *inode = mapping->host;
 46049+
46050+ assert("edward-1521", inode == file->f_dentry->d_inode);
46051+
 46052+ cluster_init_read(&clust, NULL);
46053+ clust.file = file;
46054+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46055+ if (hint == NULL) {
46056+ warning("vs-28", "failed to allocate hint");
 46057+ ret = RETERR(-ENOMEM);
46058+ goto exit1;
46059+ }
46060+ clust.hint = hint;
46061+ ret = load_file_hint(clust.file, hint);
46062+ if (ret) {
46063+ warning("edward-1522", "failed to load hint");
 46064+ goto exit2;
46065+ }
46066+ assert("vs-26", hint->ext_coord.lh == &hint->lh);
 46067+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46068+ if (ret) {
46069+ warning("edward-1523", "failed to alloc pgset");
 46070+ goto exit3;
 46071+ }
46072+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
46073+
46074+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46075+ exit3:
46076+ done_lh(&hint->lh);
 46077+ save_file_hint(file, hint);
46078+ hint->ext_coord.valid = 0;
46079+ exit2:
46080+ kfree(hint);
46081+ exit1:
 46082+ put_cluster_handle(&clust);
 46083+ return ret;
46084+}
46085+
46086+/*
46087+ plugin->u.item.s.file.append_key
46088+ key of the first item of the next disk cluster
46089+*/
46090+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46091+{
46092+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46093+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46094+
46095+ item_key_by_coord(coord, key);
46096+ set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
46097+ << cluster_shift_by_coord(coord));
46098+ return key;
46099+}
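/*
 * Worked example for the key arithmetic above (illustration only): with a
 * 64K disk cluster (cluster shift 16), the item of logical cluster 2 gets
 * an append key offset of (2 + 1) << 16 == 196608, the first byte of the
 * next disk cluster.
 */
static unsigned long long demo_append_key_offset(unsigned long long clust,
						 int cluster_shift)
{
	return (clust + 1) << cluster_shift;
}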
46100+
46101+static int insert_unprepped_ctail(struct cluster_handle * clust,
46102+ struct inode *inode)
46103+{
46104+ int result;
46105+ char buf[UCTAIL_NR_UNITS];
46106+ reiser4_item_data data;
46107+ reiser4_key key;
46108+ int shift = (int)UCTAIL_SHIFT;
46109+
46110+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46111+ result = key_by_inode_cryptcompress(inode,
46112+ clust_to_off(clust->index, inode),
46113+ &key);
46114+ if (result)
46115+ return result;
46116+ data.user = 0;
46117+ data.iplug = item_plugin_by_id(CTAIL_ID);
46118+ data.arg = &shift;
46119+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46120+ data.data = buf;
46121+
46122+ result = insert_by_coord(&clust->hint->ext_coord.coord,
46123+ &data, &key, clust->hint->ext_coord.lh, 0);
46124+ return result;
46125+}
46126+
46127+static int
46128+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46129+ struct inode *inode)
46130+{
46131+ int result;
46132+ carry_pool *pool;
46133+ carry_level *lowest_level;
46134+ reiser4_item_data *data;
46135+ carry_op *op;
46136+ int cluster_shift = inode_cluster_shift(inode);
46137+
46138+ pool =
46139+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46140+ sizeof(*data));
46141+ if (IS_ERR(pool))
46142+ return PTR_ERR(pool);
46143+ lowest_level = (carry_level *) (pool + 1);
46144+ init_carry_level(lowest_level, pool);
46145+ data = (reiser4_item_data *) (lowest_level + 3);
46146+
46147+ assert("edward-466", coord->between == AFTER_ITEM
46148+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46149+ || coord->between == EMPTY_NODE
46150+ || coord->between == BEFORE_UNIT);
46151+
46152+ if (coord->between == AFTER_UNIT) {
46153+ coord->unit_pos = 0;
46154+ coord->between = AFTER_ITEM;
46155+ }
46156+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46157+ 0 /* operate directly on coord -> node */);
46158+ if (IS_ERR(op) || (op == NULL)) {
46159+ done_carry_pool(pool);
46160+ return RETERR(op ? PTR_ERR(op) : -EIO);
46161+ }
46162+ data->user = 0;
46163+ data->iplug = item_plugin_by_id(CTAIL_ID);
46164+ data->arg = &cluster_shift;
46165+
46166+ data->length = 0;
46167+ data->data = NULL;
46168+
46169+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46170+ op->u.insert_flow.insert_point = coord;
46171+ op->u.insert_flow.flow = f;
46172+ op->u.insert_flow.data = data;
46173+ op->u.insert_flow.new_nodes = 0;
46174+
46175+ lowest_level->track_type = CARRY_TRACK_CHANGE;
46176+ lowest_level->tracked = lh;
46177+
 46178+ result = reiser4_carry(lowest_level, NULL);
46179+ done_carry_pool(pool);
46180+
46181+ return result;
46182+}
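/*
 * Illustrative sketch (not kernel code): the single-allocation layout used
 * by insert_cryptcompress_flow() above. One buffer holds the pool header,
 * three carry levels and the item data; the members are located by pointer
 * arithmetic from the start, exactly as in the patch.
 */
#include <stdlib.h>

struct demo_pool { int dummy; };
struct demo_level { int dummy; };
struct demo_idata { int dummy; };

static void demo_carry_pool_layout(void)
{
	struct demo_pool *pool = malloc(sizeof(*pool) +
					3 * sizeof(struct demo_level) +
					sizeof(struct demo_idata));
	/* levels live immediately after the pool header */
	struct demo_level *lowest = (struct demo_level *)(pool + 1);
	/* item data lives after the three levels */
	struct demo_idata *data = (struct demo_idata *)(lowest + 3);

	(void)data;
	free(pool);
}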
46183+
46184+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46185+static int insert_cryptcompress_flow_in_place(coord_t * coord,
46186+ lock_handle * lh, flow_t * f,
46187+ struct inode *inode)
46188+{
46189+ int ret;
46190+ coord_t pos;
46191+ lock_handle lock;
46192+
46193+ assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46194+ assert("edward-484", coord->between == AT_UNIT
46195+ || coord->between == AFTER_ITEM);
46196+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46197+
46198+ coord_dup(&pos, coord);
46199+ pos.unit_pos = 0;
46200+ pos.between = AFTER_ITEM;
46201+
46202+ init_lh(&lock);
46203+ copy_lh(&lock, lh);
46204+
 46205+ ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
46206+ done_lh(&lock);
46207+ assert("edward-1347", znode_is_write_locked(lh->node));
46208+ assert("edward-1228", !ret);
46209+ return ret;
46210+}
46211+
46212+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46213+static int overwrite_ctail(coord_t * coord, flow_t * f)
46214+{
46215+ unsigned count;
46216+
46217+ assert("edward-269", f->user == 0);
46218+ assert("edward-270", f->data != NULL);
46219+ assert("edward-271", f->length > 0);
46220+ assert("edward-272", coord_is_existing_unit(coord));
46221+ assert("edward-273", coord->unit_pos == 0);
46222+ assert("edward-274", znode_is_write_locked(coord->node));
 46223+ assert("edward-275", reiser4_schedulable());
46224+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
46225+ assert("edward-1243", ctail_ok(coord));
46226+
46227+ count = nr_units_ctail(coord);
46228+
46229+ if (count > f->length)
46230+ count = f->length;
46231+ memcpy(first_unit(coord), f->data, count);
46232+ move_flow_forward(f, count);
46233+ coord->unit_pos += count;
46234+ return 0;
46235+}
46236+
46237+/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
46238+ cut ctail (part or whole) starting from next unit position */
46239+static int cut_ctail(coord_t * coord)
46240+{
46241+ coord_t stop;
46242+
46243+ assert("edward-435", coord->between == AT_UNIT &&
46244+ coord->item_pos < coord_num_items(coord) &&
46245+ coord->unit_pos <= coord_num_units(coord));
46246+
46247+ if (coord->unit_pos == coord_num_units(coord))
46248+ /* nothing to cut */
46249+ return 0;
46250+ coord_dup(&stop, coord);
46251+ stop.unit_pos = coord_last_unit_pos(coord);
46252+
46253+ return cut_node_content(coord, &stop, NULL, NULL, NULL);
46254+}
46255+
46256+int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
46257+ struct inode * inode)
46258+{
46259+ int result;
46260+ assert("edward-1244", inode != NULL);
46261+ assert("edward-1245", clust->hint != NULL);
46262+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
46263+ assert("edward-1247", clust->reserved == 1);
46264+
46265+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
46266+ if (cbk_errored(result))
46267+ return result;
46268+ assert("edward-1249", result == CBK_COORD_NOTFOUND);
46269+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
46270+
46271+ assert("edward-1295",
46272+ clust->hint->ext_coord.lh->node ==
46273+ clust->hint->ext_coord.coord.node);
46274+
46275+ coord_set_between_clusters(&clust->hint->ext_coord.coord);
46276+
46277+ result = insert_unprepped_ctail(clust, inode);
46278+ all_grabbed2free();
46279+
46280+ assert("edward-1251", !result);
 46281+ assert("edward-1252", cryptcompress_inode_ok(inode));
46282+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
46283+ assert("edward-1254",
46284+ reiser4_clustered_blocks(reiser4_get_current_sb()));
46285+ assert("edward-1255",
46286+ znode_convertible(clust->hint->ext_coord.coord.node));
46287+
46288+ return result;
46289+}
46290+
 46291+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
46292+{
46293+ int result = 0;
 46294+ struct convert_item_info * info;
46295+
46296+ assert("edward-468", pos != NULL);
46297+ assert("edward-469", pos->sq != NULL);
46298+ assert("edward-845", item_convert_data(pos) != NULL);
46299+
46300+ info = item_convert_data(pos);
46301+ assert("edward-679", info->flow.data != NULL);
46302+
46303+ switch (mode) {
46304+ case CRC_APPEND_ITEM:
46305+ assert("edward-1229", info->flow.length != 0);
46306+ assert("edward-1256",
46307+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
46308+ result =
46309+ insert_cryptcompress_flow_in_place(&pos->coord,
46310+ &pos->lock,
46311+ &info->flow,
46312+ info->inode);
46313+ break;
46314+ case CRC_OVERWRITE_ITEM:
46315+ assert("edward-1230", info->flow.length != 0);
46316+ overwrite_ctail(&pos->coord, &info->flow);
46317+ if (info->flow.length != 0)
46318+ break;
46319+ case CRC_CUT_ITEM:
46320+ assert("edward-1231", info->flow.length == 0);
46321+ result = cut_ctail(&pos->coord);
46322+ break;
46323+ default:
46324+ result = RETERR(-EIO);
46325+ impossible("edward-244", "bad convert mode");
46326+ }
46327+ return result;
46328+}
46329+
46330+/* plugin->u.item.f.scan */
46331+int scan_ctail(flush_scan * scan)
46332+{
46333+ int result = 0;
46334+ struct page *page;
46335+ struct inode *inode;
46336+ jnode *node = scan->node;
46337+
46338+ assert("edward-227", scan->node != NULL);
46339+ assert("edward-228", jnode_is_cluster_page(scan->node));
46340+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
46341+
46342+ page = jnode_page(node);
46343+ inode = page->mapping->host;
46344+
 46345+ if (!reiser4_scanning_left(scan))
46346+ return result;
46347+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
46348+ znode_make_dirty(scan->parent_lock.node);
46349+
46350+ if (!znode_convertible(scan->parent_lock.node)) {
46351+ if (JF_ISSET(scan->node, JNODE_DIRTY))
46352+ znode_set_convertible(scan->parent_lock.node);
46353+ else {
46354+ warning("edward-681",
46355+ "cluster page is already processed");
46356+ return -EAGAIN;
46357+ }
46358+ }
46359+ return result;
46360+}
46361+
46362+/* If true, this function attaches children */
46363+static int should_attach_convert_idata(flush_pos_t * pos)
46364+{
46365+ int result;
46366+ assert("edward-431", pos != NULL);
46367+ assert("edward-432", pos->child == NULL);
46368+ assert("edward-619", znode_is_write_locked(pos->coord.node));
46369+ assert("edward-470",
46370+ item_plugin_by_coord(&pos->coord) ==
46371+ item_plugin_by_id(CTAIL_ID));
46372+
46373+ /* check for leftmost child */
46374+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
46375+
46376+ if (!pos->child)
46377+ return 0;
46378+ spin_lock_jnode(pos->child);
46379+ result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
46380+ pos->child->atom == ZJNODE(pos->coord.node)->atom);
46381+ spin_unlock_jnode(pos->child);
46382+ if (!result && pos->child) {
 46383+ /* existing child is not to be attached, release it */
46384+ jput(pos->child);
46385+ pos->child = NULL;
46386+ }
46387+ return result;
46388+}
46389+
46390+/* plugin->init_convert_data() */
46391+static int
 46392+init_convert_data_ctail(struct convert_item_info * idata, struct inode *inode)
46393+{
46394+ assert("edward-813", idata != NULL);
46395+ assert("edward-814", inode != NULL);
46396+
46397+ idata->inode = inode;
46398+ idata->d_cur = DC_FIRST_ITEM;
46399+ idata->d_next = DC_INVALID_STATE;
46400+
46401+ return 0;
46402+}
46403+
 46404+static int alloc_item_convert_data(struct convert_info * sq)
46405+{
46406+ assert("edward-816", sq != NULL);
46407+ assert("edward-817", sq->itm == NULL);
46408+
 46409+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
46410+ if (sq->itm == NULL)
46411+ return RETERR(-ENOMEM);
46412+ return 0;
46413+}
46414+
 46415+static void free_item_convert_data(struct convert_info * sq)
46416+{
46417+ assert("edward-818", sq != NULL);
46418+ assert("edward-819", sq->itm != NULL);
46419+ assert("edward-820", sq->iplug != NULL);
46420+
46421+ kfree(sq->itm);
46422+ sq->itm = NULL;
46423+ return;
46424+}
46425+
46426+static int alloc_convert_data(flush_pos_t * pos)
46427+{
46428+ assert("edward-821", pos != NULL);
46429+ assert("edward-822", pos->sq == NULL);
46430+
 46431+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
46432+ if (!pos->sq)
46433+ return RETERR(-ENOMEM);
46434+ memset(pos->sq, 0, sizeof(*pos->sq));
 46435+ cluster_init_write(&pos->sq->clust, NULL);
46436+ return 0;
46437+}
46438+
46439+void free_convert_data(flush_pos_t * pos)
46440+{
 46441+ struct convert_info *sq;
46442+
46443+ assert("edward-823", pos != NULL);
46444+ assert("edward-824", pos->sq != NULL);
46445+
46446+ sq = pos->sq;
46447+ if (sq->itm)
46448+ free_item_convert_data(sq);
46449+ put_cluster_handle(&sq->clust);
46450+ kfree(pos->sq);
46451+ pos->sq = NULL;
46452+ return;
46453+}
46454+
46455+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
46456+{
 46457+ struct convert_info *sq;
46458+
46459+ assert("edward-825", pos != NULL);
46460+ assert("edward-826", pos->sq != NULL);
46461+ assert("edward-827", item_convert_data(pos) != NULL);
46462+ assert("edward-828", inode != NULL);
46463+
46464+ sq = pos->sq;
46465+
46466+ memset(sq->itm, 0, sizeof(*sq->itm));
46467+
46468+ /* iplug->init_convert_data() */
46469+ return init_convert_data_ctail(sq->itm, inode);
46470+}
46471+
46472+/* create and attach disk cluster info used by 'convert' phase of the flush
46473+ squalloc() */
46474+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
46475+{
46476+ int ret = 0;
46477+ struct convert_item_info *info;
46478+ struct cluster_handle *clust;
46479+ file_plugin *fplug = inode_file_plugin(inode);
46480+ compression_plugin *cplug = inode_compression_plugin(inode);
46481+
46482+ assert("edward-248", pos != NULL);
46483+ assert("edward-249", pos->child != NULL);
46484+ assert("edward-251", inode != NULL);
46485+ assert("edward-682", cryptcompress_inode_ok(inode));
46486+ assert("edward-252",
46487+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
46488+ assert("edward-473",
46489+ item_plugin_by_coord(&pos->coord) ==
46490+ item_plugin_by_id(CTAIL_ID));
46491+
46492+ if (!pos->sq) {
46493+ ret = alloc_convert_data(pos);
46494+ if (ret)
46495+ return ret;
46496+ }
46497+ clust = &pos->sq->clust;
46498+ ret = grab_coa(&clust->tc, cplug);
46499+ if (ret)
46500+ goto err;
46501+ ret = set_cluster_by_page(clust,
46502+ jnode_page(pos->child),
46503+ MAX_CLUSTER_NRPAGES);
46504+ if (ret)
46505+ goto err;
46506+
46507+ assert("edward-829", pos->sq != NULL);
46508+ assert("edward-250", item_convert_data(pos) == NULL);
46509+
46510+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
46511+
46512+ ret = alloc_item_convert_data(pos->sq);
46513+ if (ret)
46514+ goto err;
46515+ ret = init_item_convert_data(pos, inode);
46516+ if (ret)
46517+ goto err;
46518+ info = item_convert_data(pos);
46519+
 46520+ ret = checkout_logical_cluster(clust, pos->child, inode);
46521+ if (ret)
46522+ goto err;
46523+
 46524+ reiser4_deflate_cluster(clust, inode);
46525+ inc_item_convert_count(pos);
46526+
 46527+ /* prepare flow for insertion */
46528+ fplug->flow_by_inode(info->inode,
46529+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
46530+ 0 /* kernel space */ ,
46531+ clust->tc.len,
46532+ clust_to_off(clust->index, inode),
46533+ WRITE_OP, &info->flow);
46534+ jput(pos->child);
46535+
 46536+ assert("edward-683", cryptcompress_inode_ok(inode));
46537+ return 0;
46538+ err:
46539+ jput(pos->child);
46540+ free_convert_data(pos);
46541+ return ret;
46542+}
46543+
 46544+/* clean up disk cluster info */
 46545+static void detach_convert_idata(struct convert_info * sq)
 46546+{
 46547+ struct convert_item_info *info;
46548+
46549+ assert("edward-253", sq != NULL);
46550+ assert("edward-840", sq->itm != NULL);
46551+
46552+ info = sq->itm;
46553+ assert("edward-255", info->inode != NULL);
46554+ assert("edward-1212", info->flow.length == 0);
46555+
46556+ free_item_convert_data(sq);
46557+ return;
46558+}
46559+
46560+/* plugin->u.item.f.utmost_child */
46561+
46562+/* This function sets leftmost child for a first cluster item,
46563+ if the child exists, and NULL in other cases.
46564+ NOTE-EDWARD: Do not call this for RIGHT_SIDE */
46565+
46566+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
46567+{
46568+ reiser4_key key;
46569+
46570+ item_key_by_coord(coord, &key);
46571+
46572+ assert("edward-257", coord != NULL);
46573+ assert("edward-258", child != NULL);
46574+ assert("edward-259", side == LEFT_SIDE);
46575+ assert("edward-260",
46576+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46577+
46578+ if (!is_disk_cluster_key(&key, coord))
46579+ *child = NULL;
46580+ else
46581+ *child = jlookup(current_tree,
46582+ get_key_objectid(item_key_by_coord
46583+ (coord, &key)),
46584+ off_to_pg(get_key_offset(&key)));
46585+ return 0;
46586+}
46587+
46588+/* Returns true if @p2 is the next item to @p1
46589+ in the _same_ disk cluster.
46590+ Disk cluster is a set of items. If ->clustered() != NULL,
46591+ with each item the whole disk cluster should be read/modified
46592+*/
46593+
 46594+/* Go rightward and check for the next disk cluster item; set
 46595+ * d_next to DC_CHAINED_ITEM if such an item exists.
 46596+ * If the current position is the last item, go to the right neighbor.
 46597+ * Skip empty nodes. Note that a right neighbor may not be in
 46598+ * the slum because of races. If so, make it dirty and
 46599+ * convertible.
 46600+ */
46601+static int next_item_dc_stat(flush_pos_t * pos)
46602+{
46603+ int ret = 0;
46604+ int stop = 0;
46605+ znode *cur;
46606+ coord_t coord;
46607+ lock_handle lh;
46608+ lock_handle right_lock;
46609+
46610+ assert("edward-1232", !node_is_empty(pos->coord.node));
46611+ assert("edward-1014",
46612+ pos->coord.item_pos < coord_num_items(&pos->coord));
46613+ assert("edward-1015", chaining_data_present(pos));
46614+ assert("edward-1017",
46615+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
46616+
46617+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
46618+
46619+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
46620+ return ret;
46621+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
46622+ return ret;
46623+
 46624+ /* Check the next slum item.
 46625+ * Note that it cannot be killed by a concurrent truncate,
 46626+ * as the latter will want the lock held by us.
46627+ */
46628+ init_lh(&right_lock);
46629+ cur = pos->coord.node;
46630+
46631+ while (!stop) {
46632+ init_lh(&lh);
46633+ ret = reiser4_get_right_neighbor(&lh,
46634+ cur,
46635+ ZNODE_WRITE_LOCK,
46636+ GN_CAN_USE_UPPER_LEVELS);
46637+ if (ret)
46638+ break;
46639+ ret = zload(lh.node);
46640+ if (ret) {
46641+ done_lh(&lh);
46642+ break;
46643+ }
46644+ coord_init_before_first_item(&coord, lh.node);
46645+
46646+ if (node_is_empty(lh.node)) {
46647+ znode_make_dirty(lh.node);
46648+ znode_set_convertible(lh.node);
46649+ stop = 0;
 46650+ } else if (same_disk_cluster(&pos->coord, &coord)) {
46651+
46652+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
46653+
46654+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
46655+ /*
46656+ warning("edward-1024",
46657+ "next slum item mergeable, "
46658+ "but znode %p isn't dirty\n",
46659+ lh.node);
46660+ */
46661+ znode_make_dirty(lh.node);
46662+ }
46663+ if (!znode_convertible(lh.node)) {
46664+ /*
46665+ warning("edward-1272",
46666+ "next slum item mergeable, "
46667+ "but znode %p isn't convertible\n",
46668+ lh.node);
46669+ */
46670+ znode_set_convertible(lh.node);
46671+ }
46672+ stop = 1;
46673+ } else
46674+ stop = 1;
46675+ zrelse(lh.node);
46676+ done_lh(&right_lock);
46677+ copy_lh(&right_lock, &lh);
46678+ done_lh(&lh);
46679+ cur = right_lock.node;
46680+ }
46681+ done_lh(&right_lock);
46682+
46683+ if (ret == -E_NO_NEIGHBOR)
46684+ ret = 0;
46685+ return ret;
46686+}
46687+
46688+static int
46689+assign_convert_mode(struct convert_item_info * idata,
46690+ cryptcompress_write_mode_t * mode)
46691+{
46692+ int result = 0;
46693+
46694+ assert("edward-1025", idata != NULL);
46695+
46696+ if (idata->flow.length) {
46697+ /* append or overwrite */
46698+ switch (idata->d_cur) {
46699+ case DC_FIRST_ITEM:
46700+ case DC_CHAINED_ITEM:
46701+ *mode = CRC_OVERWRITE_ITEM;
46702+ break;
46703+ case DC_AFTER_CLUSTER:
46704+ *mode = CRC_APPEND_ITEM;
46705+ break;
46706+ default:
46707+ impossible("edward-1018", "wrong current item state");
46708+ }
46709+ } else {
46710+ /* cut or invalidate */
46711+ switch (idata->d_cur) {
46712+ case DC_FIRST_ITEM:
46713+ case DC_CHAINED_ITEM:
46714+ *mode = CRC_CUT_ITEM;
46715+ break;
46716+ case DC_AFTER_CLUSTER:
46717+ result = 1;
46718+ break;
46719+ default:
46720+ impossible("edward-1019", "wrong current item state");
46721+ }
46722+ }
46723+ return result;
46724+}
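/*
 * Descriptive summary of assign_convert_mode() above:
 *
 *   flow.length   d_cur                        resulting mode
 *   -----------   --------------------------   -------------------------
 *   != 0          DC_FIRST_ITEM / CHAINED      CRC_OVERWRITE_ITEM
 *   != 0          DC_AFTER_CLUSTER             CRC_APPEND_ITEM
 *   == 0          DC_FIRST_ITEM / CHAINED      CRC_CUT_ITEM
 *   == 0          DC_AFTER_CLUSTER             disk cluster is done
 *                                              (function returns 1)
 */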
46725+
46726+/* plugin->u.item.f.convert */
46727+/* write ctail in guessed mode */
46728+int convert_ctail(flush_pos_t * pos)
46729+{
46730+ int result;
46731+ int nr_items;
 46732+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
46733+
46734+ assert("edward-1020", pos != NULL);
46735+ assert("edward-1213", coord_num_items(&pos->coord) != 0);
46736+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
46737+ assert("edward-1258", ctail_ok(&pos->coord));
46738+ assert("edward-261", pos->coord.node != NULL);
46739+
46740+ nr_items = coord_num_items(&pos->coord);
46741+ if (!chaining_data_present(pos)) {
46742+ if (should_attach_convert_idata(pos)) {
46743+ /* attach convert item info */
46744+ struct inode *inode;
46745+
46746+ assert("edward-264", pos->child != NULL);
46747+ assert("edward-265", jnode_page(pos->child) != NULL);
46748+ assert("edward-266",
46749+ jnode_page(pos->child)->mapping != NULL);
46750+
46751+ inode = jnode_page(pos->child)->mapping->host;
46752+
46753+ assert("edward-267", inode != NULL);
46754+
46755+ /* attach item convert info by child and put the last one */
46756+ result = attach_convert_idata(pos, inode);
46757+ pos->child = NULL;
46758+ if (result == -E_REPEAT) {
 46759+ /* jnode became clean, or there are no dirty
 46760+ pages (nothing to update in disk cluster) */
46761+ warning("edward-1021",
46762+ "convert_ctail: nothing to attach");
46763+ return 0;
46764+ }
46765+ if (result != 0)
46766+ return result;
46767+ } else
46768+ /* unconvertible */
46769+ return 0;
46770+ } else {
46771+ /* use old convert info */
46772+
 46773+ struct convert_item_info *idata;
46774+
46775+ idata = item_convert_data(pos);
46776+
46777+ result = assign_convert_mode(idata, &mode);
46778+ if (result) {
46779+ /* disk cluster is over,
46780+ nothing to update anymore */
46781+ detach_convert_idata(pos->sq);
46782+ return 0;
46783+ }
46784+ }
46785+
46786+ assert("edward-433", chaining_data_present(pos));
46787+ assert("edward-1022",
46788+ pos->coord.item_pos < coord_num_items(&pos->coord));
46789+
 46790+ /* check whether the next item belongs to the current disk cluster */
46791+ result = next_item_dc_stat(pos);
46792+ if (result) {
46793+ detach_convert_idata(pos->sq);
46794+ return result;
46795+ }
46796+ result = do_convert_ctail(pos, mode);
46797+ if (result) {
46798+ detach_convert_idata(pos->sq);
46799+ return result;
46800+ }
46801+ switch (mode) {
46802+ case CRC_CUT_ITEM:
46803+ assert("edward-1214", item_convert_data(pos)->flow.length == 0);
46804+ assert("edward-1215",
46805+ coord_num_items(&pos->coord) == nr_items ||
46806+ coord_num_items(&pos->coord) == nr_items - 1);
46807+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
46808+ break;
46809+ if (coord_num_items(&pos->coord) != nr_items) {
46810+ /* the item was killed, no more chained items */
46811+ detach_convert_idata(pos->sq);
46812+ if (!node_is_empty(pos->coord.node))
46813+ /* make sure the next item will be scanned */
46814+ coord_init_before_item(&pos->coord);
46815+ break;
46816+ }
46817+ case CRC_APPEND_ITEM:
46818+ assert("edward-434", item_convert_data(pos)->flow.length == 0);
46819+ detach_convert_idata(pos->sq);
46820+ break;
46821+ case CRC_OVERWRITE_ITEM:
46822+ if (coord_is_unprepped_ctail(&pos->coord)) {
 46823+ /* convert unprepped ctail to prepped one */
46824+ int shift;
46825+ shift =
46826+ inode_cluster_shift(item_convert_data(pos)->inode);
46827+ assert("edward-1259", cluster_shift_ok(shift));
46828+ put_unaligned((d8)shift,
46829+ &ctail_formatted_at(&pos->coord)->
46830+ cluster_shift);
46831+ }
46832+ break;
46833+ }
46834+ return result;
46835+}
46836+
46837+/* Make Linus happy.
46838+ Local variables:
46839+ c-indentation-style: "K&R"
46840+ mode-name: "LC"
46841+ c-basic-offset: 8
46842+ tab-width: 8
46843+ fill-column: 120
46844+ End:
46845+*/
46846diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.22/fs/reiser4/plugin/item/ctail.h
46847--- linux-2.6.22.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 03:00:00.000000000 +0300
46848+++ linux-2.6.22/fs/reiser4/plugin/item/ctail.h 2007-07-29 00:25:34.948715113 +0400
46849@@ -0,0 +1,102 @@
46850+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46851+
 46852+/* Ctail items are fragments (or bodies) of a special type that provide
 46853+ optimal storage of encrypted and/or compressed files. */
 46854+
 46855+
46856+#if !defined( __FS_REISER4_CTAIL_H__ )
46857+#define __FS_REISER4_CTAIL_H__
 46858+
 46859+/* Disk format of ctail item */
 46860+typedef struct ctail_item_format {
46861+ /* packed shift;
46862+ if its value is different from UCTAIL_SHIFT (see below), then
46863+ size of disk cluster is calculated as (1 << cluster_shift) */
46864+ d8 cluster_shift;
46865+ /* ctail body */
46866+ d8 body[0];
46867+} __attribute__ ((packed)) ctail_item_format;
46868+
46869+/* "Unprepped" disk cluster is represented by a single ctail item
46870+ with the following "magic" attributes: */
46871+/* "magic" cluster_shift */
46872+#define UCTAIL_SHIFT 0xff
 46873+/* How many units an unprepped ctail item has */
46874+#define UCTAIL_NR_UNITS 1
46875+
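/*
 * Hedged example (not part of the patch): how the on-disk cluster_shift
 * above is interpreted. Any value other than UCTAIL_SHIFT encodes the disk
 * cluster size as a power of two; UCTAIL_SHIFT itself marks an unprepped
 * disk cluster, whose size is not encoded in the item.
 */
static inline long demo_disk_cluster_size(unsigned char cluster_shift)
{
	return cluster_shift == UCTAIL_SHIFT ?
		-1 /* unprepped: no size encoded */ :
		1L << cluster_shift;
}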
46876+/* The following is a set of various item states in a disk cluster.
46877+ Disk cluster is a set of items whose keys belong to the interval
46878+ [dc_key , dc_key + disk_cluster_size - 1] */
46879+typedef enum {
46880+ DC_INVALID_STATE = 0,
46881+ DC_FIRST_ITEM = 1,
46882+ DC_CHAINED_ITEM = 2,
46883+ DC_AFTER_CLUSTER = 3
46884+} dc_item_stat;
46885+
46886+/* ctail-specific extension.
46887+ In particular this describes parameters of disk cluster an item belongs to */
46888+struct ctail_coord_extension {
46889+ int shift; /* this contains cluster_shift extracted from
46890+ ctail_item_format (above), or UCTAIL_SHIFT
46891+ (the last one is the "magic" of unprepped disk clusters)*/
46892+ int dsize; /* size of a prepped disk cluster */
46893+ int ncount; /* count of nodes occupied by a disk cluster */
46894+};
46895+
46896+struct cut_list;
46897+
46898+/* plugin->item.b.* */
46899+int can_contain_key_ctail(const coord_t *, const reiser4_key *,
46900+ const reiser4_item_data *);
46901+int mergeable_ctail(const coord_t * p1, const coord_t * p2);
46902+pos_in_node_t nr_units_ctail(const coord_t * coord);
46903+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
46904+void print_ctail(const char *prefix, coord_t * coord);
46905+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
46906+
46907+int paste_ctail(coord_t * coord, reiser4_item_data * data,
46908+ carry_plugin_info * info UNUSED_ARG);
46909+int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
46910+int can_shift_ctail(unsigned free_space, coord_t * coord,
46911+ znode * target, shift_direction pend, unsigned *size,
46912+ unsigned want);
46913+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
46914+ unsigned count, shift_direction where_is_free_space,
46915+ unsigned free_space);
46916+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46917+ carry_cut_data *, reiser4_key * smallest_removed,
46918+ reiser4_key * new_first);
46919+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46920+ carry_kill_data *, reiser4_key * smallest_removed,
46921+ reiser4_key * new_first);
46922+int ctail_ok(const coord_t * coord);
46923+int check_ctail(const coord_t * coord, const char **error);
46924+
46925+/* plugin->u.item.s.* */
46926+int read_ctail(struct file *, flow_t *, hint_t *);
46927+int readpage_ctail(void *, struct page *);
 46928+int readpages_ctail(struct file *, struct address_space *, struct list_head *);
46929+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
46930+int create_hook_ctail(const coord_t * coord, void *arg);
46931+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
46932+ carry_kill_data *);
46933+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
46934+
46935+/* plugin->u.item.f */
46936+int utmost_child_ctail(const coord_t *, sideof, jnode **);
46937+int scan_ctail(flush_scan *);
46938+int convert_ctail(flush_pos_t *);
46939+size_t inode_scaled_cluster_size(struct inode *);
46940+
46941+#endif /* __FS_REISER4_CTAIL_H__ */
46942+
46943+/* Make Linus happy.
46944+ Local variables:
46945+ c-indentation-style: "K&R"
46946+ mode-name: "LC"
46947+ c-basic-offset: 8
46948+ tab-width: 8
46949+ fill-column: 120
46950+ End:
46951+*/
46952diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent.c linux-2.6.22/fs/reiser4/plugin/item/extent.c
46953--- linux-2.6.22.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 03:00:00.000000000 +0300
46954+++ linux-2.6.22/fs/reiser4/plugin/item/extent.c 2007-07-29 00:25:34.948715113 +0400
46955@@ -0,0 +1,197 @@
46956+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46957+
46958+#include "item.h"
46959+#include "../../key.h"
46960+#include "../../super.h"
46961+#include "../../carry.h"
46962+#include "../../inode.h"
46963+#include "../../page_cache.h"
46964+#include "../../flush.h"
46965+#include "../object.h"
46966+
46967+/* prepare structure reiser4_item_data. It is used to put one extent unit into tree */
46968+/* Audited by: green(2002.06.13) */
46969+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
46970+ int nr_extents)
46971+{
46972+ data->data = ext_unit;
46973+ /* data->data is kernel space */
46974+ data->user = 0;
46975+ data->length = sizeof(reiser4_extent) * nr_extents;
46976+ data->arg = NULL;
46977+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
46978+ return data;
46979+}
46980+
46981+/* how many bytes are addressed by @nr first extents of the extent item */
 46982+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
46983+{
46984+ pos_in_node_t i;
46985+ reiser4_block_nr blocks;
46986+ reiser4_extent *ext;
46987+
46988+ ext = item_body_by_coord(coord);
46989+ assert("vs-263", nr <= nr_units_extent(coord));
46990+
46991+ blocks = 0;
46992+ for (i = 0; i < nr; i++, ext++) {
46993+ blocks += extent_get_width(ext);
46994+ }
46995+
46996+ return blocks * current_blocksize;
46997+}
46998+
46999+extent_state state_of_extent(reiser4_extent * ext)
47000+{
47001+ switch ((int)extent_get_start(ext)) {
47002+ case 0:
47003+ return HOLE_EXTENT;
47004+ case 1:
47005+ return UNALLOCATED_EXTENT;
47006+ default:
47007+ break;
47008+ }
47009+ return ALLOCATED_EXTENT;
47010+}
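/*
 * Descriptive note: the extent state above is encoded in the start block
 * number itself -- start 0 denotes a hole, start 1 an unallocated extent,
 * and any other value an allocated extent beginning at that block
 * (matching the HOLE_EXTENT_START / UNALLOCATED_EXTENT_START constants
 * used by the append helpers below).
 */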
47011+
47012+int extent_is_unallocated(const coord_t * item)
47013+{
47014+ assert("jmacd-5133", item_is_extent(item));
47015+
47016+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47017+}
47018+
47019+/* set extent's start and width */
47020+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
47021+ reiser4_block_nr width)
47022+{
47023+ extent_set_start(ext, start);
47024+ extent_set_width(ext, width);
47025+}
47026+
 47027+/**
 47028+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it
47029+ * @un_extent: coordinate of extent to be overwritten
47030+ * @lh: need better comment
47031+ * @key: need better comment
47032+ * @exts_to_add: data prepared for insertion into tree
47033+ * @replace: need better comment
47034+ * @flags: need better comment
 47035+ * @return_inserted_position: need better comment
47036+ *
47037+ * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. If
47038+ * @return_inserted_position is 1 - @un_extent and @lh are returned set to
47039+ * first of newly inserted units, if it is 0 - @un_extent and @lh are returned
47040+ * set to extent which was overwritten.
47041+ */
47042+int reiser4_replace_extent(struct replace_handle *h,
47043+ int return_inserted_position)
47044+{
47045+ int result;
47046+ znode *orig_znode;
47047+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47048+
47049+ assert("vs-990", coord_is_existing_unit(h->coord));
47050+ assert("vs-1375", znode_is_write_locked(h->coord->node));
47051+ assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47052+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47053+ assert("vs-1427", ergo(h->nr_new_extents == 2,
47054+ extent_get_width(&h->new_extents[1]) != 0));
47055+
47056+ /* compose structure for paste */
47057+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47058+
47059+ coord_dup(&h->coord_after, h->coord);
47060+ init_lh(&h->lh_after);
47061+ copy_lh(&h->lh_after, h->lh);
47062+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47063+ reiser4_tap_monitor(&h->watch);
47064+
47065+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47066+ orig_znode = h->coord->node;
47067+
47068+#if REISER4_DEBUG
47069+ /* make sure that key is set properly */
47070+ unit_key_by_coord(h->coord, &h->tmp);
47071+ set_key_offset(&h->tmp,
47072+ get_key_offset(&h->tmp) +
47073+ extent_get_width(&h->overwrite) * current_blocksize);
47074+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47075+#endif
47076+
47077+ /* set insert point after unit to be replaced */
47078+ h->coord->between = AFTER_UNIT;
47079+
47080+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47081+ &h->paste_key, &h->item, h->flags);
47082+ if (!result) {
47083+ /* now we have to replace the unit after which new units were
47084+ inserted. Its position is tracked by @watch */
47085+ reiser4_extent *ext;
47086+ znode *node;
47087+
47088+ node = h->coord_after.node;
47089+ if (node != orig_znode) {
47090+ coord_clear_iplug(&h->coord_after);
47091+ result = zload(node);
47092+ }
47093+
47094+ if (likely(!result)) {
47095+ ext = extent_by_coord(&h->coord_after);
47096+
47097+ assert("vs-987", znode_is_loaded(node));
47098+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47099+
47100+ /* overwrite extent unit */
47101+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47102+ znode_make_dirty(node);
47103+
47104+ if (node != orig_znode)
47105+ zrelse(node);
47106+
47107+ if (return_inserted_position == 0) {
47108+ /* coord and lh are to be set to overwritten
47109+ extent */
47110+ assert("vs-1662",
47111+ WITH_DATA(node, !memcmp(&h->overwrite,
47112+ extent_by_coord(
47113+ &h->coord_after),
47114+ sizeof(reiser4_extent))));
47115+
47116+ *h->coord = h->coord_after;
47117+ done_lh(h->lh);
47118+ copy_lh(h->lh, &h->lh_after);
47119+ } else {
47120+ /* h->coord and h->lh are to be set to first of
47121+ inserted units */
47122+ assert("vs-1663",
47123+ WITH_DATA(h->coord->node,
47124+ !memcmp(&h->new_extents[0],
47125+ extent_by_coord(h->coord),
47126+ sizeof(reiser4_extent))));
47127+ assert("vs-1664", h->lh->node == h->coord->node);
47128+ }
47129+ }
47130+ }
 47131+ reiser4_tap_done(&h->watch);
47132+
47133+ return result;
47134+}
47135+
47136+lock_handle *znode_lh(znode *node)
47137+{
47138+ assert("vs-1371", znode_is_write_locked(node));
47139+ assert("vs-1372", znode_is_wlocked_once(node));
47140+ return list_entry(node->lock.owners.next, lock_handle, owners_link);
47141+}
47142+
47143+/*
47144+ * Local variables:
47145+ * c-indentation-style: "K&R"
47146+ * mode-name: "LC"
47147+ * c-basic-offset: 8
47148+ * tab-width: 8
47149+ * fill-column: 79
47150+ * scroll-step: 1
47151+ * End:
47152+ */
47153diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.22/fs/reiser4/plugin/item/extent_file_ops.c
47154--- linux-2.6.22.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 03:00:00.000000000 +0300
47155+++ linux-2.6.22/fs/reiser4/plugin/item/extent_file_ops.c 2007-07-29 00:25:34.952716148 +0400
47156@@ -0,0 +1,1453 @@
47157+/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47158+
47159+#include "item.h"
47160+#include "../../inode.h"
47161+#include "../../page_cache.h"
47162+#include "../object.h"
47163+
47164+#include <linux/quotaops.h>
47165+#include <linux/swap.h>
47166+
47167+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
47168+{
47169+ reiser4_extent *ext;
47170+
47171+ ext = (reiser4_extent *) (zdata(node) + offset);
47172+ return ext;
47173+}
47174+
47175+/**
47176+ * check_uf_coord - verify coord extension
47177+ * @uf_coord:
47178+ * @key:
47179+ *
47180+ * Makes sure that all fields of @uf_coord are set properly. If @key is
47181+ * specified - check whether @uf_coord is set correspondingly.
47182+ */
47183+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
47184+{
47185+#if REISER4_DEBUG
47186+ const coord_t *coord;
 47187+ const struct extent_coord_extension *ext_coord;
47188+ reiser4_extent *ext;
47189+
47190+ coord = &uf_coord->coord;
47191+ ext_coord = &uf_coord->extension.extent;
47192+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
47193+
47194+ assert("",
47195+ WITH_DATA(coord->node,
47196+ (uf_coord->valid == 1 &&
47197+ coord_is_iplug_set(coord) &&
47198+ item_is_extent(coord) &&
47199+ ext_coord->nr_units == nr_units_extent(coord) &&
47200+ ext == extent_by_coord(coord) &&
47201+ ext_coord->width == extent_get_width(ext) &&
47202+ coord->unit_pos < ext_coord->nr_units &&
47203+ ext_coord->pos_in_unit < ext_coord->width &&
47204+ memcmp(ext, &ext_coord->extent,
47205+ sizeof(reiser4_extent)) == 0)));
47206+ if (key) {
47207+ reiser4_key coord_key;
 47208+
47209+ unit_key_by_coord(&uf_coord->coord, &coord_key);
47210+ set_key_offset(&coord_key,
47211+ get_key_offset(&coord_key) +
47212+ (uf_coord->extension.extent.
47213+ pos_in_unit << PAGE_CACHE_SHIFT));
47214+ assert("", keyeq(key, &coord_key));
47215+ }
47216+#endif
47217+}
47218+
47219+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
47220+{
47221+ check_uf_coord(uf_coord, NULL);
47222+
47223+ return ext_by_offset(uf_coord->coord.node,
47224+ uf_coord->extension.extent.ext_offset);
47225+}
47226+
47227+#if REISER4_DEBUG
47228+
47229+/**
47230+ * offset_is_in_unit
47231+ *
47232+ *
47233+ *
47234+ */
47235+/* return 1 if offset @off is inside of extent unit pointed to by @coord. Set
47236+ pos_in_unit inside of unit correspondingly */
47237+static int offset_is_in_unit(const coord_t *coord, loff_t off)
47238+{
47239+ reiser4_key unit_key;
47240+ __u64 unit_off;
47241+ reiser4_extent *ext;
47242+
47243+ ext = extent_by_coord(coord);
47244+
47245+ unit_key_extent(coord, &unit_key);
47246+ unit_off = get_key_offset(&unit_key);
47247+ if (off < unit_off)
47248+ return 0;
47249+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
47250+ return 0;
47251+ return 1;
47252+}
47253+
47254+static int
47255+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
47256+{
47257+ reiser4_key item_key;
47258+
47259+ assert("vs-771", coord_is_existing_unit(coord));
47260+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
47261+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
47262+
47263+ return offset_is_in_unit(coord, get_key_offset(key));
47264+}
47265+
47266+#endif
47267+
47268+/**
 47269+ * can_append - check whether @key is the append key of the item at @coord
47270+ * @key:
47271+ * @coord:
47272+ *
 47273+ * Returns 1 if @key is equal to the append key of the item @coord is set to
47274+ */
47275+static int can_append(const reiser4_key *key, const coord_t *coord)
47276+{
47277+ reiser4_key append_key;
47278+
47279+ return keyeq(key, append_key_extent(coord, &append_key));
47280+}
47281+
47282+/**
47283+ * append_hole
47284+ * @coord:
47285+ * @lh:
47286+ * @key:
47287+ *
47288+ */
47289+static int append_hole(coord_t *coord, lock_handle *lh,
47290+ const reiser4_key *key)
47291+{
47292+ reiser4_key append_key;
47293+ reiser4_block_nr hole_width;
47294+ reiser4_extent *ext, new_ext;
47295+ reiser4_item_data idata;
47296+
47297+ /* last item of file may have to be appended with hole */
47298+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
47299+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
47300+
47301+ /* key of first byte which is not addressed by this extent */
47302+ append_key_extent(coord, &append_key);
47303+
47304+ assert("", keyle(&append_key, key));
 47305+
47306+ /*
47307+ * extent item has to be appended with hole. Calculate length of that
47308+ * hole
47309+ */
47310+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
47311+ current_blocksize - 1) >> current_blocksize_bits);
47312+ assert("vs-954", hole_width > 0);
47313+
47314+ /* set coord after last unit */
47315+ coord_init_after_item_end(coord);
47316+
47317+ /* get last extent in the item */
47318+ ext = extent_by_coord(coord);
47319+ if (state_of_extent(ext) == HOLE_EXTENT) {
47320+ /*
47321+ * last extent of a file is hole extent. Widen that extent by
47322+ * @hole_width blocks. Note that we do not worry about
47323+ * overflowing - extent width is 64 bits
47324+ */
47325+ reiser4_set_extent(ext, HOLE_EXTENT_START,
47326+ extent_get_width(ext) + hole_width);
47327+ znode_make_dirty(coord->node);
47328+ return 0;
47329+ }
47330+
47331+ /* append last item of the file with hole extent unit */
47332+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
47333+ state_of_extent(ext) == UNALLOCATED_EXTENT));
47334+
 47335+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47336+ init_new_extent(&idata, &new_ext, 1);
47337+ return insert_into_item(coord, lh, &append_key, &idata, 0);
47338+}
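/*
 * Illustrative sketch (user-space model, not part of the patch): the
 * hole-width round-up in append_hole() above. The byte gap between the
 * current append key and the write key is converted to blocks, rounding
 * up to a whole block.
 */
static unsigned long long demo_hole_width(unsigned long long append_off,
					  unsigned long long key_off,
					  unsigned blocksize_bits)
{
	unsigned long long blocksize = 1ULL << blocksize_bits;

	/* round the byte gap up to whole blocks */
	return (key_off - append_off + blocksize - 1) >> blocksize_bits;
}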
47339+
47340+/**
47341+ * check_jnodes
47342+ * @twig: longterm locked twig node
 47343+ * @key:
47344+ *
47345+ */
47346+static void check_jnodes(znode *twig, const reiser4_key *key, int count)
47347+{
47348+#if REISER4_DEBUG
47349+ coord_t c;
47350+ reiser4_key node_key, jnode_key;
47351+
47352+ jnode_key = *key;
47353+
47354+ assert("", twig != NULL);
47355+ assert("", znode_get_level(twig) == TWIG_LEVEL);
47356+ assert("", znode_is_write_locked(twig));
47357+
47358+ zload(twig);
47359+ /* get the smallest key in twig node */
47360+ coord_init_first_unit(&c, twig);
47361+ unit_key_by_coord(&c, &node_key);
47362+ assert("", keyle(&node_key, &jnode_key));
47363+
47364+ coord_init_last_unit(&c, twig);
47365+ unit_key_by_coord(&c, &node_key);
47366+ if (item_plugin_by_coord(&c)->s.file.append_key)
47367+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
47368+ set_key_offset(&jnode_key,
47369+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
47370+ assert("", keylt(&jnode_key, &node_key));
47371+ zrelse(twig);
47372+#endif
47373+}
47374+
47375+/**
47376+ * append_last_extent - append last file item
47377+ * @uf_coord: coord to start insertion from
47378+ * @jnodes: array of jnodes
47379+ * @count: number of jnodes in the array
47380+ *
47381+ * There is already at least one extent item of file @inode in the tree. Append
47382+ * the last of them with unallocated extent unit of width @count. Assign
47383+ * fake block numbers to jnodes corresponding to the inserted extent.
47384+ */
47385+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47386+ jnode **jnodes, int count)
47387+{
47388+ int result;
47389+ reiser4_extent new_ext;
47390+ reiser4_item_data idata;
47391+ coord_t *coord;
47392+	struct extent_coord_extension *ext_coord;
47393+ reiser4_extent *ext;
47394+ reiser4_block_nr block;
47395+ jnode *node;
47396+ int i;
47397+
47398+ coord = &uf_coord->coord;
47399+ ext_coord = &uf_coord->extension.extent;
47400+ ext = ext_by_ext_coord(uf_coord);
47401+
47402+ /* check correctness of position in the item */
47403+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
47404+ assert("vs-1311", coord->between == AFTER_UNIT);
47405+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
47406+
47407+ if (!can_append(key, coord)) {
47408+ /* hole extent has to be inserted */
47409+ result = append_hole(coord, uf_coord->lh, key);
47410+ uf_coord->valid = 0;
47411+ return result;
47412+ }
47413+
47414+ if (count == 0)
47415+ return 0;
47416+
47417+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
47418+
47419+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
47420+ count);
47421+ BUG_ON(result != 0);
47422+
47423+ switch (state_of_extent(ext)) {
47424+ case UNALLOCATED_EXTENT:
47425+ /*
47426+ * last extent unit of the file is unallocated one. Increase
47427+ * its width by @count
47428+ */
47429+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
47430+ extent_get_width(ext) + count);
47431+ znode_make_dirty(coord->node);
47432+
47433+ /* update coord extension */
47434+ ext_coord->width += count;
47435+ ON_DEBUG(extent_set_width
47436+ (&uf_coord->extension.extent.extent,
47437+ ext_coord->width));
47438+ break;
47439+
47440+ case HOLE_EXTENT:
47441+ case ALLOCATED_EXTENT:
47442+ /*
47443+ * last extent unit of the file is either hole or allocated
47444+ * one. Append one unallocated extent of width @count
47445+ */
47446+		reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47447+ init_new_extent(&idata, &new_ext, 1);
47448+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
47449+ uf_coord->valid = 0;
47450+ if (result)
47451+ return result;
47452+ break;
47453+
47454+ default:
47455+ return RETERR(-EIO);
47456+ }
47457+
47458+ /*
47459+ * make sure that we hold long term locked twig node containing all
47460+ * jnodes we are about to capture
47461+ */
47462+ check_jnodes(uf_coord->lh->node, key, count);
47463+
47464+ /*
47465+ * assign fake block numbers to all jnodes. FIXME: make sure whether
47466+ * twig node containing inserted extent item is locked
47467+ */
47468+ block = fake_blocknr_unformatted(count);
47469+ for (i = 0; i < count; i ++, block ++) {
47470+ node = jnodes[i];
47471+ spin_lock_jnode(node);
47472+ JF_SET(node, JNODE_CREATED);
47473+ jnode_set_block(node, &block);
47474+		result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47475+ BUG_ON(result != 0);
47476+ jnode_make_dirty_locked(node);
47477+		spin_unlock_jnode(node);
47478+ }
47479+ return count;
47480+}
47481+
47482+/**
47483+ * insert_first_hole - insert hole extent into tree
47484+ * @coord: coord to insert the hole extent at
47485+ * @lh: lock handle of the longterm locked node
47486+ * @key: key of the write position; the hole spans bytes [0, offset)
47487+ *
47488+ *
47489+ */
47490+static int insert_first_hole(coord_t *coord, lock_handle *lh,
47491+ const reiser4_key *key)
47492+{
47493+ reiser4_extent new_ext;
47494+ reiser4_item_data idata;
47495+ reiser4_key item_key;
47496+ reiser4_block_nr hole_width;
47497+
47498+ /* @coord must be set for inserting of new item */
47499+ assert("vs-711", coord_is_between_items(coord));
47500+
47501+ item_key = *key;
47502+ set_key_offset(&item_key, 0ull);
47503+
47504+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
47505+ current_blocksize_bits);
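+	/*
+	 * For illustration (assuming 4096-byte blocks): a write starting at
+	 * offset 10000 gets a leading hole of hole_width == (10000 + 4095)
+	 * >> 12 == 3 blocks.
+	 */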
47506+ assert("vs-710", hole_width > 0);
47507+
47508+ /* compose body of hole extent and insert item into tree */
47509+	reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47510+ init_new_extent(&idata, &new_ext, 1);
47511+ return insert_extent_by_coord(coord, &idata, &item_key, lh);
47512+}
47513+
47514+
47515+/**
47516+ * insert_first_extent - insert first file item
47517+ * @uf_coord: coord to start insertion from
47518+ * @key: key of the first byte to write
47519+ * @jnodes: array of jnodes
47520+ * @count: number of jnodes in the array
47521+ * @inode: inode of file
47522+ *
47523+ * There are no items of file @inode in the tree yet. Insert an unallocated
47524+ * extent of width @count into the tree, or a hole extent if the write does not
47525+ * start at the beginning of the file. Assign fake block numbers to jnodes
47526+ * corresponding to the inserted extent. Returns number of jnodes or error code.
47527+ */
47528+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47529+ jnode **jnodes, int count,
47530+ struct inode *inode)
47531+{
47532+ int result;
47533+ int i;
47534+ reiser4_extent new_ext;
47535+ reiser4_item_data idata;
47536+ reiser4_block_nr block;
47537+	struct unix_file_info *uf_info;
47538+ jnode *node;
47539+
47540+ /* first extent insertion starts at leaf level */
47541+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
47542+ assert("vs-711", coord_is_between_items(&uf_coord->coord));
47543+
47544+ if (get_key_offset(key) != 0) {
47545+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
47546+ uf_coord->valid = 0;
47547+ uf_info = unix_file_inode_data(inode);
47548+
47549+ /*
47550+ * first item insertion is only possible when writing to empty
47551+ * file or performing tail conversion
47552+ */
47553+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
47554+ (reiser4_inode_get_flag(inode,
47555+ REISER4_PART_MIXED) &&
47556+ reiser4_inode_get_flag(inode,
47557+ REISER4_PART_IN_CONV))));
47558+ /* if file was empty - update its state */
47559+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
47560+ uf_info->container = UF_CONTAINER_EXTENTS;
47561+ return result;
47562+ }
47563+
47564+ if (count == 0)
47565+ return 0;
47566+
47567+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
47568+ BUG_ON(result != 0);
47569+
47570+ /*
47571+ * prepare for tree modification: compose body of item and item data
47572+ * structure needed for insertion
47573+ */
47574+	reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47575+ init_new_extent(&idata, &new_ext, 1);
47576+
47577+ /* insert extent item into the tree */
47578+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
47579+ uf_coord->lh);
47580+ if (result)
47581+ return result;
47582+
47583+ /*
47584+ * make sure that we hold long term locked twig node containing all
47585+ * jnodes we are about to capture
47586+ */
47587+ check_jnodes(uf_coord->lh->node, key, count);
47588+ /*
47589+ * assign fake block numbers to all jnodes, capture and mark them dirty
47590+ */
47591+ block = fake_blocknr_unformatted(count);
47592+ for (i = 0; i < count; i ++, block ++) {
47593+ node = jnodes[i];
47594+ spin_lock_jnode(node);
47595+ JF_SET(node, JNODE_CREATED);
47596+ jnode_set_block(node, &block);
47597+		result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47598+ BUG_ON(result != 0);
47599+ jnode_make_dirty_locked(node);
47600+		spin_unlock_jnode(node);
47601+ }
47602+
47603+ /*
47604+ * invalidate coordinate, research must be performed to continue
47605+ * because write will continue on twig level
47606+ */
47607+ uf_coord->valid = 0;
47608+ return count;
47609+}
47610+
47611+/**
47612+ * plug_hole - replace hole extent with unallocated and holes
47613+ * @uf_coord: coord of the hole unit to plug
47614+ * @key: key of the block being converted
47615+ * @how: on return, records which of the replacement cases was taken
47617+ *
47618+ * Creates an unallocated extent of width 1 within a hole. In the worst case
47619+ * two additional extents can be created.
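+ *
+ * For illustration, plugging block 2 of a hole unit [hole, width=5]
+ * (pos_in_unit == 2) yields [hole, 2][unalloc, 1][hole, 2]. At
+ * pos_in_unit == 0 it yields [unalloc, 1][hole, 4], or the left neighbor
+ * is simply widened if it is already unallocated; pos_in_unit == 4 is
+ * the mirrored case.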
47620+ */
47621+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
47622+{
47623+ struct replace_handle rh;
47624+ reiser4_extent *ext;
47625+ reiser4_block_nr width, pos_in_unit;
47626+ coord_t *coord;
47627+	struct extent_coord_extension *ext_coord;
47628+ int return_inserted_position;
47629+
47630+ check_uf_coord(uf_coord, key);
47631+
47632+ rh.coord = coord_by_uf_coord(uf_coord);
47633+ rh.lh = uf_coord->lh;
47634+ rh.flags = 0;
47635+
47636+ coord = coord_by_uf_coord(uf_coord);
47637+ ext_coord = ext_coord_by_uf_coord(uf_coord);
47638+ ext = ext_by_ext_coord(uf_coord);
47639+
47640+ width = ext_coord->width;
47641+ pos_in_unit = ext_coord->pos_in_unit;
47642+
47643+ *how = 0;
47644+ if (width == 1) {
47645+		reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
47646+ znode_make_dirty(coord->node);
47647+ /* update uf_coord */
47648+ ON_DEBUG(ext_coord->extent = *ext);
47649+ *how = 1;
47650+ return 0;
47651+ } else if (pos_in_unit == 0) {
47652+ /* we deal with first element of extent */
47653+ if (coord->unit_pos) {
47654+ /* there is an extent to the left */
47655+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
47656+ /*
47657+ * left neighboring unit is an unallocated
47658+ * extent. Increase its width and decrease
47659+ * width of hole
47660+ */
47661+ extent_set_width(ext - 1,
47662+ extent_get_width(ext - 1) + 1);
47663+ extent_set_width(ext, width - 1);
47664+ znode_make_dirty(coord->node);
47665+
47666+ /* update coord extension */
47667+ coord->unit_pos--;
47668+ ext_coord->width = extent_get_width(ext - 1);
47669+ ext_coord->pos_in_unit = ext_coord->width - 1;
47670+ ext_coord->ext_offset -= sizeof(reiser4_extent);
47671+ ON_DEBUG(ext_coord->extent =
47672+ *extent_by_coord(coord));
47673+ *how = 2;
47674+ return 0;
47675+ }
47676+ }
47677+ /* extent for replace */
47678+		reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
47679+		/* extent to be inserted */
47680+		reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
47681+				   width - 1);
47682+ rh.nr_new_extents = 1;
47683+
47684+		/* have reiser4_replace_extent return with @coord and
47685+		   @uf_coord->lh set to the unit which was replaced */
47686+ return_inserted_position = 0;
47687+ *how = 3;
47688+ } else if (pos_in_unit == width - 1) {
47689+ /* we deal with last element of extent */
47690+ if (coord->unit_pos < nr_units_extent(coord) - 1) {
47691+ /* there is an extent unit to the right */
47692+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
47693+ /*
47694+ * right neighboring unit is an unallocated
47695+ * extent. Increase its width and decrease
47696+ * width of hole
47697+ */
47698+ extent_set_width(ext + 1,
47699+ extent_get_width(ext + 1) + 1);
47700+ extent_set_width(ext, width - 1);
47701+ znode_make_dirty(coord->node);
47702+
47703+ /* update coord extension */
47704+ coord->unit_pos++;
47705+ ext_coord->width = extent_get_width(ext + 1);
47706+ ext_coord->pos_in_unit = 0;
47707+ ext_coord->ext_offset += sizeof(reiser4_extent);
47708+ ON_DEBUG(ext_coord->extent =
47709+ *extent_by_coord(coord));
47710+ *how = 4;
47711+ return 0;
47712+ }
47713+ }
47714+ /* extent for replace */
47715+		reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
47716+		/* extent to be inserted */
47717+		reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47718+				   1);
47719+ rh.nr_new_extents = 1;
47720+
47721+		/* have reiser4_replace_extent return with @coord and
47722+		   @uf_coord->lh set to the unit which was inserted */
47723+ return_inserted_position = 1;
47724+ *how = 5;
47725+ } else {
47726+ /* extent for replace */
47727+		reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
47728+				   pos_in_unit);
47729+		/* extents to be inserted */
47730+		reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47731+				   1);
47732+		reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
47733+				   width - pos_in_unit - 1);
47734+ rh.nr_new_extents = 2;
47735+
47736+		/* have reiser4_replace_extent return with @coord and
47737+		   @uf_coord->lh set to the first of the units which were inserted */
47738+ return_inserted_position = 1;
47739+ *how = 6;
47740+ }
47741+ unit_key_by_coord(coord, &rh.paste_key);
47742+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
47743+ extent_get_width(&rh.overwrite) * current_blocksize);
47744+
47745+ uf_coord->valid = 0;
47746+	return reiser4_replace_extent(&rh, return_inserted_position);
47747+}
47748+
47749+/**
47750+ * overwrite_one_block - assign a block number to one jnode
47751+ * @uf_coord: coord of the extent unit @key falls into
47752+ * @key: key of the block being overwritten
47753+ * @node: jnode of the page being written
47754+ *
47755+ * If @node corresponds to hole extent - create unallocated extent for it and
47756+ * assign fake block number. If @node corresponds to allocated extent - assign
47757+ * block number of jnode
47758+ */
47759+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
47760+ jnode *node, int *hole_plugged)
47761+{
47762+ int result;
47763+	struct extent_coord_extension *ext_coord;
47764+ reiser4_extent *ext;
47765+ reiser4_block_nr block;
47766+ int how;
47767+
47768+ assert("vs-1312", uf_coord->coord.between == AT_UNIT);
47769+
47770+ result = 0;
47771+ ext_coord = ext_coord_by_uf_coord(uf_coord);
47772+ ext = ext_by_ext_coord(uf_coord);
47773+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
47774+
47775+ switch (state_of_extent(ext)) {
47776+ case ALLOCATED_EXTENT:
47777+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
47778+ break;
47779+
47780+ case HOLE_EXTENT:
47781+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
47782+ BUG_ON(result != 0);
47783+ result = plug_hole(uf_coord, key, &how);
47784+ if (result)
47785+ return result;
47786+ block = fake_blocknr_unformatted(1);
47787+ if (hole_plugged)
47788+ *hole_plugged = 1;
47789+ JF_SET(node, JNODE_CREATED);
47790+ break;
47791+
47792+ default:
47793+ return RETERR(-EIO);
47794+ }
47795+
47796+ jnode_set_block(node, &block);
47797+ return 0;
47798+}
47799+
47800+/**
47801+ * move_coord - move coordinate forward
47802+ * @uf_coord:
47803+ *
47804+ * Move coordinate one data block pointer forward. Return 1 if coord is set to
47805+ * the last one already or is invalid.
47806+ */
47807+static int move_coord(uf_coord_t *uf_coord)
47808+{
47809+	struct extent_coord_extension *ext_coord;
47810+
47811+ if (uf_coord->valid == 0)
47812+ return 1;
47813+ ext_coord = &uf_coord->extension.extent;
47814+ ext_coord->pos_in_unit ++;
47815+ if (ext_coord->pos_in_unit < ext_coord->width)
47816+ /* coordinate moved within the unit */
47817+ return 0;
47818+
47819+ /* end of unit is reached. Try to move to next unit */
47820+ ext_coord->pos_in_unit = 0;
47821+ uf_coord->coord.unit_pos ++;
47822+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
47823+ /* coordinate moved to next unit */
47824+ ext_coord->ext_offset += sizeof(reiser4_extent);
47825+ ext_coord->width =
47826+ extent_get_width(ext_by_offset
47827+ (uf_coord->coord.node,
47828+ ext_coord->ext_offset));
47829+ ON_DEBUG(ext_coord->extent =
47830+ *ext_by_offset(uf_coord->coord.node,
47831+ ext_coord->ext_offset));
47832+ return 0;
47833+ }
47834+ /* end of item is reached */
47835+ uf_coord->valid = 0;
47836+ return 1;
47837+}
47838+
47839+/**
47840+ * overwrite_extent - overwrite file blocks addressed by existing extent units
47841+ * @uf_coord: coord of the extent unit the write starts at
47842+ *
47843+ * Returns number of handled jnodes.
47844+ */
47845+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47846+ jnode **jnodes, int count, int *plugged_hole)
47847+{
47848+ int result;
47849+ reiser4_key k;
47850+ int i;
47851+ jnode *node;
47852+
47853+ k = *key;
47854+ for (i = 0; i < count; i ++) {
47855+ node = jnodes[i];
47856+ if (*jnode_get_block(node) == 0) {
47857+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
47858+ if (result)
47859+ return result;
47860+ }
47861+ /*
47862+ * make sure that we hold long term locked twig node containing
47863+ * all jnodes we are about to capture
47864+ */
47865+ check_jnodes(uf_coord->lh->node, &k, 1);
47866+ /*
47867+ * assign fake block numbers to all jnodes, capture and mark
47868+ * them dirty
47869+ */
47870+ spin_lock_jnode(node);
47871+		result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47872+ BUG_ON(result != 0);
47873+ jnode_make_dirty_locked(node);
47874+ spin_unlock_jnode(node);
47875+
47876+ if (uf_coord->valid == 0)
47877+ return i + 1;
47878+
47879+ check_uf_coord(uf_coord, &k);
47880+
47881+ if (move_coord(uf_coord)) {
47882+ /*
47883+ * failed to move to the next node pointer. Either end
47884+			 * of file or end of twig node is reached. In the latter case we might go
47885+ * case we might go to the right neighbor.
47886+ */
47887+ uf_coord->valid = 0;
47888+ return i + 1;
47889+ }
47890+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
47891+ }
47892+
47893+ return count;
47894+}
47895+
47896+/**
47897+ * reiser4_update_extent
47898+ * @inode: inode of the file
47899+ * @node: jnode of the page to update the extent tree for
47900+ * @pos: offset in the file
47901+ * @plugged_hole: if not NULL, set to 1 when a hole extent gets plugged
47902+ *
47903+ */
47904+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
47905+ int *plugged_hole)
47906+{
47907+ int result;
47908+ znode *loaded;
47909+ uf_coord_t uf_coord;
47910+ coord_t *coord;
47911+ lock_handle lh;
47912+ reiser4_key key;
47913+
47914+	assert("", reiser4_lock_counters()->d_refs == 0);
47915+
47916+ key_by_inode_and_offset_common(inode, pos, &key);
47917+
47918+ init_uf_coord(&uf_coord, &lh);
47919+ coord = &uf_coord.coord;
47920+ result = find_file_item_nohint(coord, &lh, &key,
47921+ ZNODE_WRITE_LOCK, inode);
47922+ if (IS_CBKERR(result)) {
47923+		assert("", reiser4_lock_counters()->d_refs == 0);
47924+ return result;
47925+ }
47926+
47927+ result = zload(coord->node);
47928+ BUG_ON(result != 0);
47929+ loaded = coord->node;
47930+
47931+ if (coord->between == AFTER_UNIT) {
47932+ /*
47933+ * append existing extent item with unallocated extent of width
47934+ * nr_jnodes
47935+ */
47936+ init_coord_extension_extent(&uf_coord,
47937+ get_key_offset(&key));
47938+ result = append_last_extent(&uf_coord, &key,
47939+ &node, 1);
47940+ } else if (coord->between == AT_UNIT) {
47941+ /*
47942+ * overwrite
47943+ * not optimal yet. Will be optimized if new write will show
47944+ * performance win.
47945+ */
47946+ init_coord_extension_extent(&uf_coord,
47947+ get_key_offset(&key));
47948+ result = overwrite_extent(&uf_coord, &key,
47949+ &node, 1, plugged_hole);
47950+ } else {
47951+ /*
47952+ * there are no items of this file in the tree yet. Create
47953+ * first item of the file inserting one unallocated extent of
47954+ * width nr_jnodes
47955+ */
47956+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
47957+ }
47958+ assert("", result == 1 || result < 0);
47959+ zrelse(loaded);
47960+ done_lh(&lh);
47961+	assert("", reiser4_lock_counters()->d_refs == 0);
47962+ return (result == 1) ? 0 : result;
47963+}
47964+
47965+/**
47966+ * update_extents
47967+ * @file: file being written
47968+ * @jnodes: array of jnodes covering the written pages
47969+ * @count: number of jnodes in the array
47970+ * @pos: file offset the write starts at
47971+ *
47972+ */
47973+static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
47974+{
47975+ struct inode *inode;
47976+ struct hint hint;
47977+ reiser4_key key;
47978+ int result;
47979+ znode *loaded;
47980+
47981+ result = load_file_hint(file, &hint);
47982+ BUG_ON(result != 0);
47983+
47984+ inode = file->f_dentry->d_inode;
47985+ if (count != 0)
47986+ /*
47987+ * count == 0 is special case: expanding truncate
47988+ */
47989+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
47990+ key_by_inode_and_offset_common(inode, pos, &key);
47991+
47992+	assert("", reiser4_lock_counters()->d_refs == 0);
47993+
47994+ do {
47995+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
47996+ if (IS_CBKERR(result)) {
47997+			assert("", reiser4_lock_counters()->d_refs == 0);
47998+ return result;
47999+ }
48000+
48001+ result = zload(hint.ext_coord.coord.node);
48002+ BUG_ON(result != 0);
48003+ loaded = hint.ext_coord.coord.node;
48004+
48005+ if (hint.ext_coord.coord.between == AFTER_UNIT) {
48006+ /*
48007+ * append existing extent item with unallocated extent
48008+ * of width nr_jnodes
48009+ */
48010+ if (hint.ext_coord.valid == 0)
48011+ /* NOTE: get statistics on this */
48012+ init_coord_extension_extent(&hint.ext_coord,
48013+ get_key_offset(&key));
48014+ result = append_last_extent(&hint.ext_coord, &key,
48015+ jnodes, count);
48016+ } else if (hint.ext_coord.coord.between == AT_UNIT) {
48017+ /*
48018+ * overwrite
48019+ * not optimal yet. Will be optimized if new write will
48020+ * show performance win.
48021+ */
48022+ if (hint.ext_coord.valid == 0)
48023+ /* NOTE: get statistics on this */
48024+ init_coord_extension_extent(&hint.ext_coord,
48025+ get_key_offset(&key));
48026+ result = overwrite_extent(&hint.ext_coord, &key,
48027+ jnodes, count, NULL);
48028+ } else {
48029+ /*
48030+ * there are no items of this file in the tree
48031+ * yet. Create first item of the file inserting one
48032+			 * unallocated extent of width nr_jnodes
48033+ */
48034+ result = insert_first_extent(&hint.ext_coord, &key,
48035+ jnodes, count, inode);
48036+ }
48037+ zrelse(loaded);
48038+ if (result < 0) {
48039+ done_lh(hint.ext_coord.lh);
48040+ break;
48041+ }
48042+
48043+ jnodes += result;
48044+ count -= result;
48045+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
48046+
48047+ /* seal and unlock znode */
48048+ if (hint.ext_coord.valid)
48049+			reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
48050+		else
48051+			reiser4_unset_hint(&hint);
48052+
48053+ } while (count > 0);
48054+
48055+ save_file_hint(file, &hint);
48056+	assert("", reiser4_lock_counters()->d_refs == 0);
48057+ return result;
48058+}
48059+
48060+/**
48061+ * write_extent_reserve_space - reserve space for extent write operation
48062+ * @inode: inode of the file being written
48063+ *
48064+ * Estimates and reserves space which may be required for writing
48065+ * WRITE_GRANULARITY pages of file.
48066+ */
48067+static int write_extent_reserve_space(struct inode *inode)
48068+{
48069+ __u64 count;
48070+ reiser4_tree *tree;
48071+
48072+ /*
48073+ * to write WRITE_GRANULARITY pages to a file by extents we have to
48074+	 * reserve disk space for:
48075+
48076+ * 1. find_file_item may have to insert empty node to the tree (empty
48077+ * leaf node between two extent items). This requires 1 block and
48078+ * number of blocks which are necessary to perform insertion of an
48079+ * internal item into twig level.
48080+
48081+ * 2. for each of written pages there might be needed 1 block and
48082+ * number of blocks which might be necessary to perform insertion of or
48083+ * paste to an extent item.
48084+
48085+ * 3. stat data update
48086+ */
48087+	tree = reiser4_tree_by_inode(inode);
48088+ count = estimate_one_insert_item(tree) +
48089+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
48090+ estimate_one_insert_item(tree);
48091+ grab_space_enable();
48092+ return reiser4_grab_space(count, 0 /* flags */);
48093+}
48094+
48095+/*
48096+ * filemap_copy_from_user no longer exists in generic code, because it
48097+ * is deadlocky (copying from user while holding the page lock is bad).
48098+ * As a temporary fix for reiser4, just define it here.
48099+ */
48100+static inline size_t
48101+filemap_copy_from_user(struct page *page, unsigned long offset,
48102+ const char __user *buf, unsigned bytes)
48103+{
48104+ char *kaddr;
48105+ int left;
48106+
48107+ kaddr = kmap_atomic(page, KM_USER0);
48108+ left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
48109+ kunmap_atomic(kaddr, KM_USER0);
48110+
48111+ if (left != 0) {
48112+ /* Do it the slow way */
48113+ kaddr = kmap(page);
48114+ left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
48115+ kunmap(page);
48116+ }
48117+ return bytes - left;
48118+}
48119+
48120+/**
48121+ * reiser4_write_extent - write method of extent item plugin
48122+ * @file: file to write to
48123+ * @buf: address of user-space buffer
48124+ * @count: number of bytes to write
48125+ * @pos: position in file to write to
48126+ *
48127+ */
48128+ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
48129+			     size_t count, loff_t *pos)
48130+{
48131+ int have_to_update_extent;
48132+	int nr_pages, nr_dirty;
48133+ struct page *page;
48134+ jnode *jnodes[WRITE_GRANULARITY + 1];
48135+ struct inode *inode;
48136+ unsigned long index;
48137+ unsigned long end;
48138+ int i;
48139+ int to_page, page_off;
48140+ size_t left, written;
48141+	int result = 0;
48142+
48143+ inode = file->f_dentry->d_inode;
48144+ if (write_extent_reserve_space(inode))
48145+ return RETERR(-ENOSPC);
48146+
48147+ if (count == 0) {
48148+ /* truncate case */
48149+ update_extents(file, jnodes, 0, *pos);
48150+ return 0;
48151+ }
48152+
48153+ BUG_ON(get_current_context()->trans->atom != NULL);
48154+
48155+	left = count;
48156+ index = *pos >> PAGE_CACHE_SHIFT;
48157+ /* calculate number of pages which are to be written */
48158+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
48159+ nr_pages = end - index + 1;
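+	/*
+	 * For illustration with 4096-byte pages: *pos == 5000 and
+	 * count == 10000 give index == 1 and end == 3, so nr_pages == 3
+	 * partially or fully written pages.
+	 */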
48160+	nr_dirty = 0;
48161+ assert("", nr_pages <= WRITE_GRANULARITY + 1);
48162+
48163+ /* get pages and jnodes */
48164+ for (i = 0; i < nr_pages; i ++) {
48165+ page = find_or_create_page(inode->i_mapping, index + i,
48166+ reiser4_ctx_gfp_mask_get());
48167+		if (page == NULL) {
48168+ nr_pages = i;
48169+ result = RETERR(-ENOMEM);
48170+ goto out;
48171+ }
48172+
48173+ jnodes[i] = jnode_of_page(page);
48174+ if (IS_ERR(jnodes[i])) {
48175+ unlock_page(page);
48176+ page_cache_release(page);
48177+ nr_pages = i;
48178+ result = RETERR(-ENOMEM);
48179+ goto out;
48180+ }
48181+ /* prevent jnode and page from disconnecting */
48182+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
48183+ unlock_page(page);
48184+ }
48185+
48186+ BUG_ON(get_current_context()->trans->atom != NULL);
48187+
48188+ have_to_update_extent = 0;
48189+
48190+ page_off = (*pos & (PAGE_CACHE_SIZE - 1));
48191+ for (i = 0; i < nr_pages; i ++) {
48192+ to_page = PAGE_CACHE_SIZE - page_off;
48193+ if (to_page > left)
48194+ to_page = left;
48195+ page = jnode_page(jnodes[i]);
48196+		if (page_offset(page) < inode->i_size &&
48197+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48198+ /*
48199+ * the above is not optimal for partial write to last
48200+ * page of file when file size is not at boundary of
48201+ * page
48202+ */
48203+ lock_page(page);
48204+ if (!PageUptodate(page)) {
48205+ result = readpage_unix_file(NULL, page);
48206+ BUG_ON(result != 0);
48207+ /* wait for read completion */
48208+ lock_page(page);
48209+ BUG_ON(!PageUptodate(page));
48210+ } else
48211+ result = 0;
48212+			unlock_page(page);
48213+ }
48214+
48215+ BUG_ON(get_current_context()->trans->atom != NULL);
48216+ fault_in_pages_readable(buf, to_page);
48217+ BUG_ON(get_current_context()->trans->atom != NULL);
48218+
48219+ lock_page(page);
48220+		if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
48221+			simple_prepare_write(file, page, page_off,
48222+					     page_off + to_page);
48223+
48224+ written = filemap_copy_from_user(page, page_off, buf, to_page);
48225+		if (unlikely(written != to_page)) {
48226+			unlock_page(page);
48227+ result = RETERR(-EFAULT);
48228+ break;
48229+ }
48230+
48231+		flush_dcache_page(page);
48232+		reiser4_set_page_dirty_internal(page);
48233+		unlock_page(page);
48234+		nr_dirty++;
48235+
48236+ mark_page_accessed(page);
48237+ SetPageUptodate(page);
48238+
48239+ if (jnodes[i]->blocknr == 0)
48240+ have_to_update_extent ++;
48241+
48242+ page_off = 0;
48243+ buf += to_page;
48244+ left -= to_page;
48245+ BUG_ON(get_current_context()->trans->atom != NULL);
48246+ }
48247+
48248+	if (have_to_update_extent) {
48249+		update_extents(file, jnodes, nr_dirty, *pos);
48250+	} else {
48251+ for (i = 0; i < nr_dirty; i ++) {
48252+ int ret;
48253+			spin_lock_jnode(jnodes[i]);
48254+ ret = reiser4_try_capture(jnodes[i],
48255+ ZNODE_WRITE_LOCK, 0);
48256+ BUG_ON(ret != 0);
48257+ jnode_make_dirty_locked(jnodes[i]);
48258+ spin_unlock_jnode(jnodes[i]);
48259+ }
48260+ }
48261+out:
48262+	for (i = 0; i < nr_pages; i ++) {
48263+		page_cache_release(jnode_page(jnodes[i]));
48264+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
48265+ jput(jnodes[i]);
48266+ }
48267+
48268+	/* the only errors handled so far are ENOMEM and
48269+	   EFAULT on copy_from_user */
48270+
48271+	return (count - left) ? (count - left) : result;
48272+}
48273+
48274+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
48275+			       struct page *page)
48276+{
48277+ jnode *j;
48278+ struct address_space *mapping;
48279+ unsigned long index;
48280+ oid_t oid;
48281+ reiser4_block_nr block;
48282+
48283+ mapping = page->mapping;
48284+ oid = get_inode_oid(mapping->host);
48285+ index = page->index;
48286+
48287+ switch (state_of_extent(ext)) {
48288+ case HOLE_EXTENT:
48289+ /*
48290+ * it is possible to have hole page with jnode, if page was
48291+ * eflushed previously.
48292+ */
48293+ j = jfind(mapping, index);
48294+ if (j == NULL) {
48295+			zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
48296+			SetPageUptodate(page);
48297+			unlock_page(page);
48298+ return 0;
48299+ }
48300+ spin_lock_jnode(j);
48301+ if (!jnode_page(j)) {
48302+ jnode_attach_page(j, page);
48303+ } else {
48304+ BUG_ON(jnode_page(j) != page);
48305+ assert("vs-1504", jnode_page(j) == page);
48306+ }
48307+ block = *jnode_get_io_block(j);
48308+ spin_unlock_jnode(j);
48309+ if (block == 0) {
48310+			zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
48311+			SetPageUptodate(page);
48312+			unlock_page(page);
48313+ jput(j);
48314+ return 0;
48315+ }
48316+ break;
48317+
48318+ case ALLOCATED_EXTENT:
48319+ j = jnode_of_page(page);
48320+ if (IS_ERR(j))
48321+ return PTR_ERR(j);
48322+ if (*jnode_get_block(j) == 0) {
48323+ reiser4_block_nr blocknr;
48324+
48325+ blocknr = extent_get_start(ext) + pos;
48326+ jnode_set_block(j, &blocknr);
48327+ } else
48328+ assert("vs-1403",
48329+ j->blocknr == extent_get_start(ext) + pos);
48330+ break;
48331+
48332+ case UNALLOCATED_EXTENT:
48333+ j = jfind(mapping, index);
48334+ assert("nikita-2688", j);
48335+ assert("vs-1426", jnode_page(j) == NULL);
48336+
48337+ spin_lock_jnode(j);
48338+ jnode_attach_page(j, page);
48339+ spin_unlock_jnode(j);
48340+ break;
48341+
48342+ default:
48343+ warning("vs-957", "wrong extent\n");
48344+ return RETERR(-EIO);
48345+ }
48346+
48347+ BUG_ON(j == 0);
48348+	reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
48349+ jput(j);
48350+ return 0;
48351+}
48352+
48353+/* Implements plugin->u.item.s.file.read operation for extent items. */
48354+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
48355+{
48356+ int result;
48357+ struct page *page;
48358+ unsigned long cur_page, next_page;
48359+ unsigned long page_off, count;
48360+ struct address_space *mapping;
48361+ loff_t file_off;
48362+ uf_coord_t *uf_coord;
48363+ coord_t *coord;
48364+	struct extent_coord_extension *ext_coord;
48365+	unsigned long nr_pages;
48366+ char *kaddr;
48367+
48368+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
48369+ assert("vs-572", flow->user == 1);
48370+ assert("vs-1351", flow->length > 0);
48371+
48372+ uf_coord = &hint->ext_coord;
48373+
48374+ check_uf_coord(uf_coord, NULL);
48375+ assert("vs-33", uf_coord->lh == &hint->lh);
48376+
48377+ coord = &uf_coord->coord;
48378+ assert("vs-1119", znode_is_rlocked(coord->node));
48379+ assert("vs-1120", znode_is_loaded(coord->node));
48380+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
48381+
48382+ mapping = file->f_dentry->d_inode->i_mapping;
48383+ ext_coord = &uf_coord->extension.extent;
48384+
48385+ /* offset in a file to start read from */
48386+ file_off = get_key_offset(&flow->key);
48387+ /* offset within the page to start read from */
48388+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
48389+ /* bytes which can be read from the page which contains file_off */
48390+ count = PAGE_CACHE_SIZE - page_off;
48391+
48392+ /* index of page containing offset read is to start from */
48393+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
48394+ next_page = cur_page;
48395+ /* number of pages flow spans over */
48396+ nr_pages =
48397+ ((file_off + flow->length + PAGE_CACHE_SIZE -
48398+ 1) >> PAGE_CACHE_SHIFT) - cur_page;
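+	/*
+	 * For illustration with 4096-byte pages: file_off == 5000 and
+	 * flow->length == 10000 span pages 1..3, so nr_pages == 3.
+	 */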
48399+
48400+	/* we start having twig node read locked. However, we do not want to
48401+	   keep that lock all the time readahead works. So, set a seal and
48402+	   release twig node. */
48403+	reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
48404+ /* &hint->lh is done-ed */
48405+
48406+	do {
48407+ reiser4_txn_restart_current();
48408+ page = read_mapping_page(mapping, cur_page, file);
48409+ if (IS_ERR(page))
48410+ return PTR_ERR(page);
48411+ lock_page(page);
48412+ if (!PageUptodate(page)) {
48413+			unlock_page(page);
48414+ page_cache_release(page);
48415+ warning("jmacd-97178", "extent_read: page is not up to date");
48416+ return RETERR(-EIO);
48417+		}
48418+ mark_page_accessed(page);
48419+ unlock_page(page);
48420+
48421+ /* If users can be writing to this page using arbitrary virtual
48422+ addresses, take care about potential aliasing before reading
48423+ the page on the kernel side.
48424+ */
48425+ if (mapping_writably_mapped(mapping))
48426+ flush_dcache_page(page);
48427+
71430cf6 48428+ assert("nikita-3034", reiser4_schedulable());
44254afd
MT
48429+
48430+ /* number of bytes which are to be read from the page */
48431+ if (count > flow->length)
48432+ count = flow->length;
48433+
48434+ result = fault_in_pages_writeable(flow->data, count);
48435+ if (result) {
48436+ page_cache_release(page);
48437+ return RETERR(-EFAULT);
48438+ }
48439+
48440+ kaddr = kmap_atomic(page, KM_USER0);
48441+ result = __copy_to_user_inatomic(flow->data,
48442+ kaddr + page_off, count);
48443+ kunmap_atomic(kaddr, KM_USER0);
48444+ if (result != 0) {
48445+ kaddr = kmap(page);
48446+ result = __copy_to_user(flow->data, kaddr + page_off, count);
48447+ kunmap(page);
48448+ if (unlikely(result))
48449+ return RETERR(-EFAULT);
48450+ }
48451+
48452+ page_cache_release(page);
48453+
48454+ /* increase key (flow->key), update user area pointer (flow->data) */
48455+ move_flow_forward(flow, count);
48456+
48457+ page_off = 0;
48458+ cur_page ++;
48459+ count = PAGE_CACHE_SIZE;
48460+ nr_pages--;
48461+ } while (flow->length);
48462+
48463+ return 0;
48464+}
48465+
48466+/*
48467+ plugin->s.file.readpage
48468+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
48469+ or
48470+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
48471+
48472+ At the beginning: coord->node is read locked, zloaded, page is
48473+ locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index)
48474+*/
48475+int reiser4_readpage_extent(void *vp, struct page *page)
48476+{
48477+ uf_coord_t *uf_coord = vp;
48478+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
48479+ ON_DEBUG(reiser4_key key);
48480+
48481+ assert("vs-1040", PageLocked(page));
48482+ assert("vs-1050", !PageUptodate(page));
48483+ assert("vs-1039", page->mapping && page->mapping->host);
48484+
48485+ assert("vs-1044", znode_is_loaded(coord->node));
48486+ assert("vs-758", item_is_extent(coord));
48487+ assert("vs-1046", coord_is_existing_unit(coord));
48488+ assert("vs-1045", znode_is_rlocked(coord->node));
48489+ assert("vs-1047",
48490+ page->mapping->host->i_ino ==
48491+ get_key_objectid(item_key_by_coord(coord, &key)));
48492+ check_uf_coord(uf_coord, NULL);
48493+
48494+	return reiser4_do_readpage_extent(
48495+		ext_by_ext_coord(uf_coord),
48496+		uf_coord->extension.extent.pos_in_unit, page);
48497+}
48498+
48499+/**
48500+ * get_block_address_extent
48501+ * @coord: coord of extent item
48502+ * @block: file-relative block number
48503+ * @result: set to the matching disk block number, 0 for non-allocated extents
48504+ *
48505+ *
48506+ */
48507+int get_block_address_extent(const coord_t *coord, sector_t block,
48508+ sector_t *result)
48509+{
48510+ reiser4_extent *ext;
48511+
48512+ if (!coord_is_existing_unit(coord))
48513+ return RETERR(-EINVAL);
48514+
48515+ ext = extent_by_coord(coord);
48516+
48517+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
48518+ /* FIXME: bad things may happen if it is unallocated extent */
48519+ *result = 0;
48520+ else {
48521+ reiser4_key key;
48522+
48523+ unit_key_by_coord(coord, &key);
48524+ assert("vs-1645",
48525+ block >= get_key_offset(&key) >> current_blocksize_bits);
48526+ assert("vs-1646",
48527+ block <
48528+ (get_key_offset(&key) >> current_blocksize_bits) +
48529+ extent_get_width(ext));
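+		/*
+		 * For illustration: an allocated extent starting at disk
+		 * block 1000 whose unit key corresponds to file block 50
+		 * maps @block == 53 to disk block 1003.
+		 */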
48530+ *result =
48531+ extent_get_start(ext) + (block -
48532+ (get_key_offset(&key) >>
48533+ current_blocksize_bits));
48534+ }
48535+ return 0;
48536+}
48537+
48538+/*
48539+ plugin->u.item.s.file.append_key
48540+ key of first byte which is next after the last byte addressed by this extent
48541+*/
48542+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
48543+{
48544+ item_key_by_coord(coord, key);
48545+ set_key_offset(key,
48546+		       get_key_offset(key) + reiser4_extent_size(coord,
48547+								  nr_units_extent
48548+								  (coord)));
48549+
48550+ assert("vs-610", get_key_offset(key)
48551+ && (get_key_offset(key) & (current_blocksize - 1)) == 0);
48552+ return key;
48553+}
48554+
48555+/* plugin->u.item.s.file.init_coord_extension */
48556+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
48557+{
48558+ coord_t *coord;
48559+	struct extent_coord_extension *ext_coord;
48560+ reiser4_key key;
48561+ loff_t offset;
48562+
48563+ assert("vs-1295", uf_coord->valid == 0);
48564+
48565+ coord = &uf_coord->coord;
48566+ assert("vs-1288", coord_is_iplug_set(coord));
48567+ assert("vs-1327", znode_is_loaded(coord->node));
48568+
48569+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
48570+ return;
48571+
48572+ ext_coord = &uf_coord->extension.extent;
48573+ ext_coord->nr_units = nr_units_extent(coord);
48574+ ext_coord->ext_offset =
48575+ (char *)extent_by_coord(coord) - zdata(coord->node);
48576+ ext_coord->width = extent_get_width(extent_by_coord(coord));
48577+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
48578+ uf_coord->valid = 1;
48579+
48580+ /* pos_in_unit is the only uninitialized field in extended coord */
48581+ if (coord->between == AFTER_UNIT) {
48582+ assert("vs-1330",
48583+ coord->unit_pos == nr_units_extent(coord) - 1);
48584+
48585+ ext_coord->pos_in_unit = ext_coord->width - 1;
48586+ } else {
48587+ /* AT_UNIT */
48588+ unit_key_by_coord(coord, &key);
48589+ offset = get_key_offset(&key);
48590+
48591+ assert("vs-1328", offset <= lookuped);
48592+ assert("vs-1329",
48593+ lookuped <
48594+ offset + ext_coord->width * current_blocksize);
48595+ ext_coord->pos_in_unit =
48596+ ((lookuped - offset) >> current_blocksize_bits);
48597+ }
48598+}
48599+
48600+/*
48601+ * Local variables:
48602+ * c-indentation-style: "K&R"
48603+ * mode-name: "LC"
48604+ * c-basic-offset: 8
48605+ * tab-width: 8
48606+ * fill-column: 79
48607+ * scroll-step: 1
48608+ * End:
48609+ */
48610diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.22/fs/reiser4/plugin/item/extent_flush_ops.c
48611--- linux-2.6.22.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
48612+++ linux-2.6.22/fs/reiser4/plugin/item/extent_flush_ops.c 2007-07-29 00:25:34.964719254 +0400
48613@@ -0,0 +1,1028 @@
48614+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48615+
48616+#include "item.h"
48617+#include "../../tree.h"
48618+#include "../../jnode.h"
48619+#include "../../super.h"
48620+#include "../../flush.h"
48621+#include "../../carry.h"
48622+#include "../object.h"
48623+
48624+#include <linux/pagemap.h>
48625+
48626+static reiser4_block_nr extent_unit_start(const coord_t * item);
48627+
48628+/* Return either first or last extent (depending on @side) of the item
48629+ @coord is set to. Set @pos_in_unit either to first or to last block
48630+ of extent. */
48631+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
48632+ reiser4_block_nr * pos_in_unit)
48633+{
48634+ reiser4_extent *ext;
48635+
48636+ if (side == LEFT_SIDE) {
48637+ /* get first extent of item */
48638+ ext = extent_item(coord);
48639+ *pos_in_unit = 0;
48640+ } else {
48641+ /* get last extent of item and last position within it */
48642+ assert("vs-363", side == RIGHT_SIDE);
48643+ ext = extent_item(coord) + coord_last_unit_pos(coord);
48644+ *pos_in_unit = extent_get_width(ext) - 1;
48645+ }
48646+
48647+ return ext;
48648+}
48649+
48650+/* item_plugin->f.utmost_child */
48651+/* Return the child. Coord is set to extent item. Find jnode corresponding
48652+ either to first or to last unformatted node pointed by the item */
48653+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
48654+{
48655+ reiser4_extent *ext;
48656+ reiser4_block_nr pos_in_unit;
48657+
48658+ ext = extent_utmost_ext(coord, side, &pos_in_unit);
48659+
48660+ switch (state_of_extent(ext)) {
48661+ case HOLE_EXTENT:
48662+ *childp = NULL;
48663+ return 0;
48664+ case ALLOCATED_EXTENT:
48665+ case UNALLOCATED_EXTENT:
48666+ break;
48667+ default:
48668+ /* this should never happen */
48669+ assert("vs-1417", 0);
48670+ }
48671+
48672+ {
48673+ reiser4_key key;
48674+ reiser4_tree *tree;
48675+ unsigned long index;
48676+
48677+ if (side == LEFT_SIDE) {
48678+ /* get key of first byte addressed by the extent */
48679+ item_key_by_coord(coord, &key);
48680+ } else {
48681+ /* get key of byte which next after last byte addressed by the extent */
48682+ append_key_extent(coord, &key);
48683+ }
48684+
48685+ assert("vs-544",
48686+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
48687+ /* index of first or last (depending on @side) page addressed
48688+ by the extent */
48689+ index =
48690+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
48691+ if (side == RIGHT_SIDE)
48692+ index--;
48693+
48694+ tree = coord->node->zjnode.tree;
48695+ *childp = jlookup(tree, get_key_objectid(&key), index);
48696+ }
48697+
48698+ return 0;
48699+}
48700+
48701+/* item_plugin->f.utmost_child_real_block */
48702+/* Return the child's block, if allocated. */
48703+int
48704+utmost_child_real_block_extent(const coord_t * coord, sideof side,
48705+ reiser4_block_nr * block)
48706+{
48707+ reiser4_extent *ext;
48708+
48709+ ext = extent_by_coord(coord);
48710+
48711+ switch (state_of_extent(ext)) {
48712+ case ALLOCATED_EXTENT:
48713+ *block = extent_get_start(ext);
48714+ if (side == RIGHT_SIDE)
48715+ *block += extent_get_width(ext) - 1;
48716+ break;
48717+ case HOLE_EXTENT:
48718+ case UNALLOCATED_EXTENT:
48719+ *block = 0;
48720+ break;
48721+ default:
48722+ /* this should never happen */
48723+ assert("vs-1418", 0);
48724+ }
48725+
48726+ return 0;
48727+}
48728+
48729+/* item_plugin->f.scan */
48730+/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
48731+ This scan continues, advancing the parent coordinate, until either it encounters a
48732+ formatted child or it finishes scanning this node.
48733+
48734+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
48735+   not sure this last property (same atom) is enforced, but it should be the case since
48736+ one atom must write the parent and the others must read the parent, thus fusing?). In
48737+ any case, the code below asserts this case for unallocated extents. Unallocated
48738+ extents are thus optimized because we can skip to the endpoint when scanning.
48739+
48740+   Control then returns to reiser4_scan_extent's caller, which handles these
48741+   terminating conditions, e.g., by loading the next twig.
48742+*/
48743+int reiser4_scan_extent(flush_scan * scan)
48744+{
48745+ coord_t coord;
48746+ jnode *neighbor;
48747+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
48748+ reiser4_block_nr unit_start;
48749+ __u64 oid;
48750+ reiser4_key key;
48751+ int ret = 0, allocated, incr;
48752+ reiser4_tree *tree;
48753+
48754+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
48755+ scan->stop = 1;
48756+ return 0; /* Race with truncate, this node is already
48757+ * truncated. */
48758+ }
48759+
48760+ coord_dup(&coord, &scan->parent_coord);
48761+
48762+	assert("jmacd-1404", !reiser4_scan_finished(scan));
48763+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
48764+ assert("jmacd-1406", jnode_is_unformatted(scan->node));
48765+
48766+ /* The scan_index variable corresponds to the current page index of the
48767+ unformatted block scan position. */
48768+ scan_index = index_jnode(scan->node);
48769+
48770+ assert("jmacd-7889", item_is_extent(&coord));
48771+
48772+ repeat:
48773+ /* objectid of file */
48774+ oid = get_key_objectid(item_key_by_coord(&coord, &key));
48775+
48776+ allocated = !extent_is_unallocated(&coord);
48777+ /* Get the values of this extent unit: */
48778+ unit_index = extent_unit_index(&coord);
48779+ unit_width = extent_unit_width(&coord);
48780+ unit_start = extent_unit_start(&coord);
48781+
48782+ assert("jmacd-7187", unit_width > 0);
48783+ assert("jmacd-7188", scan_index >= unit_index);
48784+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
48785+
48786+ /* Depending on the scan direction, we set different maximum values for scan_index
48787+ (scan_max) and the number of nodes that would be passed if the scan goes the
48788+ entire way (scan_dist). Incr is an integer reflecting the incremental
48789+ direction of scan_index. */
48790+	if (reiser4_scanning_left(scan)) {
48791+ scan_max = unit_index;
48792+ scan_dist = scan_index - unit_index;
48793+ incr = -1;
48794+ } else {
48795+ scan_max = unit_index + unit_width - 1;
48796+		scan_dist = scan_max - scan_index;
48797+ incr = +1;
48798+ }
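+	/*
+	 * For illustration, scanning right with unit_index == 10,
+	 * unit_width == 8 and scan_index == 12: scan_max == 17 and
+	 * scan_dist == 5, i.e. at most five more blocks of this unit
+	 * can be passed.
+	 */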
48799+
48800+ tree = coord.node->zjnode.tree;
48801+
48802+ /* If the extent is allocated we have to check each of its blocks. If the extent
48803+ is unallocated we can skip to the scan_max. */
48804+ if (allocated) {
48805+ do {
48806+ neighbor = jlookup(tree, oid, scan_index);
48807+ if (neighbor == NULL)
48808+ goto stop_same_parent;
48809+
48810+ if (scan->node != neighbor
48811+			    && !reiser4_scan_goto(scan, neighbor)) {
48812+				/* @neighbor was jput() by reiser4_scan_goto */
48813+ goto stop_same_parent;
48814+ }
48815+
48816+ ret = scan_set_current(scan, neighbor, 1, &coord);
48817+ if (ret != 0) {
48818+ goto exit;
48819+ }
48820+
48821+ /* reference to @neighbor is stored in @scan, no need
48822+ to jput(). */
48823+ scan_index += incr;
48824+
48825+ } while (incr + scan_max != scan_index);
48826+
48827+ } else {
48828+ /* Optimized case for unallocated extents, skip to the end. */
48829+ neighbor = jlookup(tree, oid, scan_max /*index */ );
48830+ if (neighbor == NULL) {
48831+ /* Race with truncate */
48832+ scan->stop = 1;
48833+ ret = 0;
48834+ goto exit;
48835+ }
48836+
48837+		assert("zam-1043",
48838+		       reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
48839+
48840+ ret = scan_set_current(scan, neighbor, scan_dist, &coord);
48841+ if (ret != 0) {
48842+ goto exit;
48843+ }
48844+ }
48845+
48846+ if (coord_sideof_unit(&coord, scan->direction) == 0
48847+ && item_is_extent(&coord)) {
48848+ /* Continue as long as there are more extent units. */
48849+
48850+ scan_index =
48851+ extent_unit_index(&coord) +
48852+		    (reiser4_scanning_left(scan) ?
48853+		     extent_unit_width(&coord) - 1 : 0);
48854+ goto repeat;
48855+ }
48856+
48857+ if (0) {
48858+ stop_same_parent:
48859+
48860+ /* If we are scanning left and we stop in the middle of an allocated
48861+ extent, we know the preceder immediately.. */
48862+ /* middle of extent is (scan_index - unit_index) != 0. */
48863+		if (reiser4_scanning_left(scan) &&
48864+		    (scan_index - unit_index) != 0) {
48865+ /* FIXME(B): Someone should step-through and verify that this preceder
48866+ calculation is indeed correct. */
48867+ /* @unit_start is starting block (number) of extent
48868+ unit. Flush stopped at the @scan_index block from
48869+ the beginning of the file, which is (scan_index -
48870+ unit_index) block within extent.
48871+ */
48872+ if (unit_start) {
48873+ /* skip preceder update when we are at hole */
48874+ scan->preceder_blk =
48875+ unit_start + scan_index - unit_index;
48876+ check_preceder(scan->preceder_blk);
48877+ }
48878+ }
48879+
48880+ /* In this case, we leave coord set to the parent of scan->node. */
48881+ scan->stop = 1;
48882+
48883+ } else {
48884+ /* In this case, we are still scanning, coord is set to the next item which is
48885+ either off-the-end of the node or not an extent. */
48886+ assert("jmacd-8912", scan->stop == 0);
48887+ assert("jmacd-7812",
48888+ (coord_is_after_sideof_unit(&coord, scan->direction)
48889+ || !item_is_extent(&coord)));
48890+ }
48891+
48892+ ret = 0;
48893+ exit:
48894+ return ret;
48895+}
48896+
48897+/* ask block allocator for some blocks */
48898+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
48899+ reiser4_block_nr wanted_count,
48900+ reiser4_block_nr *first_allocated,
48901+ reiser4_block_nr *allocated,
48902+ block_stage_t block_stage)
48903+{
48904+ *allocated = wanted_count;
48905+ preceder->max_dist = 0; /* scan whole disk, if needed */
48906+
48907+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
48908+ preceder->block_stage = block_stage;
48909+
48910+ /* FIXME: we do not handle errors here now */
48911+ check_me("vs-420",
48912+ reiser4_alloc_blocks(preceder, first_allocated, allocated,
48913+ BA_PERMANENT) == 0);
48914+ /* update flush_pos's preceder to last allocated block number */
48915+ preceder->blk = *first_allocated + *allocated - 1;
48916+}
48917+
48918+/* when, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one
48919+   unallocated extent will have to be replaced with a set of allocated extents. In this case insert_into_item will be
48920+   called, which may have to add new nodes into the tree. Space for that is taken from the inviolable reserve (5%). */
48921+static reiser4_block_nr reserve_replace(void)
48922+{
48923+ reiser4_block_nr grabbed, needed;
48924+
48925+ grabbed = get_current_context()->grabbed_blocks;
48926+ needed = estimate_one_insert_into_item(current_tree);
48927+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
48928+ return grabbed;
48929+}
48930+
48931+static void free_replace_reserved(reiser4_block_nr grabbed)
48932+{
48933+ reiser4_context *ctx;
48934+
48935+ ctx = get_current_context();
48936+ grabbed2free(ctx, get_super_private(ctx->super),
48937+ ctx->grabbed_blocks - grabbed);
48938+}
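+/* The two helpers above are used as a pair around reiser4_replace_extent()
+   (see split_allocated_extent() and conv_extent() below):
+
+	grabbed = reserve_replace();
+	result = reiser4_replace_extent(h, 0);
+	free_replace_reserved(grabbed);
+*/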
48939+
48940+/* Block offset of first block addressed by unit */
48941+__u64 extent_unit_index(const coord_t * item)
48942+{
48943+ reiser4_key key;
48944+
48945+ assert("vs-648", coord_is_existing_unit(item));
48946+ unit_key_by_coord(item, &key);
48947+ return get_key_offset(&key) >> current_blocksize_bits;
48948+}
48949+
48950+/* AUDIT shouldn't return value be of reiser4_block_nr type?
48951+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
48952+__u64 extent_unit_width(const coord_t * item)
48953+{
48954+ assert("vs-649", coord_is_existing_unit(item));
48955+ return width_by_coord(item);
48956+}
48957+
48958+/* Starting block location of this unit */
48959+static reiser4_block_nr extent_unit_start(const coord_t * item)
48960+{
48961+ return extent_get_start(extent_by_coord(item));
48962+}
48963+
48964+/**
48965+ * split_allocated_extent - split an allocated extent at a given position
48966+ * @coord: coord of the allocated extent to split
48967+ * @pos_in_unit: position within the unit to split at
48968+ *
48969+ * replace allocated extent with two allocated extents
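+ *
+ * For illustration: [alloc, start=200, width=8] split at pos_in_unit == 3
+ * becomes [alloc, start=200, width=3][alloc, start=203, width=5].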
48970+ */
48971+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
48972+{
48973+ int result;
48974+ struct replace_handle *h;
48975+ reiser4_extent *ext;
48976+ reiser4_block_nr grabbed;
48977+
48978+ ext = extent_by_coord(coord);
48979+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
48980+ assert("vs-1411", extent_get_width(ext) > pos_in_unit);
48981+
48982+	h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
48983+ if (h == NULL)
48984+ return RETERR(-ENOMEM);
48985+ h->coord = coord;
48986+ h->lh = znode_lh(coord->node);
48987+ h->pkey = &h->key;
48988+ unit_key_by_coord(coord, h->pkey);
48989+ set_key_offset(h->pkey,
48990+ (get_key_offset(h->pkey) +
48991+ pos_in_unit * current_blocksize));
48992+	reiser4_set_extent(&h->overwrite, extent_get_start(ext),
48993+			   pos_in_unit);
48994+	reiser4_set_extent(&h->new_extents[0],
48995+			   extent_get_start(ext) + pos_in_unit,
48996+			   extent_get_width(ext) - pos_in_unit);
48997+ h->nr_new_extents = 1;
48998+ h->flags = COPI_DONT_SHIFT_LEFT;
48999+ h->paste_key = h->key;
49000+
49001+ /* reserve space for extent unit paste, @grabbed is reserved before */
49002+ grabbed = reserve_replace();
49003+	result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49004+						 extent */);
49005+ /* restore reserved */
49006+ free_replace_reserved(grabbed);
49007+ kfree(h);
49008+ return result;
49009+}
49010+
49011+/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
49012+ one). Return 1 if it succeeded, 0 - otherwise */
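+/* For illustration: with a left neighbor [alloc, start=100, width=10] and
+   @replace == [alloc, start=110, width=5], the two are glued into
+   [alloc, start=100, width=15] and @ext is narrowed, or removed when
+   @replace covers it completely. */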
49013+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
49014+ reiser4_extent *replace)
49015+{
49016+ assert("vs-1415", extent_by_coord(coord) == ext);
49017+
49018+ if (coord->unit_pos == 0
49019+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
49020+		/* left neighbor either does not exist or is not an allocated extent */
49021+ return 0;
49022+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
49023+ extent_get_start(replace))
49024+ return 0;
49025+
49026+ /* we can glue, widen previous unit */
49027+ extent_set_width(ext - 1,
49028+ extent_get_width(ext - 1) + extent_get_width(replace));
49029+
49030+ if (extent_get_width(ext) != extent_get_width(replace)) {
49031+ /* make current extent narrower */
49032+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
49033+ extent_set_start(ext,
49034+ extent_get_start(ext) +
49035+ extent_get_width(replace));
49036+ extent_set_width(ext,
49037+ extent_get_width(ext) -
49038+ extent_get_width(replace));
49039+ } else {
49040+ /* current extent completely glued with its left neighbor, remove it */
49041+ coord_t from, to;
49042+
49043+ coord_dup(&from, coord);
49044+ from.unit_pos = nr_units_extent(coord) - 1;
49045+ coord_dup(&to, &from);
49046+
49047+ /* currently cut from extent can cut either from the beginning or from the end. Move place which got
49048+ freed after unit removal to end of item */
49049+ memmove(ext, ext + 1,
49050+ (from.unit_pos -
49051+ coord->unit_pos) * sizeof(reiser4_extent));
49052+ /* wipe part of item which is going to be cut, so that node_check will not be confused */
49053+ cut_node_content(&from, &to, NULL, NULL, NULL);
49054+ }
49055+ znode_make_dirty(coord->node);
49056+ /* move coord back */
49057+ coord->unit_pos--;
49058+ return 1;
49059+}
49060+
49061+/**
49062+ * conv_extent - replace extent with two extents
49063+ * @coord: coordinate of extent to be replaced
49064+ * @replace: extent to overwrite the one @coord is set to
49065+ *
49066+ * Overwrites extent @coord is set to and paste one extent unit after
49067+ * overwritten one if @replace is shorter than initial extent
49068+ */
49069+static int conv_extent(coord_t *coord, reiser4_extent *replace)
49070+{
49071+ int result;
49072+ struct replace_handle *h;
49073+ reiser4_extent *ext;
49074+ reiser4_block_nr start, width, new_width;
49075+ reiser4_block_nr grabbed;
49076+ extent_state state;
49077+
49078+ ext = extent_by_coord(coord);
49079+ state = state_of_extent(ext);
49080+ start = extent_get_start(ext);
49081+ width = extent_get_width(ext);
49082+ new_width = extent_get_width(replace);
49083+
49084+ assert("vs-1458", (state == UNALLOCATED_EXTENT ||
49085+ state == ALLOCATED_EXTENT));
49086+ assert("vs-1459", width >= new_width);
49087+
49088+ if (try_to_merge_with_left(coord, ext, replace)) {
49089+ /* merged @replace with left neighbor. Current unit is either
49090+ removed or narrowed */
49091+ return 0;
49092+ }
49093+
49094+ if (width == new_width) {
49095+ /* replace current extent with @replace */
49096+ *ext = *replace;
49097+ znode_make_dirty(coord->node);
49098+ return 0;
49099+ }
49100+
49101+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49102+ if (h == NULL)
49103+ return RETERR(-ENOMEM);
49104+ h->coord = coord;
49105+ h->lh = znode_lh(coord->node);
49106+ h->pkey = &h->key;
49107+ unit_key_by_coord(coord, h->pkey);
49108+ set_key_offset(h->pkey,
49109+ (get_key_offset(h->pkey) + new_width * current_blocksize));
49110+ h->overwrite = *replace;
49111+
49112+ /* replace @ext with @replace and padding extent */
49113+ reiser4_set_extent(&h->new_extents[0],
49114+ (state == ALLOCATED_EXTENT) ?
49115+ (start + new_width) :
49116+ UNALLOCATED_EXTENT_START,
49117+ width - new_width);
49118+ h->nr_new_extents = 1;
49119+ h->flags = COPI_DONT_SHIFT_LEFT;
49120+ h->paste_key = h->key;
49121+
49122+ /* reserve space for extent unit paste, @grabbed is reserved before */
49123+ grabbed = reserve_replace();
49124+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
49125+ extent */);
49126+
49127+ /* restore reserved */
49128+ free_replace_reserved(grabbed);
49129+ kfree(h);
49130+ return result;
49131+}
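
The padding unit built by conv_extent() depends on the extent state: for an
allocated extent it points at the blocks left over after @replace, for an
unallocated one it keeps the UNALLOCATED_EXTENT_START sentinel. A sketch of
just that choice (userspace C; sentinel value taken from extent.h below):

    #include <stdint.h>

    #define UNALLOCATED_EXTENT_START 1ULL

    struct extent { uint64_t start, width; };

    /* build the unit that pads out the tail of a partially replaced extent */
    static struct extent padding(struct extent old, uint64_t new_width,
                                 int allocated)
    {
            struct extent pad;

            pad.start = allocated ? old.start + new_width
                                  : UNALLOCATED_EXTENT_START;
            pad.width = old.width - new_width;
            return pad;
    }
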
49132+
49133+/**
49134+ * assign_real_blocknrs
49135+ * @flush_pos: flush position
49136+ * @oid: objectid of the file the jnodes belong to
49137+ * @index: index of the first jnode in the range
49138+ * @count: number of jnodes to assign block numbers to
49139+ * @first: start of the allocated block range
49140+ *
49141+ * Assigns a block number to each of @count jnodes, starting with the jnode
49142+ * at index @index. Jnodes are looked up with jlookup.
49143+ */
49144+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
49145+ unsigned long index, reiser4_block_nr count,
49146+ reiser4_block_nr first)
49147+{
49148+ unsigned long i;
49149+ reiser4_tree *tree;
49150+ txn_atom *atom;
49151+ int nr;
49152+
49153+ atom = atom_locked_by_fq(flush_pos->fq);
49154+ assert("vs-1468", atom);
49155+ BUG_ON(atom == NULL);
49156+
49157+ nr = 0;
49158+ tree = current_tree;
49159+ for (i = 0; i < count; ++i, ++index) {
49160+ jnode *node;
49161+
49162+ node = jlookup(tree, oid, index);
49163+ assert("", node != NULL);
49164+ BUG_ON(node == NULL);
49165+
49166+ spin_lock_jnode(node);
49167+ assert("", !jnode_is_flushprepped(node));
49168+ assert("vs-1475", node->atom == atom);
49169+ assert("vs-1476", atomic_read(&node->x_count) > 0);
49170+
49171+ JF_CLR(node, JNODE_FLUSH_RESERVED);
49172+ jnode_set_block(node, &first);
49173+ unformatted_make_reloc(node, flush_pos->fq);
49174+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
49175+ FQ_LIST, 0));
49176+ spin_unlock_jnode(node);
49177+ first++;
49178+
49179+ atomic_dec(&node->x_count);
49180+ nr++;
49181+ }
49182+
49183+ spin_unlock_atom(atom);
49184+ return;
49185+}
49186+
49187+/**
49188+ * make_node_ovrwr - assign node to overwrite set
49189+ * @jnodes: overwrite set list head
49190+ * @node: jnode to belong to overwrite set
49191+ *
49192+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
49193+ * which is an accumulator for nodes before they get to overwrite set list of
49194+ * atom.
49195+ */
49196+static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
49197+{
49198+ spin_lock_jnode(node);
49199+
49200+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
49201+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
49202+
49203+ JF_SET(node, JNODE_OVRWR);
49204+ list_move_tail(&node->capture_link, jnodes);
49205+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
49206+
49207+ spin_unlock_jnode(node);
49208+}
49209+
49210+/**
49211+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
49212+ * @flush_pos: flush position
49213+ * @oid: objectid of file jnodes belong to
49214+ * @index: starting index
49215+ * @width: extent width
49216+ *
49217+ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
49218+ * overwrite set. Starting from the one with index @index. If end of slum is
49219+ * detected (node is not found or flushprepped) - stop iterating and set flush
49220+ * position's state to POS_INVALID.
49221+ */
49222+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
49223+ unsigned long index, reiser4_block_nr width)
49224+{
49225+ unsigned long i;
49226+ reiser4_tree *tree;
49227+ jnode *node;
49228+ txn_atom *atom;
49229+ LIST_HEAD(jnodes);
49230+
49231+ tree = current_tree;
49232+
49233+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49234+ assert("vs-1478", atom);
49235+
49236+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
49237+ node = jlookup(tree, oid, index);
49238+ if (!node) {
49239+ flush_pos->state = POS_INVALID;
49240+ break;
49241+ }
49242+ if (jnode_check_flushprepped(node)) {
49243+ flush_pos->state = POS_INVALID;
49244+ atomic_dec(&node->x_count);
49245+ break;
49246+ }
49247+ if (node->atom != atom) {
49248+ flush_pos->state = POS_INVALID;
49249+ atomic_dec(&node->x_count);
49250+ break;
49251+ }
49252+ make_node_ovrwr(&jnodes, node);
49253+ atomic_dec(&node->x_count);
49254+ }
49255+
49256+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
49257+ spin_unlock_atom(atom);
49258+}
49259+
49260+/**
49261+ * allocated_extent_slum_size
49262+ * @flush_pos: flush position
49263+ * @oid: objectid of the file the jnodes belong to
49264+ * @index: index of the first jnode to scan
49265+ * @count: maximal number of jnodes to scan
49266+ *
49267+ * Returns the number of consecutive not-flushprepped jnodes of the atom starting at index @index
49268+ */
49269+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
49270+ unsigned long index, unsigned long count)
49271+{
49272+ unsigned long i;
49273+ reiser4_tree *tree;
49274+ txn_atom *atom;
49275+ int nr;
49276+
49277+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
49278+ assert("vs-1468", atom);
49279+
49280+ nr = 0;
49281+ tree = current_tree;
49282+ for (i = 0; i < count; ++i, ++index) {
49283+ jnode *node;
49284+
49285+ node = jlookup(tree, oid, index);
49286+ if (!node)
49287+ break;
49288+
49289+ if (jnode_check_flushprepped(node)) {
49290+ atomic_dec(&node->x_count);
49291+ break;
49292+ }
49293+
49294+ if (node->atom != atom) {
49295+ /*
49296+ * this is possible on overwrite: extent_write may
49297+ * capture several unformatted nodes without capturing
49298+ * any formatted nodes.
49299+ */
49300+ atomic_dec(&node->x_count);
49301+ break;
49302+ }
49303+
49304+ assert("vs-1476", atomic_read(&node->x_count) > 1);
49305+ atomic_dec(&node->x_count);
49306+ nr++;
49307+ }
49308+
49309+ spin_unlock_atom(atom);
49310+ return nr;
49311+}
49312+
49313+/**
49314+ * reiser4_alloc_extent
49315+ * @flush_pos: flush position
49316+ *
49317+ *
49318+ * Called by handle_pos_on_twig to process the extent unit flush_pos->coord
49319+ * is set to. Prepares a sequence of not-flushprepped nodes (a slum) for
49320+ * flushing. Assumes that the slum starts at position flush_pos->pos_in_unit
49321+ * within the extent. The slum goes to the relocate set if
49322+ * flush_pos->leaf_relocate is set to 1, and to the overwrite set otherwise.
49323+ */
49324+int reiser4_alloc_extent(flush_pos_t *flush_pos)
49325+{
49326+ coord_t *coord;
49327+ reiser4_extent *ext;
49328+ reiser4_extent replace_ext;
49329+ oid_t oid;
49330+ reiser4_block_nr protected;
49331+ reiser4_block_nr start;
49332+ __u64 index;
49333+ __u64 width;
49334+ extent_state state;
49335+ int result;
49336+ reiser4_block_nr first_allocated;
49337+ __u64 allocated;
49338+ reiser4_key key;
49339+ block_stage_t block_stage;
49340+
49341+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
49342+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
49343+ && item_is_extent(&flush_pos->coord));
49344+
49345+ coord = &flush_pos->coord;
49346+
49347+ ext = extent_by_coord(coord);
49348+ state = state_of_extent(ext);
49349+ if (state == HOLE_EXTENT) {
49350+ flush_pos->state = POS_INVALID;
49351+ return 0;
49352+ }
49353+
49354+ item_key_by_coord(coord, &key);
49355+ oid = get_key_objectid(&key);
49356+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
49357+ start = extent_get_start(ext);
49358+ width = extent_get_width(ext);
49359+
49360+ assert("vs-1457", width > flush_pos->pos_in_unit);
49361+
49362+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
49363+ /* relocate */
49364+ if (flush_pos->pos_in_unit) {
49365+ /* split extent unit into two */
49366+ result =
49367+ split_allocated_extent(coord,
49368+ flush_pos->pos_in_unit);
49369+ flush_pos->pos_in_unit = 0;
49370+ return result;
49371+ }
49372+
49373+ /* limit number of nodes to allocate */
49374+ if (flush_pos->nr_to_write < width)
49375+ width = flush_pos->nr_to_write;
49376+
49377+ if (state == ALLOCATED_EXTENT) {
49378+ /*
49379+ * all protected nodes are not flushprepped, therefore
49380+ * they are counted as flush_reserved
49381+ */
49382+ block_stage = BLOCK_FLUSH_RESERVED;
49383+ protected = allocated_extent_slum_size(flush_pos, oid,
49384+ index, width);
49385+ if (protected == 0) {
49386+ flush_pos->state = POS_INVALID;
49387+ flush_pos->pos_in_unit = 0;
49388+ return 0;
49389+ }
49390+ } else {
49391+ block_stage = BLOCK_UNALLOCATED;
49392+ protected = width;
49393+ }
49394+
49395+ /*
49396+ * look at previous unit if possible. If it is allocated, make
49397+ * preceder more precise
49398+ */
49399+ if (coord->unit_pos &&
49400+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49401+ reiser4_pos_hint(flush_pos)->blk =
49402+ extent_get_start(ext - 1) +
49403+ extent_get_width(ext - 1);
49404+
49405+ /* allocate new block numbers for protected nodes */
49406+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49407+ protected,
49408+ &first_allocated, &allocated,
49409+ block_stage);
49410+
49411+ if (state == ALLOCATED_EXTENT)
49412+ /*
49413+ * on relocating - free nodes which are going to be
49414+ * relocated
49415+ */
49416+ reiser4_dealloc_blocks(&start, &allocated,
49417+ BLOCK_ALLOCATED, BA_DEFER);
49418+
49419+ /* assign new block numbers to protected nodes */
49420+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
49421+
49422+ /* prepare extent which will replace current one */
49423+ reiser4_set_extent(&replace_ext, first_allocated, allocated);
49424+
49425+ /* adjust extent item */
49426+ result = conv_extent(coord, &replace_ext);
49427+ if (result != 0 && result != -ENOMEM) {
49428+ warning("vs-1461",
49429+ "Failed to allocate extent. Should not happen\n");
49430+ return result;
49431+ }
49432+
49433+ /*
49434+ * break flush: we prepared for flushing as many blocks as we
49435+ * were asked for
49436+ */
49437+ if (flush_pos->nr_to_write == allocated)
49438+ flush_pos->state = POS_INVALID;
49439+ } else {
49440+ /* overwrite */
49441+ mark_jnodes_overwrite(flush_pos, oid, index, width);
49442+ }
49443+ flush_pos->pos_in_unit = 0;
49444+ return 0;
49445+}
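
The relocate/overwrite decision above boils down to one predicate: an
unallocated extent always has to be allocated, and an allocated one is
reallocated only when the flush algorithm decided to relocate leaves. As a
one-line model (enum values mirror extent.h below; not part of the patch):

    enum extent_state { HOLE_EXTENT, UNALLOCATED_EXTENT, ALLOCATED_EXTENT };

    /* relocate the slum iff this returns nonzero; otherwise it goes to
     * the overwrite set via mark_jnodes_overwrite() */
    static int should_relocate(int leaf_relocate, enum extent_state state)
    {
            return leaf_relocate || state == UNALLOCATED_EXTENT;
    }
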
49446+
49447+/* return 0 if @key is glueable to the item @coord is set to, 1 if a new item has to be inserted */
49448+static int must_insert(const coord_t *coord, const reiser4_key *key)
49449+{
49450+ reiser4_key last;
49451+
49452+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID
49453+ && keyeq(append_key_extent(coord, &last), key))
49454+ return 0;
49455+ return 1;
49456+}
49457+
49458+/* copy extent @copy_ext to the end of @node. It may have to either insert a new item after the last one, append to
49459+ the last item, or widen the last unit of the last item */
49460+static int put_unit_to_end(znode *node, const reiser4_key *key,
49461+ reiser4_extent *copy_ext)
49462+{
49463+ int result;
49464+ coord_t coord;
49465+ cop_insert_flag flags;
49466+ reiser4_extent *last_ext;
49467+ reiser4_item_data data;
49468+
49469+ /* set coord after last unit in an item */
49470+ coord_init_last_unit(&coord, node);
49471+ coord.between = AFTER_UNIT;
49472+
49473+ flags =
49474+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
49475+ if (must_insert(&coord, key)) {
49476+ result =
49477+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
49478+ key, NULL /*lh */ , flags);
49479+
49480+ } else {
49481+ /* try to glue with last unit */
49482+ last_ext = extent_by_coord(&coord);
49483+ if (state_of_extent(last_ext) &&
49484+ extent_get_start(last_ext) + extent_get_width(last_ext) ==
49485+ extent_get_start(copy_ext)) {
49486+ /* widen last unit of node */
49487+ extent_set_width(last_ext,
49488+ extent_get_width(last_ext) +
49489+ extent_get_width(copy_ext));
49490+ znode_make_dirty(node);
49491+ return 0;
49492+ }
49493+
49494+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
49495+ result =
49496+ insert_into_item(&coord, NULL /*lh */ , key,
49497+ init_new_extent(&data, copy_ext, 1),
49498+ flags);
49499+ }
49500+
49501+ assert("vs-438", result == 0 || result == -E_NODE_FULL);
49502+ return result;
49503+}
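
put_unit_to_end() thus has three outcomes: widen the last unit when the new
extent glues to it, paste a new unit when the keys are contiguous, or insert a
whole new item. A compact decision model, treating keys as plain byte offsets
within one file (a simplification; real reiser4 keys carry locality, objectid
and ordering too):

    #include <stdint.h>

    struct extent { uint64_t start, width; };

    enum put_kind { PUT_INSERT_ITEM, PUT_WIDEN_LAST, PUT_PASTE_UNIT };

    /* append_key is the offset just past the last unit of the last item */
    static enum put_kind classify_put(uint64_t append_key, uint64_t key,
                                      struct extent last, struct extent copy)
    {
            if (append_key != key)          /* must_insert() returns 1 */
                    return PUT_INSERT_ITEM;
            if (last.start > 0 /* not a hole */ &&
                last.start + last.width == copy.start)
                    return PUT_WIDEN_LAST;  /* glue with last unit */
            return PUT_PASTE_UNIT;
    }
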
49504+
49505+/* @coord is set to extent unit */
49506+squeeze_result squalloc_extent(znode *left, const coord_t *coord,
49507+ flush_pos_t *flush_pos,
49508+ reiser4_key *stop_key)
49509+{
49510+ reiser4_extent *ext;
49511+ __u64 index;
49512+ __u64 width;
49513+ reiser4_block_nr start;
49514+ extent_state state;
49515+ oid_t oid;
49516+ reiser4_block_nr first_allocated;
49517+ __u64 allocated;
49518+ __u64 protected;
49519+ reiser4_extent copy_extent;
49520+ reiser4_key key;
49521+ int result;
49522+ block_stage_t block_stage;
49523+
49524+ assert("vs-1457", flush_pos->pos_in_unit == 0);
49525+ assert("vs-1467", coord_is_leftmost_unit(coord));
49526+ assert("vs-1467", item_is_extent(coord));
49527+
49528+ ext = extent_by_coord(coord);
49529+ index = extent_unit_index(coord);
49530+ start = extent_get_start(ext);
49531+ width = extent_get_width(ext);
49532+ state = state_of_extent(ext);
49533+ unit_key_by_coord(coord, &key);
49534+ oid = get_key_objectid(&key);
49535+
49536+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
49537+ (state == UNALLOCATED_EXTENT)) {
49538+ /* relocate */
49539+ if (state == ALLOCATED_EXTENT) {
49540+ /* all protected nodes are not flushprepped, therefore
49541+ * they are counted as flush_reserved */
49542+ block_stage = BLOCK_FLUSH_RESERVED;
49543+ protected = allocated_extent_slum_size(flush_pos, oid,
49544+ index, width);
49545+ if (protected == 0) {
49546+ flush_pos->state = POS_INVALID;
49547+ flush_pos->pos_in_unit = 0;
49548+ return 0;
49549+ }
49550+ } else {
49551+ block_stage = BLOCK_UNALLOCATED;
49552+ protected = width;
49553+ }
49554+
49555+ /*
49556+ * look at previous unit if possible. If it is allocated, make
49557+ * preceder more precise
49558+ */
49559+ if (coord->unit_pos &&
49560+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49561+ reiser4_pos_hint(flush_pos)->blk =
49562+ extent_get_start(ext - 1) +
49563+ extent_get_width(ext - 1);
49564+
49565+ /* allocate new block numbers for protected nodes */
49566+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49567+ protected,
49568+ &first_allocated, &allocated,
49569+ block_stage);
49570+
49571+ /* prepare extent which will be copied to left */
49572+ reiser4_set_extent(&copy_extent, first_allocated, allocated);
49573+
49574+ result = put_unit_to_end(left, &key, &copy_extent);
49575+ if (result == -E_NODE_FULL) {
49576+ int target_block_stage;
49577+
49578+ /* free blocks which were just allocated */
49579+ target_block_stage =
49580+ (state ==
49581+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
49582+ BLOCK_UNALLOCATED;
49583+ reiser4_dealloc_blocks(&first_allocated, &allocated,
49584+ target_block_stage,
49585+ BA_PERMANENT);
49586+
49587+ /* rewind the preceder. */
49588+ flush_pos->preceder.blk = first_allocated;
49589+ check_preceder(flush_pos->preceder.blk);
49590+
49591+ return SQUEEZE_TARGET_FULL;
49592+ }
49593+
49594+ if (state == ALLOCATED_EXTENT) {
49595+ /* free nodes which were relocated */
49596+ reiser4_dealloc_blocks(&start, &allocated,
49597+ BLOCK_ALLOCATED, BA_DEFER);
49598+ }
49599+
49600+ /* assign new block numbers to protected nodes */
49601+ assign_real_blocknrs(flush_pos, oid, index, allocated,
49602+ first_allocated);
49603+
49604+ set_key_offset(&key,
49605+ get_key_offset(&key) +
49606+ (allocated << current_blocksize_bits));
49607+ } else {
49608+ /*
49609+ * overwrite: try to copy unit as it is to left neighbor and
49610+ * make all first not flushprepped nodes overwrite nodes
49611+ */
49612+ reiser4_set_extent(&copy_extent, start, width);
49613+ result = put_unit_to_end(left, &key, &copy_extent);
49614+ if (result == -E_NODE_FULL)
49615+ return SQUEEZE_TARGET_FULL;
49616+
49617+ if (state != HOLE_EXTENT)
49618+ mark_jnodes_overwrite(flush_pos, oid, index, width);
49619+ set_key_offset(&key,
49620+ get_key_offset(&key) +
49621+ (width << current_blocksize_bits));
49622+ }
49623+ *stop_key = key;
49624+ return SQUEEZE_CONTINUE;
49625+}
49626+
49627+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
49628+{
49629+ return key_by_inode_and_offset_common(inode, off, key);
49630+}
49631+
49632+/*
49633+ * Local variables:
49634+ * c-indentation-style: "K&R"
49635+ * mode-name: "LC"
49636+ * c-basic-offset: 8
49637+ * tab-width: 8
49638+ * fill-column: 79
49639+ * scroll-step: 1
49640+ * End:
49641+ */
49642diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent.h linux-2.6.22/fs/reiser4/plugin/item/extent.h
49643--- linux-2.6.22.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 03:00:00.000000000 +0300
49644+++ linux-2.6.22/fs/reiser4/plugin/item/extent.h 2007-07-29 00:25:34.968720289 +0400
49645@@ -0,0 +1,231 @@
49646+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49647+
49648+#ifndef __REISER4_EXTENT_H__
49649+#define __REISER4_EXTENT_H__
49650+
49651+/* on disk extent */
49652+typedef struct {
49653+ reiser4_dblock_nr start;
49654+ reiser4_dblock_nr width;
49655+} reiser4_extent;
49656+
49657+struct extent_stat {
49658+ int unallocated_units;
49659+ int unallocated_blocks;
49660+ int allocated_units;
49661+ int allocated_blocks;
49662+ int hole_units;
49663+ int hole_blocks;
49664+};
49665+
49666+/* extents in an extent item can be either holes, or unallocated or allocated
49667+ extents */
49668+typedef enum {
49669+ HOLE_EXTENT,
49670+ UNALLOCATED_EXTENT,
49671+ ALLOCATED_EXTENT
49672+} extent_state;
49673+
49674+#define HOLE_EXTENT_START 0
49675+#define UNALLOCATED_EXTENT_START 1
49676+#define UNALLOCATED_EXTENT_START2 2
49677+
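
state_of_extent() (declared further below; its body is not part of this hunk)
presumably maps the start field onto these states: start 0 marks a hole, start
1 or 2 an unallocated extent, and any larger value an allocated extent. That
reading is consistent with the "start < 2" test in reiser4_check_extent() and
the "start > 1" assertions in the setters. A hedged reconstruction:

    #include <stdint.h>

    enum extent_state { HOLE_EXTENT, UNALLOCATED_EXTENT, ALLOCATED_EXTENT };

    /* sketch of the start-to-state mapping implied by the sentinels */
    static enum extent_state state_of(uint64_t start)
    {
            switch (start) {
            case 0:                         /* HOLE_EXTENT_START */
                    return HOLE_EXTENT;
            case 1:                         /* UNALLOCATED_EXTENT_START */
            case 2:                         /* UNALLOCATED_EXTENT_START2 */
                    return UNALLOCATED_EXTENT;
            default:
                    return ALLOCATED_EXTENT;
            }
    }
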
49678+struct extent_coord_extension {
49679+ reiser4_block_nr pos_in_unit;
49680+ reiser4_block_nr width; /* width of current unit */
49681+ pos_in_node_t nr_units; /* number of units */
49682+ int ext_offset; /* offset from the beginning of zdata() */
49683+ unsigned long expected_page;
49684+#if REISER4_DEBUG
49685+ reiser4_extent extent;
49686+#endif
49687+};
49688+
49689+/* macros to set/get fields of on-disk extent */
49690+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
49691+{
49692+ return le64_to_cpu(ext->start);
49693+}
49694+
49695+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
49696+{
49697+ return le64_to_cpu(ext->width);
49698+}
49699+
49700+extern __u64 reiser4_current_block_count(void);
49701+
49702+static inline void
49703+extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
49704+{
49705+ cassert(sizeof(ext->start) == 8);
49706+ assert("nikita-2510",
49707+ ergo(start > 1, start < reiser4_current_block_count()));
49708+ put_unaligned(cpu_to_le64(start), &ext->start);
49709+}
49710+
49711+static inline void
49712+extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
49713+{
49714+ cassert(sizeof(ext->width) == 8);
49715+ assert("", width > 0);
49716+ put_unaligned(cpu_to_le64(width), &ext->width);
49717+ assert("nikita-2511",
49718+ ergo(extent_get_start(ext) > 1,
49719+ extent_get_start(ext) + width <=
49720+ reiser4_current_block_count()));
49721+}
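
The accessors above are little-endian, possibly-unaligned 64-bit loads and
stores (put_unaligned/cpu_to_le64 in kernel terms). A portable userspace
equivalent of the same idea, assembling the value byte by byte:

    #include <stdint.h>

    /* read a little-endian u64 from a possibly unaligned address */
    static uint64_t le64_get(const void *p)
    {
            const unsigned char *b = p;
            uint64_t v = 0;
            int i;

            for (i = 7; i >= 0; i--)
                    v = (v << 8) | b[i];
            return v;
    }

    /* write a little-endian u64 to a possibly unaligned address */
    static void le64_put(void *p, uint64_t v)
    {
            unsigned char *b = p;
            int i;

            for (i = 0; i < 8; i++, v >>= 8)
                    b[i] = (unsigned char)(v & 0xff);
    }
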
49722+
49723+#define extent_item(coord) \
49724+({ \
49725+ assert("nikita-3143", item_is_extent(coord)); \
49726+ ((reiser4_extent *)item_body_by_coord (coord)); \
49727+})
49728+
49729+#define extent_by_coord(coord) \
49730+({ \
49731+ assert("nikita-3144", item_is_extent(coord)); \
49732+ (extent_item (coord) + (coord)->unit_pos); \
49733+})
49734+
49735+#define width_by_coord(coord) \
49736+({ \
49737+ assert("nikita-3145", item_is_extent(coord)); \
49738+ extent_get_width (extent_by_coord(coord)); \
49739+})
49740+
49741+struct carry_cut_data;
49742+struct carry_kill_data;
49743+
49744+/* plugin->u.item.b.* */
49745+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
49746+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
49747+ const reiser4_item_data *);
49748+int mergeable_extent(const coord_t * p1, const coord_t * p2);
49749+pos_in_node_t nr_units_extent(const coord_t *);
49750+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
49751+void init_coord_extent(coord_t *);
49752+int init_extent(coord_t *, reiser4_item_data *);
49753+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
49754+int can_shift_extent(unsigned free_space,
49755+ coord_t * source, znode * target, shift_direction,
49756+ unsigned *size, unsigned want);
49757+void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
49758+ unsigned count, shift_direction where_is_free_space,
49759+ unsigned free_space);
49760+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
49761+ struct carry_kill_data *);
49762+int create_hook_extent(const coord_t * coord, void *arg);
49763+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49764+ struct carry_cut_data *, reiser4_key * smallest_removed,
49765+ reiser4_key * new_first);
49766+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49767+ struct carry_kill_data *, reiser4_key * smallest_removed,
49768+ reiser4_key * new_first);
49769+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
49770+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
49771+void print_extent(const char *, coord_t *);
49772+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
49773+int utmost_child_real_block_extent(const coord_t * coord, sideof side,
49774+ reiser4_block_nr * block);
49775+void item_stat_extent(const coord_t * coord, void *vp);
49776+int reiser4_check_extent(const coord_t * coord, const char **error);
49777+
49778+/* plugin->u.item.s.file.* */
49779+ssize_t reiser4_write_extent(struct file *, const char __user *,
49780+ size_t, loff_t *);
49781+int reiser4_read_extent(struct file *, flow_t *, hint_t *);
49782+int reiser4_readpage_extent(void *, struct page *);
49783+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
49784+reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
49785+void init_coord_extension_extent(uf_coord_t *, loff_t offset);
49786+int get_block_address_extent(const coord_t *, sector_t block,
49787+ sector_t * result);
49788+
49789+/* these are used in flush.c
49790+ FIXME-VS: should they be somewhere in item_plugin? */
49791+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
49792+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
49793+ reiser4_key * stop_key);
49794+
49795+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
49796+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
49797+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
49798+
49799+/* plugin->u.item.f. */
49800+int reiser4_scan_extent(flush_scan * scan);
49801+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
49802+
49803+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
49804+ int nr_extents);
49805+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
49806+extent_state state_of_extent(reiser4_extent * ext);
49807+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
49808+ reiser4_block_nr width);
49809+int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
49810+ int *plugged_hole);
49811+
49812+#include "../../coord.h"
49813+#include "../../lock.h"
49814+#include "../../tap.h"
49815+
49816+struct replace_handle {
49817+ /* these are to be set before calling reiser4_replace_extent */
49818+ coord_t *coord;
49819+ lock_handle *lh;
49820+ reiser4_key key;
49821+ reiser4_key *pkey;
49822+ reiser4_extent overwrite;
49823+ reiser4_extent new_extents[2];
49824+ int nr_new_extents;
49825+ unsigned flags;
49826+
49827+ /* these are used by reiser4_replace_extent */
49828+ reiser4_item_data item;
49829+ coord_t coord_after;
49830+ lock_handle lh_after;
49831+ tap_t watch;
49832+ reiser4_key paste_key;
49833+#if REISER4_DEBUG
49834+ reiser4_extent orig_ext;
49835+ reiser4_key tmp;
49836+#endif
49837+};
49838+
49839+/* this structure is kmalloced before calling make_extent to avoid excessive
49840+ stack consumption on plug_hole->reiser4_replace_extent */
49841+struct make_extent_handle {
49842+ uf_coord_t *uf_coord;
49843+ reiser4_block_nr blocknr;
49844+ int created;
49845+ struct inode *inode;
49846+ union {
49847+ struct {
49848+ } append;
49849+ struct replace_handle replace;
49850+ } u;
49851+};
49852+
49853+int reiser4_replace_extent(struct replace_handle *,
49854+ int return_inserted_position);
49855+lock_handle *znode_lh(znode *);
49856+
49857+/* the reiser4 repacker support */
49858+struct repacker_cursor;
49859+extern int process_extent_backward_for_repacking(tap_t *,
49860+ struct repacker_cursor *);
49861+extern int mark_extent_for_repacking(tap_t *, int);
49862+
49863+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
49864+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
49865+
49866+/* __REISER4_EXTENT_H__ */
49867+#endif
49868+/*
49869+ Local variables:
49870+ c-indentation-style: "K&R"
49871+ mode-name: "LC"
49872+ c-basic-offset: 8
49873+ tab-width: 8
49874+ fill-column: 120
49875+ End:
49876+*/
49877diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.22/fs/reiser4/plugin/item/extent_item_ops.c
49878--- linux-2.6.22.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 03:00:00.000000000 +0300
49879+++ linux-2.6.22/fs/reiser4/plugin/item/extent_item_ops.c 2007-07-29 00:25:34.968720289 +0400
49880@@ -0,0 +1,889 @@
49881+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49882+
49883+#include "item.h"
49884+#include "../../inode.h"
49885+#include "../../tree_walk.h" /* check_sibling_list() */
49886+#include "../../page_cache.h"
49887+#include "../../carry.h"
49888+
49889+#include <linux/quotaops.h>
49890+
49891+/* item_plugin->b.max_key_inside */
49892+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
49893+{
49894+ item_key_by_coord(coord, key);
49895+ set_key_offset(key, get_key_offset(reiser4_max_key()));
49896+ return key;
49897+}
49898+
49899+/* item_plugin->b.can_contain_key
49900+ this checks whether @key and @data match the position set by @coord */
49901+int
49902+can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
49903+ const reiser4_item_data * data)
49904+{
49905+ reiser4_key item_key;
49906+
49907+ if (item_plugin_by_coord(coord) != data->iplug)
49908+ return 0;
49909+
49910+ item_key_by_coord(coord, &item_key);
49911+ if (get_key_locality(key) != get_key_locality(&item_key) ||
49912+ get_key_objectid(key) != get_key_objectid(&item_key) ||
49913+ get_key_ordering(key) != get_key_ordering(&item_key))
49914+ return 0;
49915+
49916+ return 1;
49917+}
49918+
49919+/* item_plugin->b.mergeable
49920+ first item is of extent type */
49921+/* Audited by: green(2002.06.13) */
49922+int mergeable_extent(const coord_t * p1, const coord_t * p2)
49923+{
49924+ reiser4_key key1, key2;
49925+
49926+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
49927+ /* FIXME-VS: Which is it? Assert or return 0 */
49928+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
49929+ return 0;
49930+ }
49931+
49932+ item_key_by_coord(p1, &key1);
49933+ item_key_by_coord(p2, &key2);
49934+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
49935+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
49936+ get_key_ordering(&key1) != get_key_ordering(&key2) ||
49937+ get_key_type(&key1) != get_key_type(&key2))
49938+ return 0;
49939+ if (get_key_offset(&key1) +
49940+ reiser4_extent_size(p1, nr_units_extent(p1)) !=
49941+ get_key_offset(&key2))
49942+ return 0;
49943+ return 1;
49944+}
49945+
49946+/* item_plugin->b.nr_units */
49947+pos_in_node_t nr_units_extent(const coord_t * coord)
49948+{
49949+ /* length of extent item has to be multiple of extent size */
49950+ assert("vs-1424",
49951+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
49952+ return item_length_by_coord(coord) / sizeof(reiser4_extent);
49953+}
49954+
49955+/* item_plugin->b.lookup */
49956+lookup_result
49957+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
49958+ coord_t * coord)
49959+{ /* znode and item_pos are
49960+ set to an extent item to
49961+ look through */
49962+ reiser4_key item_key;
49963+ reiser4_block_nr lookuped, offset;
49964+ unsigned i, nr_units;
49965+ reiser4_extent *ext;
49966+ unsigned blocksize;
49967+ unsigned char blocksize_bits;
49968+
49969+ item_key_by_coord(coord, &item_key);
49970+ offset = get_key_offset(&item_key);
49971+
49972+ /* key we are looking for must be greater than key of item @coord */
49973+ assert("vs-414", keygt(key, &item_key));
49974+
49975+ assert("umka-99945",
49976+ !keygt(key, max_key_inside_extent(coord, &item_key)));
49977+
49978+ ext = extent_item(coord);
49979+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
49980+
49981+ blocksize = current_blocksize;
49982+ blocksize_bits = current_blocksize_bits;
49983+
49984+ /* offset we are looking for */
49985+ lookuped = get_key_offset(key);
49986+
49987+ nr_units = nr_units_extent(coord);
49988+ /* go through the extents until the one which addresses the given offset */
49989+ for (i = 0; i < nr_units; i++, ext++) {
49990+ offset += (extent_get_width(ext) << blocksize_bits);
49991+ if (offset > lookuped) {
49992+ /* desired byte is somewhere in this extent */
49993+ coord->unit_pos = i;
49994+ coord->between = AT_UNIT;
49995+ return CBK_COORD_FOUND;
49996+ }
49997+ }
49998+
49999+ /* set coord after last unit */
50000+ coord->unit_pos = nr_units - 1;
50001+ coord->between = AFTER_UNIT;
50002+ return CBK_COORD_FOUND;
50003+}
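
lookup_extent() is a linear scan: starting from the item key's offset, it adds
width << blocksize_bits per unit until the running byte offset passes the
target. A self-contained model (userspace C; it returns the unit index, or
nr_units when the offset lies past the item, where the kernel code instead
leaves coord at the last unit with between = AFTER_UNIT):

    #include <stdint.h>

    struct extent { uint64_t start, width; };

    static unsigned find_unit(const struct extent *ext, unsigned nr_units,
                              uint64_t item_off, uint64_t target,
                              unsigned blocksize_bits)
    {
            uint64_t off = item_off;
            unsigned i;

            for (i = 0; i < nr_units; i++) {
                    off += ext[i].width << blocksize_bits;
                    if (off > target)       /* target byte is in unit i */
                            return i;
            }
            return nr_units;                /* past the last unit */
    }
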
50004+
50005+/* item_plugin->b.paste
50006+ item @coord is set to has been appended with @data->length of free
50007+ space. data->data contains data to be pasted into the item in position
50008+ @coord->in_item.unit_pos. It must fit into that free space.
50009+ @coord must be set between units.
50010+*/
50011+int
50012+paste_extent(coord_t * coord, reiser4_item_data * data,
50013+ carry_plugin_info * info UNUSED_ARG)
50014+{
50015+ unsigned old_nr_units;
50016+ reiser4_extent *ext;
50017+ int item_length;
50018+
50019+ ext = extent_item(coord);
50020+ item_length = item_length_by_coord(coord);
50021+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50022+
50023+ /* this is also used to copy extent into newly created item, so
50024+ old_nr_units could be 0 */
50025+ assert("vs-260", item_length >= data->length);
50026+
50027+ /* make sure that coord is set properly */
50028+ assert("vs-35",
50029+ ((!coord_is_existing_unit(coord))
50030+ || (!old_nr_units && !coord->unit_pos)));
50031+
50032+ /* first unit to be moved */
50033+ switch (coord->between) {
50034+ case AFTER_UNIT:
50035+ coord->unit_pos++;
50036+ case BEFORE_UNIT:
50037+ coord->between = AT_UNIT;
50038+ break;
50039+ case AT_UNIT:
50040+ assert("vs-331", !old_nr_units && !coord->unit_pos);
50041+ break;
50042+ default:
50043+ impossible("vs-330", "coord is set improperly");
50044+ }
50045+
50046+ /* prepare space for new units */
50047+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50048+ ext + coord->unit_pos,
50049+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50050+
50051+ /* copy new data from kernel space */
50052+ assert("vs-556", data->user == 0);
50053+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50054+
50055+ /* after paste @coord is set to first of pasted units */
50056+ assert("vs-332", coord_is_existing_unit(coord));
50057+ assert("vs-333",
50058+ !memcmp(data->data, extent_by_coord(coord),
50059+ (unsigned)data->length));
50060+ return 0;
50061+}
50062+
50063+/* item_plugin->b.can_shift */
50064+int
50065+can_shift_extent(unsigned free_space, coord_t * source,
50066+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50067+ unsigned *size, unsigned want)
50068+{
50069+ *size = item_length_by_coord(source);
50070+ if (*size > free_space)
50071+ /* never split a unit of extent item */
50072+ *size = free_space - free_space % sizeof(reiser4_extent);
50073+
50074+ /* we can shift *size bytes, calculate how many do we want to shift */
50075+ if (*size > want * sizeof(reiser4_extent))
50076+ *size = want * sizeof(reiser4_extent);
50077+
50078+ if (*size % sizeof(reiser4_extent) != 0)
50079+ impossible("vs-119", "Wrong extent size: %i %zd", *size,
50080+ sizeof(reiser4_extent));
50081+ return *size / sizeof(reiser4_extent);
50082+
50083+}
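
The sizing rule in can_shift_extent() is: never split a unit, so round the
available space down to a whole number of units, then cap it by how many units
the caller wants. As plain arithmetic (unit would be sizeof(reiser4_extent),
i.e. two 64-bit on-disk fields):

    #include <stddef.h>

    /* number of whole extent units that can be shifted */
    static unsigned shiftable_units(unsigned item_len, unsigned free_space,
                                    unsigned want, size_t unit)
    {
            size_t size = item_len;

            if (size > free_space)
                    size = free_space - free_space % unit;
            if (size > want * unit)
                    size = want * unit;
            return (unsigned)(size / unit);
    }
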
50084+
50085+/* item_plugin->b.copy_units */
50086+void
50087+copy_units_extent(coord_t * target, coord_t * source,
50088+ unsigned from, unsigned count,
50089+ shift_direction where_is_free_space, unsigned free_space)
50090+{
50091+ char *from_ext, *to_ext;
50092+
50093+ assert("vs-217", free_space == count * sizeof(reiser4_extent));
50094+
50095+ from_ext = item_body_by_coord(source);
50096+ to_ext = item_body_by_coord(target);
50097+
50098+ if (where_is_free_space == SHIFT_LEFT) {
50099+ assert("vs-215", from == 0);
50100+
50101+ /* At this moment, item length was already updated in the item
50102+ header by shifting code, hence nr_units_extent() will
50103+ return "new" number of units---one we obtain after copying
50104+ units.
50105+ */
50106+ to_ext +=
50107+ (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50108+ } else {
50109+ reiser4_key key;
50110+ coord_t coord;
50111+
50112+ assert("vs-216",
50113+ from + count == coord_last_unit_pos(source) + 1);
50114+
50115+ from_ext += item_length_by_coord(source) - free_space;
50116+
50117+ /* new units are inserted before first unit in an item,
50118+ therefore, we have to update item key */
50119+ coord = *source;
50120+ coord.unit_pos = from;
50121+ unit_key_extent(&coord, &key);
50122+
50123+ node_plugin_by_node(target->node)->update_item_key(target, &key,
50124+ NULL /*info */);
50125+ }
50126+
50127+ memcpy(to_ext, from_ext, free_space);
50128+}
50129+
50130+/* item_plugin->b.create_hook
50131+ @arg is znode of leaf node for which we need to update right delimiting key */
50132+int create_hook_extent(const coord_t * coord, void *arg)
50133+{
50134+ coord_t *child_coord;
50135+ znode *node;
50136+ reiser4_key key;
50137+ reiser4_tree *tree;
50138+
50139+ if (!arg)
50140+ return 0;
50141+
50142+ child_coord = arg;
50143+ tree = znode_get_tree(coord->node);
50144+
50145+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50146+
50147+ write_lock_tree(tree);
50148+ write_lock_dk(tree);
50149+ /* find a node on the left level for which right delimiting key has to
50150+ be updated */
50151+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50152+ assert("vs-411", znode_is_left_connected(child_coord->node));
50153+ node = child_coord->node->left;
50154+ } else {
50155+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50156+ node = child_coord->node;
50157+ assert("nikita-3314", node != NULL);
50158+ }
50159+
50160+ if (node != NULL) {
50161+ znode_set_rd_key(node, item_key_by_coord(coord, &key));
50162+
50163+ assert("nikita-3282", check_sibling_list(node));
50164+ /* break sibling links */
50165+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50166+ ON_DEBUG(node->right->left_version =
50167+ atomic_inc_return(&delim_key_version);
50168+ node->right_version =
50169+ atomic_inc_return(&delim_key_version););
50170+
50171+ node->right->left = NULL;
50172+ node->right = NULL;
50173+ }
50174+ }
50175+ write_unlock_dk(tree);
50176+ write_unlock_tree(tree);
50177+ return 0;
50178+}
50179+
50180+#define ITEM_TAIL_KILLED 0
50181+#define ITEM_HEAD_KILLED 1
50182+#define ITEM_KILLED 2
50183+
50184+/* item_plugin->b.kill_hook
50185+ this is called when @count units starting from @from-th one are going to be removed
50186+ */
50187+int
50188+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50189+ struct carry_kill_data *kdata)
50190+{
50191+ reiser4_extent *ext;
50192+ reiser4_block_nr start, length;
50193+ const reiser4_key *pfrom_key, *pto_key;
50194+ struct inode *inode;
50195+ reiser4_tree *tree;
50196+ pgoff_t from_off, to_off, offset, skip;
50197+ int retval;
50198+
50199+ /* these are located in memory kmalloc-ed by kill_node_content */
50200+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50201+ coord_t *dup, *next;
50202+
50203+ assert("zam-811", znode_is_write_locked(coord->node));
50204+ assert("nikita-3315", kdata != NULL);
50205+ assert("vs-34", kdata->buf != NULL);
50206+
50207+ /* map structures to kdata->buf */
50208+ min_item_key = (reiser4_key *) (kdata->buf);
50209+ max_item_key = min_item_key + 1;
50210+ from_key = max_item_key + 1;
50211+ to_key = from_key + 1;
50212+ key = to_key + 1;
50213+ dup = (coord_t *) (key + 1);
50214+ next = dup + 1;
50215+
50216+ item_key_by_coord(coord, min_item_key);
50217+ max_item_key_by_coord(coord, max_item_key);
50218+
50219+ if (kdata->params.from_key) {
50220+ pfrom_key = kdata->params.from_key;
50221+ pto_key = kdata->params.to_key;
50222+ } else {
50223+ assert("vs-1549", from == coord->unit_pos);
50224+ unit_key_by_coord(coord, from_key);
50225+ pfrom_key = from_key;
50226+
50227+ coord_dup(dup, coord);
50228+ dup->unit_pos = from + count - 1;
50229+ max_unit_key_by_coord(dup, to_key);
50230+ pto_key = to_key;
50231+ }
50232+
50233+ if (!keylt(pto_key, max_item_key)) {
50234+ if (!keygt(pfrom_key, min_item_key)) {
50235+ znode *left, *right;
50236+
50237+ /* item is to be removed completely */
50238+ assert("nikita-3316", kdata->left != NULL
50239+ && kdata->right != NULL);
50240+
50241+ left = kdata->left->node;
50242+ right = kdata->right->node;
50243+
50244+ tree = current_tree;
50245+ /* we have to do two things:
50246+ *
50247+ * 1. link left and right formatted neighbors of
50248+ * extent being removed, and
50249+ *
50250+ * 2. update their delimiting keys.
50251+ *
50252+ * atomicity of these operations is protected by
50253+ * taking dk-lock and tree-lock.
50254+ */
50255+ /* if neighbors of item being removed are znodes -
50256+ * link them */
50257+ write_lock_tree(tree);
50258+ write_lock_dk(tree);
50259+ link_left_and_right(left, right);
50260+ if (left) {
50261+ /* update right delimiting key of left
50262+ * neighbor of extent item */
50263+ /*coord_t next;
50264+ reiser4_key key; */
50265+
50266+ coord_dup(next, coord);
50267+
50268+ if (coord_next_item(next))
50269+ *key = *znode_get_rd_key(coord->node);
50270+ else
50271+ item_key_by_coord(next, key);
50272+ znode_set_rd_key(left, key);
50273+ }
50274+ write_unlock_dk(tree);
50275+ write_unlock_tree(tree);
50276+
50277+ from_off =
50278+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
50279+ to_off =
50280+ (get_key_offset(max_item_key) +
50281+ 1) >> PAGE_CACHE_SHIFT;
50282+ retval = ITEM_KILLED;
50283+ } else {
50284+ /* tail of item is to be removed */
50285+ from_off =
50286+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
50287+ 1) >> PAGE_CACHE_SHIFT;
50288+ to_off =
50289+ (get_key_offset(max_item_key) +
50290+ 1) >> PAGE_CACHE_SHIFT;
50291+ retval = ITEM_TAIL_KILLED;
50292+ }
50293+ } else {
50294+ /* head of item is to be removed */
50295+ assert("vs-1571", keyeq(pfrom_key, min_item_key));
50296+ assert("vs-1572",
50297+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
50298+ 0);
50299+ assert("vs-1573",
50300+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50301+ 1)) == 0);
50302+
50303+ if (kdata->left->node) {
50304+ /* update right delimiting key of left neighbor of extent item */
50305+ /*reiser4_key key; */
50306+
50307+ *key = *pto_key;
50308+ set_key_offset(key, get_key_offset(pto_key) + 1);
50309+
50310+ write_lock_dk(current_tree);
50311+ znode_set_rd_key(kdata->left->node, key);
50312+ write_unlock_dk(current_tree);
50313+ }
50314+
50315+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
50316+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
50317+ retval = ITEM_HEAD_KILLED;
50318+ }
50319+
50320+ inode = kdata->inode;
50321+ assert("vs-1545", inode != NULL);
50322+ if (inode != NULL)
50323+ /* take care of pages and jnodes corresponding to part of item being killed */
50324+ reiser4_invalidate_pages(inode->i_mapping, from_off,
50325+ to_off - from_off,
50326+ kdata->params.truncate);
50327+
50328+ ext = extent_item(coord) + from;
50329+ offset =
50330+ (get_key_offset(min_item_key) +
50331+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
50332+
50333+ assert("vs-1551", from_off >= offset);
50334+ assert("vs-1552", from_off - offset <= extent_get_width(ext));
50335+ skip = from_off - offset;
50336+ offset = from_off;
50337+
50338+ while (offset < to_off) {
50339+ length = extent_get_width(ext) - skip;
50340+ if (state_of_extent(ext) == HOLE_EXTENT) {
50341+ skip = 0;
50342+ offset += length;
50343+ ext++;
50344+ continue;
50345+ }
50346+
50347+ if (offset + length > to_off) {
50348+ length = to_off - offset;
50349+ }
50350+
50351+ DQUOT_FREE_BLOCK_NODIRTY(inode, length);
50352+
50353+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50354+ /* some jnodes corresponding to this unallocated extent */
50355+ fake_allocated2free(length, 0 /* unformatted */ );
50356+
50357+ skip = 0;
50358+ offset += length;
50359+ ext++;
50360+ continue;
50361+ }
50362+
50363+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
50364+
50365+ if (length != 0) {
50366+ start = extent_get_start(ext) + skip;
50367+
50368+ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
50369+ immediately */
50370+ reiser4_dealloc_blocks(&start, &length,
50371+ 0 /* not used */ ,
50372+ BA_DEFER
50373+ /* unformatted with defer */ );
50374+ }
50375+ skip = 0;
50376+ offset += length;
50377+ ext++;
50378+ }
50379+ return retval;
50380+}
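
Note how kill_hook_extent() converts the key range into page indices before
invalidating pages: a tail kill rounds the first byte up to a page boundary,
so a partially covered first page survives, while the end is always the page
just past the last killed byte. In plain arithmetic (4KiB pages assumed for
illustration):

    #include <stdint.h>

    #define PAGE_SHIFT 12                   /* assumption: 4KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* index of the first page lying fully inside [from_byte, ...) */
    static uint64_t first_full_page(uint64_t from_byte)
    {
            return (from_byte + PAGE_SIZE - 1) >> PAGE_SHIFT;
    }

    /* index of the page just past an inclusive last byte */
    static uint64_t page_after(uint64_t max_byte)
    {
            return (max_byte + 1) >> PAGE_SHIFT;
    }
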
50381+
50382+/* item_plugin->b.kill_units */
50383+int
50384+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50385+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
50386+ reiser4_key * new_first)
50387+{
50388+ reiser4_extent *ext;
50389+ reiser4_key item_key;
50390+ pos_in_node_t count;
50391+ reiser4_key from_key, to_key;
50392+ const reiser4_key *pfrom_key, *pto_key;
50393+ loff_t off;
50394+ int result;
50395+
50396+ assert("vs-1541",
50397+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
50398+ || (kdata->params.from_key != NULL
50399+ && kdata->params.to_key != NULL)));
50400+
50401+ if (kdata->params.from_key) {
50402+ pfrom_key = kdata->params.from_key;
50403+ pto_key = kdata->params.to_key;
50404+ } else {
50405+ coord_t dup;
50406+
50407+ /* calculate key range of kill */
50408+ assert("vs-1549", from == coord->unit_pos);
50409+ unit_key_by_coord(coord, &from_key);
50410+ pfrom_key = &from_key;
50411+
50412+ coord_dup(&dup, coord);
50413+ dup.unit_pos = to;
50414+ max_unit_key_by_coord(&dup, &to_key);
50415+ pto_key = &to_key;
50416+ }
50417+
50418+ item_key_by_coord(coord, &item_key);
50419+
50420+#if REISER4_DEBUG
50421+ {
50422+ reiser4_key max_item_key;
50423+
50424+ max_item_key_by_coord(coord, &max_item_key);
50425+
50426+ if (new_first) {
50427+ /* head of item is to be cut */
50428+ assert("vs-1542", keyeq(pfrom_key, &item_key));
50429+ assert("vs-1538", keylt(pto_key, &max_item_key));
50430+ } else {
50431+ /* tail of item is to be cut */
50432+ assert("vs-1540", keygt(pfrom_key, &item_key));
50433+ assert("vs-1543", !keylt(pto_key, &max_item_key));
50434+ }
50435+ }
50436+#endif
50437+
50438+ if (smallest_removed)
50439+ *smallest_removed = *pfrom_key;
50440+
50441+ if (new_first) {
50442+ /* item head is cut. Item key will change. This new key is calculated here */
50443+ assert("vs-1556",
50444+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50445+ (PAGE_CACHE_SIZE - 1));
50446+ *new_first = *pto_key;
50447+ set_key_offset(new_first, get_key_offset(new_first) + 1);
50448+ }
50449+
50450+ count = to - from + 1;
50451+ result = kill_hook_extent(coord, from, count, kdata);
50452+ if (result == ITEM_TAIL_KILLED) {
50453+ assert("vs-1553",
50454+ get_key_offset(pfrom_key) >=
50455+ get_key_offset(&item_key) +
50456+ reiser4_extent_size(coord, from));
50457+ off =
50458+ get_key_offset(pfrom_key) -
50459+ (get_key_offset(&item_key) +
50460+ reiser4_extent_size(coord, from));
50461+ if (off) {
50462+ /* unit @from is to be cut partially. Its width decreases */
50463+ ext = extent_item(coord) + from;
50464+ extent_set_width(ext,
50465+ (off + PAGE_CACHE_SIZE -
50466+ 1) >> PAGE_CACHE_SHIFT);
50467+ count--;
50468+ }
50469+ } else {
50470+ __u64 max_to_offset;
50471+ __u64 rest;
50472+
50473+ assert("vs-1575", result == ITEM_HEAD_KILLED);
50474+ assert("", from == 0);
50475+ assert("",
50476+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50477+ 1)) == 0);
50478+ assert("",
50479+ get_key_offset(pto_key) + 1 >
50480+ get_key_offset(&item_key) +
50481+ reiser4_extent_size(coord, to));
50482+ max_to_offset =
50483+ get_key_offset(&item_key) +
50484+ reiser4_extent_size(coord, to + 1) - 1;
50485+ assert("", get_key_offset(pto_key) <= max_to_offset);
50486+
50487+ rest =
50488+ (max_to_offset -
50489+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
50490+ if (rest) {
50491+ /* unit @to is to be cut partially */
50492+ ext = extent_item(coord) + to;
50493+
50494+ assert("", extent_get_width(ext) > rest);
50495+
50496+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
50497+ extent_set_start(ext,
50498+ extent_get_start(ext) +
50499+ (extent_get_width(ext) -
50500+ rest));
50501+
50502+ extent_set_width(ext, rest);
50503+ count--;
50504+ }
50505+ }
50506+ return count * sizeof(reiser4_extent);
50507+}
50508+
50509+/* item_plugin->b.cut_units
50510+ this is too similar to kill_units_extent */
50511+int
50512+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
50513+ struct carry_cut_data *cdata, reiser4_key * smallest_removed,
50514+ reiser4_key * new_first)
50515+{
50516+ reiser4_extent *ext;
50517+ reiser4_key item_key;
50518+ pos_in_node_t count;
50519+ reiser4_key from_key, to_key;
50520+ const reiser4_key *pfrom_key, *pto_key;
50521+ loff_t off;
50522+
50523+ assert("vs-1541",
50524+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
50525+ || (cdata->params.from_key != NULL
50526+ && cdata->params.to_key != NULL)));
50527+
50528+ if (cdata->params.from_key) {
50529+ pfrom_key = cdata->params.from_key;
50530+ pto_key = cdata->params.to_key;
50531+ } else {
50532+ coord_t dup;
50533+
50534+ /* calculate key range of cut */
50535+ coord_dup(&dup, coord);
50536+ dup.unit_pos = from;
50537+ unit_key_by_coord(&dup, &from_key);
50538+
50539+ dup.unit_pos = to;
50540+ max_unit_key_by_coord(&dup, &to_key);
50541+
50542+ pfrom_key = &from_key;
50543+ pto_key = &to_key;
50544+ }
50545+
50546+ assert("vs-1555",
50547+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
50548+ assert("vs-1556",
50549+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
50550+ (PAGE_CACHE_SIZE - 1));
50551+
50552+ item_key_by_coord(coord, &item_key);
50553+
50554+#if REISER4_DEBUG
50555+ {
50556+ reiser4_key max_item_key;
50557+
50558+ assert("vs-1584",
50559+ get_key_locality(pfrom_key) ==
50560+ get_key_locality(&item_key));
50561+ assert("vs-1585",
50562+ get_key_type(pfrom_key) == get_key_type(&item_key));
50563+ assert("vs-1586",
50564+ get_key_objectid(pfrom_key) ==
50565+ get_key_objectid(&item_key));
50566+ assert("vs-1587",
50567+ get_key_ordering(pfrom_key) ==
50568+ get_key_ordering(&item_key));
50569+
50570+ max_item_key_by_coord(coord, &max_item_key);
50571+
50572+ if (new_first != NULL) {
50573+ /* head of item is to be cut */
50574+ assert("vs-1542", keyeq(pfrom_key, &item_key));
50575+ assert("vs-1538", keylt(pto_key, &max_item_key));
50576+ } else {
50577+ /* tail of item is to be cut */
50578+ assert("vs-1540", keygt(pfrom_key, &item_key));
50579+ assert("vs-1543", keyeq(pto_key, &max_item_key));
50580+ }
50581+ }
50582+#endif
50583+
50584+ if (smallest_removed)
50585+ *smallest_removed = *pfrom_key;
50586+
50587+ if (new_first) {
50588+ /* item head is cut. Item key will change. This new key is calculated here */
50589+ *new_first = *pto_key;
50590+ set_key_offset(new_first, get_key_offset(new_first) + 1);
50591+ }
50592+
50593+ count = to - from + 1;
50594+
50595+ assert("vs-1553",
50596+ get_key_offset(pfrom_key) >=
50597+ get_key_offset(&item_key) + reiser4_extent_size(coord, from));
50598+ off =
50599+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
50600+ reiser4_extent_size(coord, from));
50601+ if (off) {
50602+ /* tail of unit @from is to be cut partially. Its width decreases */
50603+ assert("vs-1582", new_first == NULL);
50604+ ext = extent_item(coord) + from;
50605+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
50606+ count--;
50607+ }
50608+
50609+ assert("vs-1554",
50610+ get_key_offset(pto_key) <=
50611+ get_key_offset(&item_key) +
50612+ reiser4_extent_size(coord, to + 1) - 1);
50613+ off =
50614+ (get_key_offset(&item_key) +
50615+ reiser4_extent_size(coord, to + 1) - 1) -
50616+ get_key_offset(pto_key);
50617+ if (off) {
50618+ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
50619+ and width decreased. */
50620+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
50621+ ext = extent_item(coord) + to;
50622+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
50623+ extent_set_start(ext,
50624+ extent_get_start(ext) +
50625+ (extent_get_width(ext) -
50626+ (off >> PAGE_CACHE_SHIFT)));
50627+
50628+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
50629+ count--;
50630+ }
50631+ return count * sizeof(reiser4_extent);
50632+}
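
kill_units_extent() and cut_units_extent() share the partial-cut arithmetic:
a tail cut shrinks unit @from to off >> PAGE_SHIFT blocks, while a head cut
keeps the last off >> PAGE_SHIFT blocks of unit @to and, for an allocated
extent, advances its start past the removed blocks. Condensed (hypothetical
struct, 4KiB pages assumed):

    #include <stdint.h>

    #define PAGE_SHIFT 12                   /* assumption: 4KiB pages */

    struct extent { uint64_t start, width; int allocated; };

    /* keep only the first off bytes of the unit (its tail is cut) */
    static void cut_tail(struct extent *ext, uint64_t off)
    {
            ext->width = off >> PAGE_SHIFT;
    }

    /* keep only the last off bytes of the unit (its head is cut) */
    static void cut_head(struct extent *ext, uint64_t off)
    {
            uint64_t keep = off >> PAGE_SHIFT;

            if (ext->allocated)
                    ext->start += ext->width - keep;
            ext->width = keep;
    }
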
50633+
50634+/* item_plugin->b.unit_key */
50635+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
50636+{
50637+ assert("vs-300", coord_is_existing_unit(coord));
50638+
50639+ item_key_by_coord(coord, key);
50640+ set_key_offset(key,
50641+ (get_key_offset(key) +
50642+ reiser4_extent_size(coord, coord->unit_pos)));
50643+
50644+ return key;
50645+}
50646+
50647+/* item_plugin->b.max_unit_key */
50648+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
50649+{
50650+ assert("vs-300", coord_is_existing_unit(coord));
50651+
50652+ item_key_by_coord(coord, key);
50653+ set_key_offset(key,
50654+ (get_key_offset(key) +
50655+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
50656+ return key;
50657+}
50658+
50659+/* item_plugin->b.estimate
50660+ item_plugin->b.item_data_by_flow */
50661+
50662+#if REISER4_DEBUG
50663+
50664+/* item_plugin->b.check
50665+ used for debugging, every item should have here the most complete
50666+ possible check of the consistency of the item that the inventor can
50667+ construct
50668+*/
50669+int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
50670+ const char **error /* where to store error message */)
50671+{
50672+ reiser4_extent *ext, *first;
50673+ unsigned i, j;
50674+ reiser4_block_nr start, width, blk_cnt;
50675+ unsigned num_units;
50676+ reiser4_tree *tree;
50677+ oid_t oid;
50678+ reiser4_key key;
50679+ coord_t scan;
50680+
50681+ assert("vs-933", REISER4_DEBUG);
50682+
50683+ if (znode_get_level(coord->node) != TWIG_LEVEL) {
50684+ *error = "Extent on the wrong level";
50685+ return -1;
50686+ }
50687+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
50688+ *error = "Wrong item size";
50689+ return -1;
50690+ }
50691+ ext = first = extent_item(coord);
50692+ blk_cnt = reiser4_block_count(reiser4_get_current_sb());
50693+ num_units = coord_num_units(coord);
50694+ tree = znode_get_tree(coord->node);
50695+ item_key_by_coord(coord, &key);
50696+ oid = get_key_objectid(&key);
50697+ coord_dup(&scan, coord);
50698+
50699+ for (i = 0; i < num_units; ++i, ++ext) {
50700+ __u64 index;
50701+
50702+ scan.unit_pos = i;
50703+ index = extent_unit_index(&scan);
50704+
50705+#if 0
50706+ /* check that all jnodes are present for the unallocated
50707+ * extent */
50708+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
50709+ for (j = 0; j < extent_get_width(ext); j++) {
50710+ jnode *node;
50711+
50712+ node = jlookup(tree, oid, index + j);
50713+ if (node == NULL) {
50714+ print_coord("scan", &scan, 0);
50715+ *error = "Jnode missing";
50716+ return -1;
50717+ }
50718+ jput(node);
50719+ }
50720+ }
50721+#endif
50722+
50723+ start = extent_get_start(ext);
50724+ if (start < 2)
50725+ continue;
50726+ /* extent is allocated one */
50727+ width = extent_get_width(ext);
50728+ if (start >= blk_cnt) {
50729+ *error = "Start too large";
50730+ return -1;
50731+ }
50732+ if (start + width > blk_cnt) {
50733+ *error = "End too large";
50734+ return -1;
50735+ }
50736+ /* make sure that this extent does not overlap with other
50737+ allocated extents */
50738+ for (j = 0; j < i; j++) {
50739+ if (state_of_extent(first + j) != ALLOCATED_EXTENT)
50740+ continue;
50741+ if (!
50742+ ((extent_get_start(ext) >=
50743+ extent_get_start(first + j) +
50744+ extent_get_width(first + j))
50745+ || (extent_get_start(ext) +
50746+ extent_get_width(ext) <=
50747+ extent_get_start(first + j)))) {
50748+ *error = "Extent overlaps with others";
50749+ return -1;
50750+ }
50751+ }
50752+
50753+ }
50754+
50755+ return 0;
50756+}
50757+
50758+#endif /* REISER4_DEBUG */
50759+
50760+/*
50761+ Local variables:
50762+ c-indentation-style: "K&R"
50763+ mode-name: "LC"
50764+ c-basic-offset: 8
50765+ tab-width: 8
50766+ fill-column: 120
50767+ scroll-step: 1
50768+ End:
50769+*/
50770diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/internal.c linux-2.6.22/fs/reiser4/plugin/item/internal.c
50771--- linux-2.6.22.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
50772+++ linux-2.6.22/fs/reiser4/plugin/item/internal.c 2007-07-29 00:25:34.968720289 +0400
50773@@ -0,0 +1,396 @@
50774+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50775+
50776+/* Implementation of internal-item plugin methods. */
50777+
50778+#include "../../forward.h"
50779+#include "../../debug.h"
50780+#include "../../dformat.h"
50781+#include "../../key.h"
50782+#include "../../coord.h"
50783+#include "internal.h"
50784+#include "item.h"
50785+#include "../node/node.h"
50786+#include "../plugin.h"
50787+#include "../../jnode.h"
50788+#include "../../znode.h"
50789+#include "../../tree_walk.h"
50790+#include "../../tree_mod.h"
50791+#include "../../tree.h"
50792+#include "../../super.h"
50793+#include "../../block_alloc.h"
50794+
50795+/* see internal.h for explanation */
50796+
50797+/* plugin->u.item.b.mergeable */
50798+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
50799+ const coord_t * p2 UNUSED_ARG /* second item */ )
50800+{
50801+ /* internal items are not mergeable */
50802+ return 0;
50803+}
50804+
50805+/* ->lookup() method for internal items */
50806+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
50807+ lookup_bias bias UNUSED_ARG /* lookup bias */ ,
50808+ coord_t * coord /* coord of item */ )
50809+{
50810+ reiser4_key ukey;
50811+
50812+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
50813+ default:
50814+ impossible("", "keycmp()?!");
50815+ case LESS_THAN:
50816+ /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
50817+ item plugin can not be taken using coord set this way */
50818+ assert("vs-681", coord->unit_pos == 0);
50819+ coord->between = AFTER_UNIT; /* fall through */
50820+ case EQUAL_TO:
50821+ return CBK_COORD_FOUND;
50822+ case GREATER_THAN:
50823+ return CBK_COORD_NOTFOUND;
50824+ }
50825+}
50826+
50827+/* return body of internal item at @coord */
50828+static internal_item_layout *internal_at(const coord_t * coord /* coord of
50829+ * item */ )
50830+{
50831+ assert("nikita-607", coord != NULL);
50832+ assert("nikita-1650",
50833+ item_plugin_by_coord(coord) ==
50834+ item_plugin_by_id(NODE_POINTER_ID));
50835+ return (internal_item_layout *) item_body_by_coord(coord);
50836+}
50837+
50838+void reiser4_update_internal(const coord_t * coord,
50839+ const reiser4_block_nr * blocknr)
50840+{
50841+ internal_item_layout *item = internal_at(coord);
50842+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
50843+
50844+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
50845+}
50846+
50847+/* return child block number stored in the internal item at @coord */
50848+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
50849+{
50850+ assert("nikita-608", coord != NULL);
50851+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
50852+}
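reiser4_update_internal() and pointer_at() above are the two halves of the
on-disk pointer round-trip: the child block number is stored little-endian
and possibly unaligned. A sketch of the pattern in isolation, assuming only
the kernel's byte-order and unaligned-access helpers:

    #include <asm/byteorder.h>
    #include <asm/unaligned.h>

    /* store: CPU order -> little-endian, written without alignment
     * assumptions */
    static void store_blocknr(__le64 *slot, __u64 blocknr)
    {
            put_unaligned(cpu_to_le64(blocknr), slot);
    }

    /* load: read without alignment assumptions, little-endian -> CPU order */
    static __u64 load_blocknr(const __le64 *slot)
    {
            return le64_to_cpu(get_unaligned(slot));
    }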
50853+
50854+/* get znode pointed to by internal @item */
50855+static znode *znode_at(const coord_t * item /* coord of item */ ,
50856+ znode * parent /* parent node */ )
50857+{
50858+ return child_znode(item, parent, 1, 0);
50859+}
50860+
50861+/* store pointer from internal item into "block". Implementation of
50862+ ->down_link() method */
50863+void down_link_internal(const coord_t * coord /* coord of item */ ,
50864+ const reiser4_key * key UNUSED_ARG /* key to get
50865+ * pointer for */ ,
50866+ reiser4_block_nr * block /* resulting block number */ )
50867+{
50868+ ON_DEBUG(reiser4_key item_key);
50869+
50870+ assert("nikita-609", coord != NULL);
50871+ assert("nikita-611", block != NULL);
50872+ assert("nikita-612", (key == NULL) ||
50873+ /* twig horrors */
50874+ (znode_get_level(coord->node) == TWIG_LEVEL)
50875+ || keyle(item_key_by_coord(coord, &item_key), key));
50876+
50877+ *block = pointer_at(coord);
50878+ assert("nikita-2960", reiser4_blocknr_is_sane(block));
50879+}
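down_link_internal() is all a tree descent needs from an internal item. A
hedged sketch of the traversal step it enables, combining it with zlook()
exactly as utmost_child_internal() below does; the helper name is
hypothetical:

    static znode *descend_sketch(const coord_t *coord, const reiser4_key *key)
    {
            reiser4_block_nr block;

            /* resolve the down-link, then pin the child znode by block
             * number; the caller must check IS_ERR() and eventually zput()
             * the result */
            down_link_internal(coord, key, &block);
            return zlook(znode_get_tree(coord->node), &block);
    }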
50880+
50881+/* Get the child's block number, or 0 if the block is unallocated. */
50882+int
50883+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
50884+ reiser4_block_nr * block)
50885+{
50886+ assert("jmacd-2059", coord != NULL);
50887+
50888+ *block = pointer_at(coord);
50889+ assert("nikita-2961", reiser4_blocknr_is_sane(block));
50890+
50891+ if (reiser4_blocknr_is_fake(block)) {
50892+ *block = 0;
50893+ }
50894+
50895+ return 0;
50896+}
50897+
50898+/* Return the child. */
50899+int
50900+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
50901+ jnode ** childp)
50902+{
50903+ reiser4_block_nr block = pointer_at(coord);
50904+ znode *child;
50905+
50906+ assert("jmacd-2059", childp != NULL);
50907+ assert("nikita-2962", reiser4_blocknr_is_sane(&block));
50908+
50909+ child = zlook(znode_get_tree(coord->node), &block);
50910+
50911+ if (IS_ERR(child)) {
50912+ return PTR_ERR(child);
50913+ }
50914+
50915+ *childp = ZJNODE(child);
50916+
50917+ return 0;
50918+}
50919+
50920+#if REISER4_DEBUG
50921+
50922+static void check_link(znode * left, znode * right)
50923+{
50924+ znode *scan;
50925+
50926+ for (scan = left; scan != right; scan = scan->right) {
50927+ if (ZF_ISSET(scan, JNODE_RIP))
50928+ break;
50929+ if (znode_is_right_connected(scan) && scan->right != NULL) {
50930+ if (ZF_ISSET(scan->right, JNODE_RIP))
50931+ break;
50932+ assert("nikita-3285",
50933+ znode_is_left_connected(scan->right));
50934+ assert("nikita-3265",
50935+ ergo(scan != left,
50936+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
50937+ assert("nikita-3284", scan->right->left == scan);
50938+ } else
50939+ break;
50940+ }
50941+}
50942+
50943+int check__internal(const coord_t * coord, const char **error)
50944+{
50945+ reiser4_block_nr blk;
50946+ znode *child;
50947+ coord_t cpy;
50948+
50949+ blk = pointer_at(coord);
50950+ if (!reiser4_blocknr_is_sane(&blk)) {
50951+ *error = "Invalid pointer";
50952+ return -1;
50953+ }
50954+ coord_dup(&cpy, coord);
50955+ child = znode_at(&cpy, cpy.node);
50956+ if (child != NULL) {
50957+ znode *left_child;
50958+ znode *right_child;
50959+
50960+ left_child = right_child = NULL;
50961+
50962+ assert("nikita-3256", znode_invariant(child));
50963+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
50964+ left_child = znode_at(&cpy, cpy.node);
50965+ if (left_child != NULL) {
50966+ read_lock_tree(znode_get_tree(child));
50967+ check_link(left_child, child);
50968+ read_unlock_tree(znode_get_tree(child));
50969+ zput(left_child);
50970+ }
50971+ }
50972+ coord_dup(&cpy, coord);
50973+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
50974+ right_child = znode_at(&cpy, cpy.node);
50975+ if (right_child != NULL) {
50976+ read_lock_tree(znode_get_tree(child));
50977+ check_link(child, right_child);
50978+ read_unlock_tree(znode_get_tree(child));
50979+ zput(right_child);
50980+ }
50981+ }
50982+ zput(child);
50983+ }
50984+ return 0;
50985+}
50986+
50987+#endif /* REISER4_DEBUG */
50988+
50989+/* return true only if this item really points to "block" */
50990+/* Audited by: green(2002.06.14) */
50991+int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
50992+ const reiser4_block_nr * block /* block number to
50993+ * check */ )
50994+{
50995+ assert("nikita-613", coord != NULL);
50996+ assert("nikita-614", block != NULL);
50997+
50998+ return pointer_at(coord) == *block;
50999+}
51000+
51001+/* hook called by the ->create_item() method of the node plugin after a new
51002+ internal item has just been created.
51003+
51004+ This is the point where the pointer to the new node is inserted into the
51005+ tree. Initialize the parent pointer in the child znode, and insert the child into the sibling list and the slum.
51006+
51007+*/
51008+int create_hook_internal(const coord_t * item /* coord of item */ ,
51009+ void *arg /* child's left neighbor, if any */ )
51010+{
51011+ znode *child;
51012+ __u64 child_ptr;
51013+
51014+ assert("nikita-1252", item != NULL);
51015+ assert("nikita-1253", item->node != NULL);
51016+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51017+ assert("nikita-1450", item->unit_pos == 0);
51018+
51019+ /*
51020+ * while preparing for item insertion, build_child_ptr_data sets the
51021+ * pointer to the data to be inserted to the jnode's blocknr, which is
51022+ * in cpu byte order. The node's create_item simply copies those data,
51023+ * so we end up with a child pointer in cpu byte order. Convert the
51024+ * content of the internal item to little-endian byte order.
51025+ */
51026+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51027+ reiser4_update_internal(item, &child_ptr);
51028+
51029+ child = znode_at(item, item->node);
51030+ if (child != NULL && !IS_ERR(child)) {
51031+ znode *left;
51032+ int result = 0;
51033+ reiser4_tree *tree;
51034+
51035+ left = arg;
51036+ tree = znode_get_tree(item->node);
51037+ write_lock_tree(tree);
51038+ write_lock_dk(tree);
51039+ assert("nikita-1400", (child->in_parent.node == NULL)
51040+ || (znode_above_root(child->in_parent.node)));
51041+ ++item->node->c_count;
51042+ coord_to_parent_coord(item, &child->in_parent);
51043+ sibling_list_insert_nolock(child, left);
51044+
51045+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51046+ ZF_CLR(child, JNODE_ORPHAN);
51047+
51048+ if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51049+ znode_get_rd_key(child))) {
51050+ znode_set_rd_key(child, znode_get_rd_key(left));
51051+ }
51052+ write_unlock_dk(tree);
51053+ write_unlock_tree(tree);
51054+ zput(child);
51055+ return result;
51056+ } else {
51057+ if (child == NULL)
51058+ child = ERR_PTR(-EIO);
51059+ return PTR_ERR(child);
51060+ }
51061+}
51062+
51063+/* hook called by ->cut_and_kill() method of node plugin just before internal
51064+ item is removed.
51065+
51066+ This is the point where an empty node is removed from the tree. Clear the
51067+ parent pointer in the child, and mark the node for pending deletion.
51068+
51069+ The node will actually be deleted later, in several stages:
51070+
51071+ . when the last lock on this node is released, the node is removed from
51072+ the sibling list and its lock is invalidated
51073+
51074+ . when the last reference to this node is dropped, the bitmap is updated
51075+ and the node is actually removed from memory.
51076+
51077+*/
51078+int kill_hook_internal(const coord_t * item /* coord of item */ ,
51079+ pos_in_node_t from UNUSED_ARG /* start unit */ ,
51080+ pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51081+ struct carry_kill_data *p UNUSED_ARG)
51082+{
51083+ znode *child;
51084+
51085+ assert("nikita-1222", item != NULL);
51086+ assert("nikita-1224", from == 0);
51087+ assert("nikita-1225", count == 1);
51088+
51089+ child = znode_at(item, item->node);
51090+ if (IS_ERR(child))
51091+ return PTR_ERR(child);
51092+ else if (node_is_empty(child)) {
51093+ reiser4_tree *tree;
51094+
51095+ assert("nikita-1397", znode_is_write_locked(child));
51096+ assert("nikita-1398", child->c_count == 0);
51097+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51098+
51099+ tree = znode_get_tree(item->node);
51100+ write_lock_tree(tree);
51101+ init_parent_coord(&child->in_parent, NULL);
51102+ --item->node->c_count;
51103+ write_unlock_tree(tree);
51104+ zput(child);
51105+ return 0;
51106+ } else {
51107+ warning("nikita-1223",
51108+ "Cowardly refuse to remove link to non-empty node");
51109+ zput(child);
51110+ return RETERR(-EIO);
51111+ }
51112+}
51113+
51114+/* hook called by the ->shift() node plugin method when an internal item has
51115+ just been moved from one node to another.
51116+
51117+ Update parent pointer in child and c_counts in old and new parent
51118+
51119+*/
51120+int shift_hook_internal(const coord_t * item /* coord of item */ ,
51121+ unsigned from UNUSED_ARG /* start unit */ ,
51122+ unsigned count UNUSED_ARG /* stop unit */ ,
51123+ znode * old_node /* old parent */ )
51124+{
51125+ znode *child;
51126+ znode *new_node;
51127+ reiser4_tree *tree;
51128+
51129+ assert("nikita-1276", item != NULL);
51130+ assert("nikita-1277", from == 0);
51131+ assert("nikita-1278", count == 1);
51132+ assert("nikita-1451", item->unit_pos == 0);
51133+
51134+ new_node = item->node;
51135+ assert("nikita-2132", new_node != old_node);
51136+ tree = znode_get_tree(item->node);
51137+ child = child_znode(item, old_node, 1, 0);
51138+ if (child == NULL)
51139+ return 0;
51140+ if (!IS_ERR(child)) {
51141+ write_lock_tree(tree);
51142+ ++new_node->c_count;
51143+ assert("nikita-1395", znode_parent(child) == old_node);
51144+ assert("nikita-1396", old_node->c_count > 0);
51145+ coord_to_parent_coord(item, &child->in_parent);
51146+ assert("nikita-1781", znode_parent(child) == new_node);
51147+ assert("nikita-1782",
51148+ check_tree_pointer(item, child) == NS_FOUND);
51149+ --old_node->c_count;
51150+ write_unlock_tree(tree);
51151+ zput(child);
51152+ return 0;
51153+ } else
51154+ return PTR_ERR(child);
51155+}
51156+
51157+/* plugin->u.item.b.max_key_inside - not defined */
51158+
51159+/* plugin->u.item.b.nr_units - item.c:single_unit */
51160+
51161+/* Make Linus happy.
51162+ Local variables:
51163+ c-indentation-style: "K&R"
51164+ mode-name: "LC"
51165+ c-basic-offset: 8
51166+ tab-width: 8
51167+ fill-column: 120
51168+ End:
51169+*/
51170diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/internal.h linux-2.6.22/fs/reiser4/plugin/item/internal.h
51171--- linux-2.6.22.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
51172+++ linux-2.6.22/fs/reiser4/plugin/item/internal.h 2007-07-29 00:25:34.968720289 +0400
51173@@ -0,0 +1,57 @@
51174+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51175+/* Internal item contains down-link to the child of the internal/twig
51176+ node in a tree. It is internal items that are actually used during
51177+ tree traversal. */
51178+
51179+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51180+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51181+
51182+#include "../../forward.h"
51183+#include "../../dformat.h"
51184+
51185+/* on-disk layout of internal item */
51186+typedef struct internal_item_layout {
51187+ /* 0 */ reiser4_dblock_nr pointer;
51188+ /* 4 */
51189+} internal_item_layout;
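Since the whole on-disk payload of an internal item is this one
little-endian block pointer, a compile-time size check makes the layout
explicit. A sketch, assuming reiser4's cassert() compile-time assertion
helper and the on-disk block number type used elsewhere in the patch:

    cassert(sizeof(internal_item_layout) == sizeof(reiser4_dblock_nr));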
51190+
51191+struct cut_list;
51192+
51193+int mergeable_internal(const coord_t * p1, const coord_t * p2);
51194+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51195+ coord_t * coord);
51196+/* store pointer from internal item into "block". Implementation of
51197+ ->down_link() method */
51198+extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51199+ reiser4_block_nr * block);
51200+extern int has_pointer_to_internal(const coord_t * coord,
51201+ const reiser4_block_nr * block);
51202+extern int create_hook_internal(const coord_t * item, void *arg);
51203+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51204+ pos_in_node_t count, struct carry_kill_data *);
51205+extern int shift_hook_internal(const coord_t * item, unsigned from,
51206+ unsigned count, znode * old_node);
51207+extern void reiser4_print_internal(const char *prefix, coord_t * coord);
51208+
51209+extern int utmost_child_internal(const coord_t * coord, sideof side,
51210+ jnode ** child);
51211+int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51212+ reiser4_block_nr * block);
51213+
51214+extern void reiser4_update_internal(const coord_t * coord,
51215+ const reiser4_block_nr * blocknr);
51216+/* FIXME: reiserfs has check_internal */
51217+extern int check__internal(const coord_t * coord, const char **error);
51218+
51219+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51220+#endif
51221+
51222+/* Make Linus happy.
51223+ Local variables:
51224+ c-indentation-style: "K&R"
51225+ mode-name: "LC"
51226+ c-basic-offset: 8
51227+ tab-width: 8
51228+ fill-column: 120
51229+ End:
51230+*/
51231diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/item.c linux-2.6.22/fs/reiser4/plugin/item/item.c
51232--- linux-2.6.22.orig/fs/reiser4/plugin/item/item.c 1970-01-01 03:00:00.000000000 +0300
51233+++ linux-2.6.22/fs/reiser4/plugin/item/item.c 2007-07-29 00:25:34.972721325 +0400
51234@@ -0,0 +1,719 @@
51235+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51236+
51237+/* definition of item plugins. */
51238+
51239+#include "../../forward.h"
51240+#include "../../debug.h"
51241+#include "../../key.h"
51242+#include "../../coord.h"
51243+#include "../plugin_header.h"
51244+#include "sde.h"
51245+#include "internal.h"
51246+#include "item.h"
51247+#include "static_stat.h"
51248+#include "../plugin.h"
51249+#include "../../znode.h"
51250+#include "../../tree.h"
51251+#include "../../context.h"
51252+#include "ctail.h"
51253+
51254+/* return pointer to item body */
51255+void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
51256+{
51257+ assert("nikita-324", coord != NULL);
51258+ assert("nikita-325", coord->node != NULL);
51259+ assert("nikita-326", znode_is_loaded(coord->node));
51260+ assert("nikita-3200", coord->offset == INVALID_OFFSET);
51261+
51262+ coord->offset =
51263+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
51264+ zdata(coord->node);
51265+ ON_DEBUG(coord->body_v = coord->node->times_locked);
51266+}
51267+
51268+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
51269+{
51270+ return zdata(coord->node) + coord->offset;
51271+}
51272+
51273+#if REISER4_DEBUG
51274+
51275+int item_body_is_valid(const coord_t * coord)
51276+{
51277+ return
51278+ coord->offset ==
51279+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
51280+ zdata(coord->node);
51281+}
51282+
51283+#endif
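These helpers split item-body lookup into a slow and a fast path:
item_body_by_coord_hard() asks the node plugin once and caches the offset
in the coord, after which item_body_by_coord_easy() is plain pointer
arithmetic. A sketch of the calling pattern (item_body_by_coord() in item.h
below is the real inline; the helper name here is hypothetical):

    static void *body_of_sketch(coord_t *coord)
    {
            if (coord->offset == INVALID_OFFSET)
                    item_body_by_coord_hard(coord); /* slow path */
            return item_body_by_coord_easy(coord);  /* fast path */
    }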
51284+
51285+/* return length of item at @coord */
51286+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
51287+{
51288+ int len;
51289+
51290+ assert("nikita-327", coord != NULL);
51291+ assert("nikita-328", coord->node != NULL);
51292+ assert("nikita-329", znode_is_loaded(coord->node));
51293+
51294+ len = node_plugin_by_node(coord->node)->length_by_coord(coord);
51295+ return len;
51296+}
51297+
51298+void obtain_item_plugin(const coord_t * coord)
51299+{
51300+ assert("nikita-330", coord != NULL);
51301+ assert("nikita-331", coord->node != NULL);
51302+ assert("nikita-332", znode_is_loaded(coord->node));
51303+
51304+ coord_set_iplug((coord_t *) coord,
51305+ node_plugin_by_node(coord->node)->
51306+ plugin_by_coord(coord));
51307+ assert("nikita-2479",
51308+ coord_iplug(coord) ==
51309+ node_plugin_by_node(coord->node)->plugin_by_coord(coord));
51310+}
51311+
51312+/* return id of item */
51313+/* Audited by: green(2002.06.15) */
51314+item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
51315+{
51316+ assert("vs-539", coord != NULL);
51317+ assert("vs-538", coord->node != NULL);
51318+ assert("vs-537", znode_is_loaded(coord->node));
51319+ assert("vs-536", item_plugin_by_coord(coord) != NULL);
51320+ assert("vs-540",
51321+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
51322+
51323+ return item_id_by_plugin(item_plugin_by_coord(coord));
51324+}
51325+
51326+/* return key of item at @coord */
51327+/* Audited by: green(2002.06.15) */
51328+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
51329+ reiser4_key * key /* result */ )
51330+{
51331+ assert("nikita-338", coord != NULL);
51332+ assert("nikita-339", coord->node != NULL);
51333+ assert("nikita-340", znode_is_loaded(coord->node));
51334+
51335+ return node_plugin_by_node(coord->node)->key_at(coord, key);
51336+}
51337+
51338+/* this returns max key in the item */
51339+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
51340+ reiser4_key * key /* result */ )
51341+{
51342+ coord_t last;
51343+
51344+ assert("nikita-338", coord != NULL);
51345+ assert("nikita-339", coord->node != NULL);
51346+ assert("nikita-340", znode_is_loaded(coord->node));
51347+
51348+ /* make coord pointing to last item's unit */
51349+ coord_dup(&last, coord);
51350+ last.unit_pos = coord_num_units(&last) - 1;
51351+ assert("vs-1560", coord_is_existing_unit(&last));
51352+
51353+ max_unit_key_by_coord(&last, key);
51354+ return key;
51355+}
51356+
51357+/* return key of unit at @coord */
51358+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51359+ reiser4_key * key /* result */ )
51360+{
51361+ assert("nikita-772", coord != NULL);
51362+ assert("nikita-774", coord->node != NULL);
51363+ assert("nikita-775", znode_is_loaded(coord->node));
51364+
51365+ if (item_plugin_by_coord(coord)->b.unit_key != NULL)
51366+ return item_plugin_by_coord(coord)->b.unit_key(coord, key);
51367+ else
51368+ return item_key_by_coord(coord, key);
51369+}
51370+
51371+/* return the biggest key contained the unit @coord */
51372+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
51373+ reiser4_key * key /* result */ )
51374+{
51375+ assert("nikita-772", coord != NULL);
51376+ assert("nikita-774", coord->node != NULL);
51377+ assert("nikita-775", znode_is_loaded(coord->node));
51378+
51379+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
51380+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
51381+ else
51382+ return unit_key_by_coord(coord, key);
51383+}
51384+
51385+/* ->max_key_inside() method for items consisting of exactly one key (like
51386+ stat-data) */
51387+static reiser4_key *max_key_inside_single_key(const coord_t *
51388+ coord /* coord of item */ ,
51389+ reiser4_key *
51390+ result /* resulting key */ )
51391+{
51392+ assert("nikita-604", coord != NULL);
51393+
51394+ /* coord -> key is starting key of this item and it has to be already
51395+ filled in */
51396+ return unit_key_by_coord(coord, result);
51397+}
51398+
51399+/* ->nr_units() method for items that always consist of exactly one unit */
51400+pos_in_node_t
51401+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
51402+{
51403+ return 1;
51404+}
51405+
51406+static int
51407+paste_no_paste(coord_t * coord UNUSED_ARG,
51408+ reiser4_item_data * data UNUSED_ARG,
51409+ carry_plugin_info * info UNUSED_ARG)
51410+{
51411+ return 0;
51412+}
51413+
51414+/* default ->fast_paste() method */
51415+static int
51416+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
51417+{
51418+ return 1;
51419+}
51420+
51421+int item_can_contain_key(const coord_t * item /* coord of item */ ,
51422+ const reiser4_key * key /* key to check */ ,
51423+ const reiser4_item_data * data /* parameters of item
51424+ * being created */ )
51425+{
51426+ item_plugin *iplug;
51427+ reiser4_key min_key_in_item;
51428+ reiser4_key max_key_in_item;
51429+
51430+ assert("nikita-1658", item != NULL);
51431+ assert("nikita-1659", key != NULL);
51432+
51433+ iplug = item_plugin_by_coord(item);
51434+ if (iplug->b.can_contain_key != NULL)
51435+ return iplug->b.can_contain_key(item, key, data);
51436+ else {
51437+ assert("nikita-1681", iplug->b.max_key_inside != NULL);
51438+ item_key_by_coord(item, &min_key_in_item);
51439+ iplug->b.max_key_inside(item, &max_key_in_item);
51440+
51441+ /* can contain key if
51442+ min_key_in_item <= key &&
51443+ key <= max_key_in_item
51444+ */
51445+ return keyle(&min_key_in_item, key)
51446+ && keyle(key, &max_key_in_item);
51447+ }
51448+}
51449+
51450+/* mergeable method for non-mergeable items */
51451+static int
51452+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
51453+{
51454+ return 0;
51455+}
51456+
51457+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
51458+int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
51459+ const coord_t * i2 /* coord of second item */ )
51460+{
51461+ item_plugin *iplug;
51462+ reiser4_key k1;
51463+ reiser4_key k2;
51464+
51465+ assert("nikita-1336", i1 != NULL);
51466+ assert("nikita-1337", i2 != NULL);
51467+
51468+ iplug = item_plugin_by_coord(i1);
51469+ assert("nikita-1338", iplug != NULL);
51470+
51471+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
51472+ shifting code when nodes are in "suspended" state. */
51473+ assert("nikita-1663",
51474+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
51475+
51476+ if (iplug->b.mergeable != NULL) {
51477+ return iplug->b.mergeable(i1, i2);
51478+ } else if (iplug->b.max_key_inside != NULL) {
51479+ iplug->b.max_key_inside(i1, &k1);
51480+ item_key_by_coord(i2, &k2);
51481+
51482+ /* mergeable if ->max_key_inside() >= key of i2; */
51483+ return keyge(iplug->b.max_key_inside(i1, &k1),
51484+ item_key_by_coord(i2, &k2));
51485+ } else {
51486+ item_key_by_coord(i1, &k1);
51487+ item_key_by_coord(i2, &k2);
51488+
51489+ return
51490+ (get_key_locality(&k1) == get_key_locality(&k2)) &&
51491+ (get_key_objectid(&k1) == get_key_objectid(&k2))
51492+ && (iplug == item_plugin_by_coord(i2));
51493+ }
51494+}
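When a plugin supplies neither ->mergeable() nor ->max_key_inside(), the
final branch above allows merging only between items of the same plugin
that belong to one object. A compact restatement of that fallback as a
standalone sketch:

    static int default_mergeable_sketch(const coord_t *i1, const coord_t *i2)
    {
            reiser4_key k1, k2;

            item_key_by_coord(i1, &k1);
            item_key_by_coord(i2, &k2);
            /* same locality, same object id, same item plugin */
            return get_key_locality(&k1) == get_key_locality(&k2) &&
                   get_key_objectid(&k1) == get_key_objectid(&k2) &&
                   item_plugin_by_coord(i1) == item_plugin_by_coord(i2);
    }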
51495+
51496+int item_is_extent(const coord_t * item)
51497+{
51498+ assert("vs-482", coord_is_existing_item(item));
51499+ return item_id_by_coord(item) == EXTENT_POINTER_ID;
51500+}
51501+
51502+int item_is_tail(const coord_t * item)
51503+{
51504+ assert("vs-482", coord_is_existing_item(item));
51505+ return item_id_by_coord(item) == FORMATTING_ID;
51506+}
51507+
51508+#if REISER4_DEBUG
51509+
51510+int item_is_statdata(const coord_t * item)
51511+{
51512+ assert("vs-516", coord_is_existing_item(item));
51513+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
51514+}
51515+
51516+int item_is_ctail(const coord_t * item)
51517+{
51518+ assert("edward-xx", coord_is_existing_item(item));
51519+ return item_id_by_coord(item) == CTAIL_ID;
51520+}
51521+
51522+#endif /* REISER4_DEBUG */
51523+
51524+static int change_item(struct inode *inode,
51525+ reiser4_plugin * plugin,
51526+ pset_member memb)
51527+{
51528+ /* cannot change constituent item (sd, or dir_item) */
51529+ return RETERR(-EINVAL);
51530+}
51531+
51532+static reiser4_plugin_ops item_plugin_ops = {
51533+ .init = NULL,
51534+ .load = NULL,
51535+ .save_len = NULL,
51536+ .save = NULL,
51537+ .change = change_item
51538+};
51539+
51540+item_plugin item_plugins[LAST_ITEM_ID] = {
51541+ [STATIC_STAT_DATA_ID] = {
51542+ .h = {
51543+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51544+ .id = STATIC_STAT_DATA_ID,
51545+ .groups = (1 << STAT_DATA_ITEM_TYPE),
51546+ .pops = &item_plugin_ops,
51547+ .label = "sd",
51548+ .desc = "stat-data",
51549+ .linkage = {NULL, NULL}
51550+ },
51551+ .b = {
51552+ .max_key_inside = max_key_inside_single_key,
51553+ .can_contain_key = NULL,
51554+ .mergeable = not_mergeable,
51555+ .nr_units = nr_units_single_unit,
51556+ .lookup = NULL,
51557+ .init = NULL,
51558+ .paste = paste_no_paste,
51559+ .fast_paste = NULL,
51560+ .can_shift = NULL,
51561+ .copy_units = NULL,
51562+ .create_hook = NULL,
51563+ .kill_hook = NULL,
51564+ .shift_hook = NULL,
51565+ .cut_units = NULL,
51566+ .kill_units = NULL,
51567+ .unit_key = NULL,
51568+ .max_unit_key = NULL,
51569+ .estimate = NULL,
51570+ .item_data_by_flow = NULL,
51571+#if REISER4_DEBUG
51572+ .check = NULL
51573+#endif
51574+ },
51575+ .f = {
51576+ .utmost_child = NULL,
51577+ .utmost_child_real_block = NULL,
51578+ .update = NULL,
51579+ .scan = NULL,
51580+ .convert = NULL
51581+ },
51582+ .s = {
51583+ .sd = {
51584+ .init_inode = init_inode_static_sd,
51585+ .save_len = save_len_static_sd,
51586+ .save = save_static_sd
51587+ }
51588+ }
51589+ },
51590+ [SIMPLE_DIR_ENTRY_ID] = {
51591+ .h = {
51592+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51593+ .id = SIMPLE_DIR_ENTRY_ID,
51594+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51595+ .pops = &item_plugin_ops,
51596+ .label = "de",
51597+ .desc = "directory entry",
51598+ .linkage = {NULL, NULL}
51599+ },
51600+ .b = {
51601+ .max_key_inside = max_key_inside_single_key,
51602+ .can_contain_key = NULL,
51603+ .mergeable = NULL,
51604+ .nr_units = nr_units_single_unit,
51605+ .lookup = NULL,
51606+ .init = NULL,
51607+ .paste = NULL,
51608+ .fast_paste = NULL,
51609+ .can_shift = NULL,
51610+ .copy_units = NULL,
51611+ .create_hook = NULL,
51612+ .kill_hook = NULL,
51613+ .shift_hook = NULL,
51614+ .cut_units = NULL,
51615+ .kill_units = NULL,
51616+ .unit_key = NULL,
51617+ .max_unit_key = NULL,
51618+ .estimate = NULL,
51619+ .item_data_by_flow = NULL,
51620+#if REISER4_DEBUG
51621+ .check = NULL
51622+#endif
51623+ },
51624+ .f = {
51625+ .utmost_child = NULL,
51626+ .utmost_child_real_block = NULL,
51627+ .update = NULL,
51628+ .scan = NULL,
51629+ .convert = NULL
51630+ },
51631+ .s = {
51632+ .dir = {
51633+ .extract_key = extract_key_de,
51634+ .update_key = update_key_de,
51635+ .extract_name = extract_name_de,
51636+ .extract_file_type = extract_file_type_de,
51637+ .add_entry = add_entry_de,
51638+ .rem_entry = rem_entry_de,
51639+ .max_name_len = max_name_len_de
51640+ }
51641+ }
51642+ },
51643+ [COMPOUND_DIR_ID] = {
51644+ .h = {
51645+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51646+ .id = COMPOUND_DIR_ID,
51647+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
51648+ .pops = &item_plugin_ops,
51649+ .label = "cde",
51650+ .desc = "compressed directory entry",
51651+ .linkage = {NULL, NULL}
51652+ },
51653+ .b = {
51654+ .max_key_inside = max_key_inside_cde,
51655+ .can_contain_key = can_contain_key_cde,
51656+ .mergeable = mergeable_cde,
51657+ .nr_units = nr_units_cde,
51658+ .lookup = lookup_cde,
51659+ .init = init_cde,
51660+ .paste = paste_cde,
51661+ .fast_paste = agree_to_fast_op,
51662+ .can_shift = can_shift_cde,
51663+ .copy_units = copy_units_cde,
51664+ .create_hook = NULL,
51665+ .kill_hook = NULL,
51666+ .shift_hook = NULL,
51667+ .cut_units = cut_units_cde,
51668+ .kill_units = kill_units_cde,
51669+ .unit_key = unit_key_cde,
51670+ .max_unit_key = unit_key_cde,
51671+ .estimate = estimate_cde,
51672+ .item_data_by_flow = NULL,
51673+#if REISER4_DEBUG
51674+ .check = reiser4_check_cde
51675+#endif
51676+ },
51677+ .f = {
51678+ .utmost_child = NULL,
51679+ .utmost_child_real_block = NULL,
51680+ .update = NULL,
51681+ .scan = NULL,
51682+ .convert = NULL
51683+ },
51684+ .s = {
51685+ .dir = {
51686+ .extract_key = extract_key_cde,
51687+ .update_key = update_key_cde,
51688+ .extract_name = extract_name_cde,
51689+ .extract_file_type = extract_file_type_de,
51690+ .add_entry = add_entry_cde,
51691+ .rem_entry = rem_entry_cde,
51692+ .max_name_len = max_name_len_cde
51693+ }
51694+ }
51695+ },
51696+ [NODE_POINTER_ID] = {
51697+ .h = {
51698+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51699+ .id = NODE_POINTER_ID,
51700+ .groups = (1 << INTERNAL_ITEM_TYPE),
51701+ .pops = NULL,
51702+ .label = "internal",
51703+ .desc = "internal item",
51704+ .linkage = {NULL, NULL}
51705+ },
51706+ .b = {
51707+ .max_key_inside = NULL,
51708+ .can_contain_key = NULL,
51709+ .mergeable = mergeable_internal,
51710+ .nr_units = nr_units_single_unit,
51711+ .lookup = lookup_internal,
51712+ .init = NULL,
51713+ .paste = NULL,
51714+ .fast_paste = NULL,
51715+ .can_shift = NULL,
51716+ .copy_units = NULL,
51717+ .create_hook = create_hook_internal,
51718+ .kill_hook = kill_hook_internal,
51719+ .shift_hook = shift_hook_internal,
51720+ .cut_units = NULL,
51721+ .kill_units = NULL,
51722+ .unit_key = NULL,
51723+ .max_unit_key = NULL,
51724+ .estimate = NULL,
51725+ .item_data_by_flow = NULL,
51726+#if REISER4_DEBUG
51727+ .check = check__internal
51728+#endif
51729+ },
51730+ .f = {
51731+ .utmost_child = utmost_child_internal,
51732+ .utmost_child_real_block =
51733+ utmost_child_real_block_internal,
51734+ .update = reiser4_update_internal,
51735+ .scan = NULL,
51736+ .convert = NULL
51737+ },
51738+ .s = {
51739+ .internal = {
51740+ .down_link = down_link_internal,
51741+ .has_pointer_to = has_pointer_to_internal
51742+ }
51743+ }
51744+ },
51745+ [EXTENT_POINTER_ID] = {
51746+ .h = {
51747+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51748+ .id = EXTENT_POINTER_ID,
51749+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51750+ .pops = NULL,
51751+ .label = "extent",
51752+ .desc = "extent item",
51753+ .linkage = {NULL, NULL}
51754+ },
51755+ .b = {
51756+ .max_key_inside = max_key_inside_extent,
51757+ .can_contain_key = can_contain_key_extent,
51758+ .mergeable = mergeable_extent,
51759+ .nr_units = nr_units_extent,
51760+ .lookup = lookup_extent,
51761+ .init = NULL,
51762+ .paste = paste_extent,
51763+ .fast_paste = agree_to_fast_op,
51764+ .can_shift = can_shift_extent,
51765+ .create_hook = create_hook_extent,
51766+ .copy_units = copy_units_extent,
51767+ .kill_hook = kill_hook_extent,
51768+ .shift_hook = NULL,
51769+ .cut_units = cut_units_extent,
51770+ .kill_units = kill_units_extent,
51771+ .unit_key = unit_key_extent,
51772+ .max_unit_key = max_unit_key_extent,
51773+ .estimate = NULL,
51774+ .item_data_by_flow = NULL,
51775+#if REISER4_DEBUG
51776+ .check = reiser4_check_extent
51777+#endif
51778+ },
51779+ .f = {
51780+ .utmost_child = utmost_child_extent,
51781+ .utmost_child_real_block =
51782+ utmost_child_real_block_extent,
51783+ .update = NULL,
51784+ .scan = reiser4_scan_extent,
51785+ .convert = NULL,
51786+ .key_by_offset = key_by_offset_extent
51787+ },
51788+ .s = {
51789+ .file = {
51790+ .write = reiser4_write_extent,
51791+ .read = reiser4_read_extent,
51792+ .readpage = reiser4_readpage_extent,
51793+ .get_block = get_block_address_extent,
51794+ .append_key = append_key_extent,
51795+ .init_coord_extension =
51796+ init_coord_extension_extent
51797+ }
51798+ }
51799+ },
51800+ [FORMATTING_ID] = {
51801+ .h = {
51802+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51803+ .id = FORMATTING_ID,
51804+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51805+ .pops = NULL,
51806+ .label = "body",
51807+ .desc = "body (or tail?) item",
51808+ .linkage = {NULL, NULL}
51809+ },
51810+ .b = {
51811+ .max_key_inside = max_key_inside_tail,
51812+ .can_contain_key = can_contain_key_tail,
51813+ .mergeable = mergeable_tail,
51814+ .nr_units = nr_units_tail,
51815+ .lookup = lookup_tail,
51816+ .init = NULL,
51817+ .paste = paste_tail,
51818+ .fast_paste = agree_to_fast_op,
51819+ .can_shift = can_shift_tail,
51820+ .create_hook = NULL,
51821+ .copy_units = copy_units_tail,
51822+ .kill_hook = kill_hook_tail,
51823+ .shift_hook = NULL,
51824+ .cut_units = cut_units_tail,
51825+ .kill_units = kill_units_tail,
51826+ .unit_key = unit_key_tail,
51827+ .max_unit_key = unit_key_tail,
51828+ .estimate = NULL,
51829+ .item_data_by_flow = NULL,
51830+#if REISER4_DEBUG
51831+ .check = NULL
51832+#endif
51833+ },
51834+ .f = {
51835+ .utmost_child = NULL,
51836+ .utmost_child_real_block = NULL,
51837+ .update = NULL,
51838+ .scan = NULL,
51839+ .convert = NULL
51840+ },
51841+ .s = {
51842+ .file = {
51843+ .write = reiser4_write_tail,
51844+ .read = reiser4_read_tail,
51845+ .readpage = readpage_tail,
51846+ .get_block = get_block_address_tail,
51847+ .append_key = append_key_tail,
51848+ .init_coord_extension =
51849+ init_coord_extension_tail
51850+ }
51851+ }
51852+ },
51853+ [CTAIL_ID] = {
51854+ .h = {
51855+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51856+ .id = CTAIL_ID,
51857+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51858+ .pops = NULL,
51859+ .label = "ctail",
51860+ .desc = "cryptcompress tail item",
51861+ .linkage = {NULL, NULL}
51862+ },
51863+ .b = {
51864+ .max_key_inside = max_key_inside_tail,
51865+ .can_contain_key = can_contain_key_ctail,
51866+ .mergeable = mergeable_ctail,
51867+ .nr_units = nr_units_ctail,
51868+ .lookup = NULL,
51869+ .init = init_ctail,
51870+ .paste = paste_ctail,
51871+ .fast_paste = agree_to_fast_op,
51872+ .can_shift = can_shift_ctail,
51873+ .create_hook = create_hook_ctail,
51874+ .copy_units = copy_units_ctail,
51875+ .kill_hook = kill_hook_ctail,
51876+ .shift_hook = shift_hook_ctail,
51877+ .cut_units = cut_units_ctail,
51878+ .kill_units = kill_units_ctail,
51879+ .unit_key = unit_key_tail,
51880+ .max_unit_key = unit_key_tail,
51881+ .estimate = estimate_ctail,
51882+ .item_data_by_flow = NULL,
51883+#if REISER4_DEBUG
51884+ .check = check_ctail
51885+#endif
51886+ },
51887+ .f = {
51888+ .utmost_child = utmost_child_ctail,
51889+ /* FIXME-EDWARD: write this */
51890+ .utmost_child_real_block = NULL,
51891+ .update = NULL,
51892+ .scan = scan_ctail,
51893+ .convert = convert_ctail
51894+ },
51895+ .s = {
51896+ .file = {
51897+ .write = NULL,
51898+ .read = read_ctail,
51899+ .readpage = readpage_ctail,
51900+ .get_block = get_block_address_tail,
51901+ .append_key = append_key_ctail,
51902+ .init_coord_extension =
51903+ init_coord_extension_tail
51904+ }
51905+ }
51906+ },
51907+ [BLACK_BOX_ID] = {
51908+ .h = {
51909+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
51910+ .id = BLACK_BOX_ID,
51911+ .groups = (1 << OTHER_ITEM_TYPE),
51912+ .pops = NULL,
51913+ .label = "blackbox",
51914+ .desc = "black box item",
51915+ .linkage = {NULL, NULL}
51916+ },
51917+ .b = {
51918+ .max_key_inside = NULL,
51919+ .can_contain_key = NULL,
51920+ .mergeable = not_mergeable,
51921+ .nr_units = nr_units_single_unit,
51922+ /* no need for a ->lookup method */
51923+ .lookup = NULL,
51924+ .init = NULL,
51925+ .paste = NULL,
51926+ .fast_paste = NULL,
51927+ .can_shift = NULL,
51928+ .copy_units = NULL,
51929+ .create_hook = NULL,
51930+ .kill_hook = NULL,
51931+ .shift_hook = NULL,
51932+ .cut_units = NULL,
51933+ .kill_units = NULL,
51934+ .unit_key = NULL,
51935+ .max_unit_key = NULL,
51936+ .estimate = NULL,
51937+ .item_data_by_flow = NULL,
51938+#if REISER4_DEBUG
51939+ .check = NULL
51940+#endif
51941+ }
51942+ }
51943+};
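The array above is the single dispatch table for every item type; callers
obtain a plugin via item_plugin_by_coord() (or item_plugin_by_id()) and
invoke methods through it, NULL-checking optional slots as the generic
helpers earlier in this file do. A minimal usage sketch with a hypothetical
helper name:

    static pos_in_node_t nr_units_sketch(const coord_t *coord)
    {
            /* ->nr_units is mandatory; solid items share
             * nr_units_single_unit() so is_solid_item() can compare
             * method pointers */
            return item_plugin_by_coord(coord)->b.nr_units(coord);
    }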
51944+
51945+/* Make Linus happy.
51946+ Local variables:
51947+ c-indentation-style: "K&R"
51948+ mode-name: "LC"
51949+ c-basic-offset: 8
51950+ tab-width: 8
51951+ fill-column: 120
51952+ End:
51953+*/
51954diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/item.h linux-2.6.22/fs/reiser4/plugin/item/item.h
51955--- linux-2.6.22.orig/fs/reiser4/plugin/item/item.h 1970-01-01 03:00:00.000000000 +0300
51956+++ linux-2.6.22/fs/reiser4/plugin/item/item.h 2007-07-29 00:25:34.972721325 +0400
51957@@ -0,0 +1,397 @@
51958+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51959+
51960+/* first read balance.c comments before reading this */
51961+
51962+/* An item_plugin implements all of the operations required for
51963+ balancing that are item specific. */
51964+
51965+/* an item plugin also implements other operations that are specific to that
51966+ item. These go into the item specific operations portion of the item
51967+ handler, and all of the item specific portions of the item handler are put
51968+ into a union. */
51969+
51970+#if !defined( __REISER4_ITEM_H__ )
51971+#define __REISER4_ITEM_H__
51972+
51973+#include "../../forward.h"
51974+#include "../plugin_header.h"
51975+#include "../../dformat.h"
51976+#include "../../seal.h"
51977+#include "../../plugin/file/file.h"
51978+
51979+#include <linux/fs.h> /* for struct file, struct inode */
51980+#include <linux/mm.h> /* for struct page */
51981+#include <linux/dcache.h> /* for struct dentry */
51982+
51983+typedef enum {
51984+ STAT_DATA_ITEM_TYPE,
51985+ DIR_ENTRY_ITEM_TYPE,
51986+ INTERNAL_ITEM_TYPE,
51987+ UNIX_FILE_METADATA_ITEM_TYPE,
51988+ OTHER_ITEM_TYPE
51989+} item_type_id;
51990+
51991+/* this is the part of each item plugin that all items are expected to
51992+ support or at least explicitly fail to support by setting the
51993+ pointer to null. */
51994+struct balance_ops {
51995+ /* operations called by balancing
51996+
51997+ It is interesting to consider that some of these item
51998+ operations could be given sources or targets that are not
51999+ really items in nodes. This could be ok/useful.
52000+
52001+ */
52002+ /* maximal key that can _possibly_ be occupied by this item
52003+
52004+ When inserting, and node ->lookup() method (called by
52005+ coord_by_key()) reaches an item after binary search,
52006+ the ->max_key_inside() item plugin method is used to determine
52007+ whether new item should pasted into existing item
52008+ (new_key<=max_key_inside()) or new item has to be created
52009+ (new_key>max_key_inside()).
52010+
52011+ For items that occupy exactly one key (like stat-data)
52012+ this method should return this key. For items that can
52013+ grow indefinitely (extent, directory item) this should
52014+ return reiser4_max_key().
52015+
52016+ For example extent with the key
52017+
52018+ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52019+
52020+ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and
52021+ */
52022+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52023+
52024+ /* true if item @coord can merge data at @key. */
52025+ int (*can_contain_key) (const coord_t *, const reiser4_key *,
52026+ const reiser4_item_data *);
52027+ /* mergeable() - check items for mergeability
52028+
52029+ Optional method. Returns true if two items can be merged.
52030+
52031+ */
52032+ int (*mergeable) (const coord_t *, const coord_t *);
52033+
52034+ /* number of atomic things in an item.
52035+ NOTE FOR CONTRIBUTORS: use a generic method
52036+ nr_units_single_unit() for solid (atomic) items, as
52037+ tree operations use it as a criterion of solidness
52038+ (see is_solid_item macro) */
52039+ pos_in_node_t(*nr_units) (const coord_t *);
52040+
52041+ /* search within the item for a unit, and return a
52042+ pointer to it. This can be used to calculate how many
52043+ bytes to shrink an item by, if you use pointer arithmetic
52044+ against the start of the item body - provided the item's
52045+ data are contiguous in the node; if the item's data are not
52046+ contiguous in the node, all sorts of other things are likely
52047+ to break as well. */
52048+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52049+ /* method called by node_plugin->create_item() to initialise a new
52050+ item */
52051+ int (*init) (coord_t * target, coord_t * from,
52052+ reiser4_item_data * data);
52053+ /* method called (e.g., by reiser4_resize_item()) to place new data
52054+ into item when it grows */
52055+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52056+ /* return true if paste into @coord is allowed to skip
52057+ carry. That is, if such paste would require any changes
52058+ at the parent level
52059+ */
52060+ int (*fast_paste) (const coord_t *);
52061+ /* how many but not more than @want units of @source can be
52062+ shifted into @target node. If pend == append - we try to
52063+ append last item of @target by first units of @source. If
52064+ pend == prepend - we try to "prepend" first item in @target
52065+ by last units of @source. @target node has @free_space
52066+ bytes of free space. Total size of those units are returned
52067+ via @size.
52068+
52069+ @target is not NULL if shifting into a mergeable item, and
52070+ NULL if a new item will be created during shifting.
52071+ */
52072+ int (*can_shift) (unsigned free_space, coord_t *,
52073+ znode *, shift_direction, unsigned *size,
52074+ unsigned want);
52075+
52076+ /* starting off @from-th unit of item @source append or
52077+ prepend @count units to @target. @target has been already
52078+ expanded by @free_space bytes. That must be exactly what is
52079+ needed for those items in @target. If @where_is_free_space
52080+ == SHIFT_LEFT - free space is at the end of @target item,
52081+ otherwise - it is at the beginning of it. */
52082+ void (*copy_units) (coord_t *, coord_t *,
52083+ unsigned from, unsigned count,
52084+ shift_direction where_is_free_space,
52085+ unsigned free_space);
52086+
52087+ int (*create_hook) (const coord_t *, void *);
52088+ /* do whatever is necessary to do when @count units starting
52089+ from @from-th one are removed from the tree */
52090+ /* FIXME-VS: this is used to be here for, in particular,
52091+ extents and items of internal type to free blocks they point
52092+ to at the same time with removing items from a
52093+ tree. Problems start, however, when dealloc_block fails due
52094+ to some reason. Item gets removed, but blocks it pointed to
52095+ are not freed. It is not clear how to fix this for items of
52096+ internal type because a need to remove internal item may
52097+ appear in the middle of balancing, and there is no way to
52098+ undo changes made. OTOH, if space allocator involves
52099+ balancing to perform dealloc_block - this will probably
52100+ break balancing due to deadlock issues
52101+ */
52102+ int (*kill_hook) (const coord_t *, pos_in_node_t from,
52103+ pos_in_node_t count, struct carry_kill_data *);
52104+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52105+ znode * _node);
52106+
52107+ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
52108+ including boundaries. When units are cut from item beginning - move space which gets freed to head of
52109+ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
52110+ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
52111+ @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
52112+ */
52113+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52114+ struct carry_cut_data *,
52115+ reiser4_key * smallest_removed,
52116+ reiser4_key * new_first_key);
52117+
52118+ /* like cut_units, except that these units are removed from the
52119+ tree, not only from a node */
52120+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52121+ struct carry_kill_data *,
52122+ reiser4_key * smallest_removed,
52123+ reiser4_key * new_first);
52124+
52125+ /* if @key_of_coord == 1, the key of the coord is returned;
52126+ otherwise the key of the unit is returned. If @coord is not
52127+ set to a certain unit - ERR_PTR(-ENOENT) is returned */
52128+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52129+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52130+ /* estimate how much space is needed for paste @data into item at
52131+ @coord. if @coord==0 - estimate insertion, otherwise - estimate
52132+ pasting
52133+ */
52134+ int (*estimate) (const coord_t *, const reiser4_item_data *);
52135+
52136+ /* converts flow @f to item data. @coord == 0 on insert */
52137+ int (*item_data_by_flow) (const coord_t *, const flow_t *,
52138+ reiser4_item_data *);
52139+
52140+ /*void (*show) (struct seq_file *, coord_t *); */
52141+
52142+#if REISER4_DEBUG
52143+ /* used for debugging, every item should have here the most
52144+ complete possible check of the consistency of the item that
52145+ the inventor can construct */
52146+ int (*check) (const coord_t *, const char **error);
52147+#endif
52148+
52149+};
52150+
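A sketch of the two ->max_key_inside() flavours described in the comment
above: max_key_inside_single_key() in item.c is the real single-key
variant, while the grow-indefinitely variant below is hypothetical and
assumes reiser4's set_key_offset()/get_key_offset() key accessors:

    static reiser4_key *max_key_inside_sketch(const coord_t *coord,
                                              reiser4_key *result)
    {
            item_key_by_coord(coord, result);
            /* an item that can grow indefinitely within one object differs
             * from its first key only in the offset */
            set_key_offset(result, get_key_offset(reiser4_max_key()));
            return result;
    }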
52151+struct flush_ops {
52152+ /* return the right or left child of @coord, only if it is in memory */
52153+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52154+
52155+ /* return whether the right or left child of @coord has a non-fake
52156+ block number. */
52157+ int (*utmost_child_real_block) (const coord_t *, sideof side,
52158+ reiser4_block_nr *);
52159+ /* relocate child at @coord to the @block */
52160+ void (*update) (const coord_t *, const reiser4_block_nr *);
52161+ /* count unformatted nodes per item for leaf relocation policy, etc. */
52162+ int (*scan) (flush_scan * scan);
52163+ /* convert item by flush */
52164+ int (*convert) (flush_pos_t * pos);
52165+ /* backward mapping from jnode offset to a key. */
52166+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52167+};
52168+
52169+/* operations specific to the directory item */
52170+struct dir_entry_iops {
52171+ /* extract stat-data key from directory entry at @coord and place it
52172+ into @key. */
52173+ int (*extract_key) (const coord_t *, reiser4_key * key);
52174+ /* update object key in item. */
52175+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52176+ /* extract name from directory entry at @coord and return it */
52177+ char *(*extract_name) (const coord_t *, char *buf);
52178+ /* extract file type (DT_* stuff) from directory entry at @coord and
52179+ return it */
52180+ unsigned (*extract_file_type) (const coord_t *);
52181+ int (*add_entry) (struct inode * dir,
52182+ coord_t *, lock_handle *,
52183+ const struct dentry * name,
52184+ reiser4_dir_entry_desc * entry);
52185+ int (*rem_entry) (struct inode * dir, const struct qstr * name,
52186+ coord_t *, lock_handle *,
52187+ reiser4_dir_entry_desc * entry);
52188+ int (*max_name_len) (const struct inode * dir);
52189+};
52190+
52191+/* operations specific to the items that regular (unix) file metadata are built of */
52192+struct file_iops {
52193+ int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52194+ int (*read) (struct file *, flow_t *, hint_t *);
52195+ int (*readpage) (void *, struct page *);
52196+ int (*get_block) (const coord_t *, sector_t, sector_t *);
52197+ /*
52198+ * key of first byte which is not addressed by the item @coord is set
52199+ * to.
52200+ * For example, for extent item with the key
52201+ *
52202+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52203+ *
52204+ * ->append_key is
52205+ *
52206+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52207+ */
52208+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52209+
52210+ void (*init_coord_extension) (uf_coord_t *, loff_t);
52211+};
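A sketch of the ->append_key() arithmetic from the comment above, for an
extent-like item: the first unaddressed byte is the item's starting offset
plus its width in blocks times the block size. The width and block-size
parameters below are placeholders; the real append_key_extent() derives the
equivalent values from the item itself:

    static reiser4_key *append_key_sketch(const coord_t *coord,
                                          reiser4_key *key,
                                          __u64 width_in_blocks,
                                          __u64 blksize)
    {
            item_key_by_coord(coord, key);
            set_key_offset(key, get_key_offset(key) +
                           width_in_blocks * blksize);
            return key;
    }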
52212+
52213+/* operations specific to items of stat data type */
52214+struct sd_iops {
52215+ int (*init_inode) (struct inode * inode, char *sd, int len);
52216+ int (*save_len) (struct inode * inode);
52217+ int (*save) (struct inode * inode, char **area);
52218+};
52219+
52220+/* operations specific to internal item */
52221+struct internal_iops {
52222+ /* all a tree traversal wants to know from an internal item is
52223+ where to go next. */
52224+ void (*down_link) (const coord_t * coord,
52225+ const reiser4_key * key, reiser4_block_nr * block);
52226+ /* check that given internal item contains given pointer. */
52227+ int (*has_pointer_to) (const coord_t * coord,
52228+ const reiser4_block_nr * block);
52229+};
52230+
52231+struct item_plugin {
52232+ /* generic fields */
52233+ plugin_header h;
52234+ /* methods common for all item types */
52235+ struct balance_ops b; /* balance operations */
52236+ struct flush_ops f; /* flush operates on items via these methods */
52237+
52238+ /* methods specific to particular type of item */
52239+ union {
52240+ struct dir_entry_iops dir;
52241+ struct file_iops file;
52242+ struct sd_iops sd;
52243+ struct internal_iops internal;
52244+ } s;
52245+};
52246+
52247+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
52248+
52249+static inline item_id item_id_by_plugin(item_plugin * plugin)
52250+{
52251+ return plugin->h.id;
52252+}
52253+
52254+static inline char get_iplugid(item_plugin * iplug)
52255+{
52256+ assert("nikita-2838", iplug != NULL);
52257+ assert("nikita-2839", iplug->h.id < 0xff);
52258+ return (char)item_id_by_plugin(iplug);
52259+}
52260+
52261+extern unsigned long znode_times_locked(const znode * z);
52262+
52263+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
52264+{
52265+ assert("nikita-2837", coord != NULL);
52266+ assert("nikita-2838", iplug != NULL);
52267+ coord->iplugid = get_iplugid(iplug);
52268+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
52269+}
52270+
52271+static inline item_plugin *coord_iplug(const coord_t * coord)
52272+{
52273+ assert("nikita-2833", coord != NULL);
52274+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
52275+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
52276+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
52277+ coord->iplugid);
52278+}
52279+
52280+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
52281+ const reiser4_item_data *);
52282+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
52283+extern int item_is_extent(const coord_t *);
52284+extern int item_is_tail(const coord_t *);
52285+extern int item_is_statdata(const coord_t * item);
52286+extern int item_is_ctail(const coord_t *);
52287+
52288+extern pos_in_node_t item_length_by_coord(const coord_t * coord);
52289+extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
52290+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
52291+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
52292+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
52293+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
52294+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
52295+ reiser4_key * key);
52296+extern void obtain_item_plugin(const coord_t * coord);
52297+
52298+#if defined(REISER4_DEBUG)
52299+extern int znode_is_loaded(const znode * node);
52300+#endif
52301+
52302+/* return plugin of item at @coord */
52303+static inline item_plugin *item_plugin_by_coord(const coord_t *
52304+ coord /* coord to query */ )
52305+{
52306+ assert("nikita-330", coord != NULL);
52307+ assert("nikita-331", coord->node != NULL);
52308+ assert("nikita-332", znode_is_loaded(coord->node));
52309+
52310+ if (unlikely(!coord_is_iplug_set(coord)))
52311+ obtain_item_plugin(coord);
52312+ return coord_iplug(coord);
52313+}
52314+
52315+/* this returns true if item is of internal type */
52316+static inline int item_is_internal(const coord_t * item)
52317+{
52318+ assert("vs-483", coord_is_existing_item(item));
52319+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
52320+}
52321+
52322+extern void item_body_by_coord_hard(coord_t * coord);
52323+extern void *item_body_by_coord_easy(const coord_t * coord);
52324+#if REISER4_DEBUG
52325+extern int item_body_is_valid(const coord_t * coord);
52326+#endif
52327+
52328+/* return pointer to item body */
52329+static inline void *item_body_by_coord(const coord_t *
52330+ coord /* coord to query */ )
52331+{
52332+ assert("nikita-324", coord != NULL);
52333+ assert("nikita-325", coord->node != NULL);
52334+ assert("nikita-326", znode_is_loaded(coord->node));
52335+
52336+ if (coord->offset == INVALID_OFFSET)
52337+ item_body_by_coord_hard((coord_t *) coord);
52338+ assert("nikita-3201", item_body_is_valid(coord));
52339+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
52340+ return item_body_by_coord_easy(coord);
52341+}
52342+
52343+/* __REISER4_ITEM_H__ */
52344+#endif
52345+/* Make Linus happy.
52346+ Local variables:
52347+ c-indentation-style: "K&R"
52348+ mode-name: "LC"
52349+ c-basic-offset: 8
52350+ tab-width: 8
52351+ fill-column: 120
52352+ scroll-step: 1
52353+ End:
52354+*/
52355diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/Makefile linux-2.6.22/fs/reiser4/plugin/item/Makefile
52356--- linux-2.6.22.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 03:00:00.000000000 +0300
52357+++ linux-2.6.22/fs/reiser4/plugin/item/Makefile 2007-07-29 00:25:34.972721325 +0400
52358@@ -0,0 +1,18 @@
52359+obj-$(CONFIG_REISER4_FS) += item_plugins.o
52360+
52361+item_plugins-objs := \
52362+ item.o \
52363+ static_stat.o \
52364+ sde.o \
52365+ cde.o \
52366+ blackbox.o \
52367+ internal.o \
52368+ tail.o \
52369+ ctail.o \
52370+ extent.o \
52371+ extent_item_ops.o \
52372+ extent_file_ops.o \
52373+ extent_flush_ops.o
52374+
52375+
52376+
52377diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/sde.c linux-2.6.22/fs/reiser4/plugin/item/sde.c
52378--- linux-2.6.22.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 03:00:00.000000000 +0300
52379+++ linux-2.6.22/fs/reiser4/plugin/item/sde.c 2007-07-29 00:25:34.972721325 +0400
52380@@ -0,0 +1,190 @@
52381+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52382+
52383+/* Directory entry implementation */
52384+#include "../../forward.h"
52385+#include "../../debug.h"
52386+#include "../../dformat.h"
52387+#include "../../kassign.h"
52388+#include "../../coord.h"
52389+#include "sde.h"
52390+#include "item.h"
52391+#include "../plugin.h"
52392+#include "../../znode.h"
52393+#include "../../carry.h"
52394+#include "../../tree.h"
52395+#include "../../inode.h"
52396+
52397+#include <linux/fs.h> /* for struct inode */
52398+#include <linux/dcache.h> /* for struct dentry */
52399+#include <linux/quotaops.h>
52400+
52401+/* ->extract_key() method of simple directory item plugin. */
52402+int extract_key_de(const coord_t * coord /* coord of item */ ,
52403+ reiser4_key * key /* resulting key */ )
52404+{
52405+ directory_entry_format *dent;
52406+
52407+ assert("nikita-1458", coord != NULL);
52408+ assert("nikita-1459", key != NULL);
52409+
52410+ dent = (directory_entry_format *) item_body_by_coord(coord);
52411+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
52412+ return extract_key_from_id(&dent->id, key);
52413+}
52414+
52415+int
52416+update_key_de(const coord_t * coord, const reiser4_key * key,
52417+ lock_handle * lh UNUSED_ARG)
52418+{
52419+ directory_entry_format *dent;
52420+ obj_key_id obj_id;
52421+ int result;
52422+
52423+ assert("nikita-2342", coord != NULL);
52424+ assert("nikita-2343", key != NULL);
52425+
52426+ dent = (directory_entry_format *) item_body_by_coord(coord);
52427+ result = build_obj_key_id(key, &obj_id);
52428+ if (result == 0) {
52429+ dent->id = obj_id;
52430+ znode_make_dirty(coord->node);
52431+ }
52432+ return 0;
52433+}
52434+
52435+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
52436+ char *buf)
52437+{
52438+ reiser4_key key;
52439+
52440+ unit_key_by_coord(coord, &key);
52441+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
52442+ reiser4_print_address("oops", znode_get_block(coord->node));
52443+ if (!is_longname_key(&key)) {
52444+ if (is_dot_key(&key))
52445+ return (char *)".";
52446+ else
52447+ return extract_name_from_key(&key, buf);
52448+ } else
52449+ return (char *)dent->name;
52450+}
52451+
52452+/* ->extract_name() method of simple directory item plugin. */
52453+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
52454+{
52455+ directory_entry_format *dent;
52456+
52457+ assert("nikita-1460", coord != NULL);
52458+
52459+ dent = (directory_entry_format *) item_body_by_coord(coord);
52460+ return extract_dent_name(coord, dent, buf);
52461+}
52462+
52463+/* ->extract_file_type() method of simple directory item plugin. */
52464+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
52465+ * item */ )
52466+{
52467+ assert("nikita-1764", coord != NULL);
52468+ /* we don't store file type in the directory entry yet.
52469+
52470+ But see comments at kassign.h:obj_key_id
52471+ */
52472+ return DT_UNKNOWN;
52473+}
52474+
52475+int add_entry_de(struct inode *dir /* directory of item */ ,
52476+ coord_t * coord /* coord of item */ ,
52477+ lock_handle * lh /* insertion lock handle */ ,
52478+ const struct dentry *de /* name to add */ ,
52479+ reiser4_dir_entry_desc * entry /* parameters of new directory
52480+ * entry */ )
52481+{
52482+ reiser4_item_data data;
52483+ directory_entry_format *dent;
52484+ int result;
52485+ const char *name;
52486+ int len;
52487+ int longname;
52488+
52489+ name = de->d_name.name;
52490+ len = de->d_name.len;
52491+ assert("nikita-1163", strlen(name) == len);
52492+
52493+ longname = is_longname(name, len);
52494+
52495+ data.length = sizeof *dent;
52496+ if (longname)
52497+ data.length += len + 1;
52498+ data.data = NULL;
52499+ data.user = 0;
52500+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
52501+
52502+ /* NOTE-NIKITA quota plugin */
52503+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
52504+ return -EDQUOT;
52505+
52506+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
52507+ if (result != 0)
52508+ return result;
52509+
52510+ dent = (directory_entry_format *) item_body_by_coord(coord);
52511+ build_inode_key_id(entry->obj, &dent->id);
52512+ if (longname) {
52513+ memcpy(dent->name, name, len);
52514+ put_unaligned(0, &dent->name[len]);
52515+ }
52516+ return 0;
52517+}
52518+
52519+int rem_entry_de(struct inode *dir /* directory of item */ ,
52520+ const struct qstr *name UNUSED_ARG,
52521+ coord_t * coord /* coord of item */ ,
52522+ lock_handle * lh UNUSED_ARG /* lock handle for
52523+ * removal */ ,
52524+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
52525+ * directory entry
52526+ * being removed */ )
52527+{
52528+ coord_t shadow;
52529+ int result;
52530+ int length;
52531+
52532+ length = item_length_by_coord(coord);
52533+ if (inode_get_bytes(dir) < length) {
52534+ warning("nikita-2627", "Dir is broke: %llu: %llu",
52535+ (unsigned long long)get_inode_oid(dir),
52536+ inode_get_bytes(dir));
52537+
52538+ return RETERR(-EIO);
52539+ }
52540+
52541+ /* cut_node() is supposed to take pointers to _different_
52542+ coords, because it will modify them without respect to
52543+ possible aliasing. To work around this, create temporary copy
52544+ of @coord.
52545+ */
52546+ coord_dup(&shadow, coord);
52547+ result =
52548+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
52549+ if (result == 0) {
52550+ /* NOTE-NIKITA quota plugin */
52551+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
52552+ }
52553+ return result;
52554+}
52555+
52556+int max_name_len_de(const struct inode *dir)
52557+{
52558+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
52559+ sizeof(directory_entry_format) - 2;
52560+}
52561+
52562+/* Make Linus happy.
52563+ Local variables:
52564+ c-indentation-style: "K&R"
52565+ mode-name: "LC"
52566+ c-basic-offset: 8
52567+ tab-width: 8
52568+ fill-column: 120
52569+ End:
52570+*/
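
Editorial note: sde.c above leans on reiser4's key assignment: a name short
enough to be packed into the entry's key is not stored in the item body at all
(extract_dent_name() rebuilds it from the key), while longer names live in the
body as a NUL-terminated string after the key id. A rough sketch of that
lookup decision, with hypothetical types (toy_entry is not the reiser4
directory_entry_format):

#include <stddef.h>

enum { TOY_KEY_ROOM = 15 };     /* assumed per-key name capacity */

struct toy_entry {
        char key_name[TOY_KEY_ROOM + 1]; /* short name rebuilt from the key */
        const char *body_name;           /* long name in the body, or NULL */
};

static const char *toy_extract_name(const struct toy_entry *e)
{
        /* mirrors extract_dent_name(): use the body-resident name only
           when the name was too long to be encoded in the key */
        return e->body_name != NULL ? e->body_name : e->key_name;
}

Short names therefore cost no body bytes beyond the key id, which is the
space optimization the simple directory entry format is built around.
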
52571diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/sde.h linux-2.6.22/fs/reiser4/plugin/item/sde.h
52572--- linux-2.6.22.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 03:00:00.000000000 +0300
52573+++ linux-2.6.22/fs/reiser4/plugin/item/sde.h 2007-07-29 00:25:34.976722360 +0400
52574@@ -0,0 +1,66 @@
52575+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52576+
52577+/* Directory entry. */
52578+
52579+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
52580+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
52581+
52582+#include "../../forward.h"
52583+#include "../../dformat.h"
52584+#include "../../kassign.h"
52585+#include "../../key.h"
52586+
52587+#include <linux/fs.h>
52588+#include <linux/dcache.h> /* for struct dentry */
52589+
52590+typedef struct directory_entry_format {
52591+ /* key of object stat-data. It's not necessary to store whole
52592+ key here, because it's always key of stat-data, so minor
52593+ packing locality and offset can be omitted here. But this
52594+ relies on particular key allocation scheme for stat-data, so,
52595+ for extensibility sake, whole key can be stored here.
52596+
52597+ We store key as array of bytes, because we don't want 8-byte
52598+ alignment of dir entries.
52599+ */
52600+ obj_key_id id;
52601+ /* file name. Null terminated string. */
52602+ d8 name[0];
52603+} directory_entry_format;
52604+
52605+void print_de(const char *prefix, coord_t * coord);
52606+int extract_key_de(const coord_t * coord, reiser4_key * key);
52607+int update_key_de(const coord_t * coord, const reiser4_key * key,
52608+ lock_handle * lh);
52609+char *extract_name_de(const coord_t * coord, char *buf);
52610+unsigned extract_file_type_de(const coord_t * coord);
52611+int add_entry_de(struct inode *dir, coord_t * coord,
52612+ lock_handle * lh, const struct dentry *name,
52613+ reiser4_dir_entry_desc * entry);
52614+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
52615+ lock_handle * lh, reiser4_dir_entry_desc * entry);
52616+int max_name_len_de(const struct inode *dir);
52617+
52618+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
52619+
52620+char *extract_dent_name(const coord_t * coord,
52621+ directory_entry_format * dent, char *buf);
52622+
52623+#if REISER4_LARGE_KEY
52624+#define DE_NAME_BUF_LEN (24)
52625+#else
52626+#define DE_NAME_BUF_LEN (16)
52627+#endif
52628+
52629+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
52630+#endif
52631+
52632+/* Make Linus happy.
52633+ Local variables:
52634+ c-indentation-style: "K&R"
52635+ mode-name: "LC"
52636+ c-basic-offset: 8
52637+ tab-width: 8
52638+ fill-column: 120
52639+ End:
52640+*/
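
Editorial note: directory_entry_format above ends in a zero-length array
(d8 name[0], the old GCC spelling of a C99 flexible array member), so sizeof
covers only the fixed key id, and the data.length arithmetic in add_entry_de()
can simply add len + 1 for long names. A self-contained sketch of that sizing
with a hypothetical toy_dent struct (the 16-byte id is an assumption, not the
real obj_key_id width):

#include <stddef.h>
#include <stdio.h>

struct toy_dent {
        unsigned char id[16];   /* fixed part: packed stat-data key id */
        char name[];            /* flexible array member: 0 bytes in sizeof */
};

/* bytes a new entry occupies: the fixed header, plus the NUL-terminated
   name only when it is too long to be encoded in the key */
static size_t toy_entry_length(size_t name_len, int longname)
{
        size_t len = sizeof(struct toy_dent);
        if (longname)
                len += name_len + 1;
        return len;
}

int main(void)
{
        printf("short: %zu bytes, long(8): %zu bytes\n",
               toy_entry_length(8, 0), toy_entry_length(8, 1));
        return 0;
}

This is also why DE_NAME_BUF_LEN (24 bytes with large keys, 16 otherwise,
tracking the key width) bounds the scratch buffer that extract_name_de() may
write a key-decoded name into.
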
52641diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.22/fs/reiser4/plugin/item/static_stat.c
52642--- linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 03:00:00.000000000 +0300
52643+++ linux-2.6.22/fs/reiser4/plugin/item/static_stat.c 2007-07-29 00:25:34.976722360 +0400
52644@@ -0,0 +1,1107 @@
52645+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52646+
52647+/* stat data manipulation. */
52648+
52649+#include "../../forward.h"
52650+#include "../../super.h"
52651+#include "../../vfs_ops.h"
52652+#include "../../inode.h"
52653+#include "../../debug.h"
52654+#include "../../dformat.h"
52655+#include "../object.h"
52656+#include "../plugin.h"
52657+#include "../plugin_header.h"
52658+#include "static_stat.h"
52659+#include "item.h"
52660+
52661+#include <linux/types.h>
52662+#include <linux/fs.h>
52663+
52664+/* see static_stat.h for explanation */
52665+
52666+/* helper function used while we are dumping/loading inode/plugin state
52667+ to/from the stat-data. */
52668+
52669+static void move_on(int *length /* space remaining in stat-data */ ,
52670+ char **area /* current coord in stat data */ ,
52671+ int size_of /* how many bytes to move forward */ )
52672+{
52673+ assert("nikita-615", length != NULL);
52674+ assert("nikita-616", area != NULL);
52675+
52676+ *length -= size_of;
52677+ *area += size_of;
52678+
52679+ assert("nikita-617", *length >= 0);
52680+}
52681+
52682+/* helper function used while loading inode/plugin state from stat-data.
52683+ Complain if there is less space in stat-data than was expected.
52684+ Can only happen on disk corruption. */
52685+static int not_enough_space(struct inode *inode /* object being processed */ ,
52686+ const char *where /* error message */ )
52687+{
52688+ assert("nikita-618", inode != NULL);
52689+
52690+ warning("nikita-619", "Not enough space in %llu while loading %s",
52691+ (unsigned long long)get_inode_oid(inode), where);
52692+
52693+ return RETERR(-EINVAL);
52694+}
52695+
52696+/* helper function used while loading inode/plugin state from
52697+ stat-data. Call it if invalid plugin id was found. */
52698+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
52699+ struct inode *inode /* object being processed */ )
52700+{
52701+ warning("nikita-620", "Unknown plugin %i in %llu",
52702+ id, (unsigned long long)get_inode_oid(inode));
52703+
52704+ return RETERR(-EINVAL);
52705+}
52706+
52707+/* this is installed as ->init_inode() method of
52708+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
52709+ Copies data from on-disk stat-data format into inode.
52710+ Handles stat-data extensions. */
52711+/* was sd_load */
52712+int init_inode_static_sd(struct inode *inode /* object being processed */ ,
52713+ char *sd /* stat-data body */ ,
52714+ int len /* length of stat-data */ )
52715+{
52716+ int result;
52717+ int bit;
52718+ int chunk;
52719+ __u16 mask;
52720+ __u64 bigmask;
52721+ reiser4_stat_data_base *sd_base;
52722+ reiser4_inode *state;
52723+
52724+ assert("nikita-625", inode != NULL);
52725+ assert("nikita-626", sd != NULL);
52726+
52727+ result = 0;
52728+ sd_base = (reiser4_stat_data_base *) sd;
52729+ state = reiser4_inode_data(inode);
52730+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
52731+ bigmask = mask;
52732+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
52733+
52734+ move_on(&len, &sd, sizeof *sd_base);
52735+ for (bit = 0, chunk = 0;
52736+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
52737+ ++bit, mask >>= 1) {
52738+ if (((bit + 1) % 16) != 0) {
52739+ /* handle extension */
52740+ sd_ext_plugin *sdplug;
52741+
52742+ if (bit >= LAST_SD_EXTENSION) {
52743+ warning("vpf-1904",
52744+ "No such extension %i in inode %llu",
52745+ bit,
52746+ (unsigned long long)
52747+ get_inode_oid(inode));
52748+
52749+ result = RETERR(-EINVAL);
52750+ break;
52751+ }
52752+
52753+ sdplug = sd_ext_plugin_by_id(bit);
52754+ if (sdplug == NULL) {
52755+ warning("nikita-627",
52756+ "No such extension %i in inode %llu",
52757+ bit,
52758+ (unsigned long long)
52759+ get_inode_oid(inode));
52760+
52761+ result = RETERR(-EINVAL);
52762+ break;
52763+ }
52764+ if (mask & 1) {
52765+ assert("nikita-628", sdplug->present);
52766+ /* alignment is not supported in node layout
52767+ plugin yet.
52768+ result = align( inode, &len, &sd,
52769+ sdplug -> alignment );
52770+ if( result != 0 )
52771+ return result; */
52772+ result = sdplug->present(inode, &sd, &len);
52773+ } else if (sdplug->absent != NULL)
52774+ result = sdplug->absent(inode);
52775+ if (result)
52776+ break;
52777+ /* else, we are looking at the last bit in 16-bit
52778+ portion of bitmask */
52779+ } else if (mask & 1) {
52780+ /* next portion of bitmask */
52781+ if (len < (int)sizeof(d16)) {
52782+ warning("nikita-629",
52783+ "No space for bitmap in inode %llu",
52784+ (unsigned long long)
52785+ get_inode_oid(inode));
52786+
52787+ result = RETERR(-EINVAL);
52788+ break;
52789+ }
52790+ mask = le16_to_cpu(get_unaligned((d16 *)sd));
52791+ bigmask <<= 16;
52792+ bigmask |= mask;
52793+ move_on(&len, &sd, sizeof(d16));
52794+ ++chunk;
52795+ if (chunk == 3) {
52796+ if (!(mask & 0x8000)) {
52797+ /* clear last bit */
52798+ mask &= ~0x8000;
52799+ continue;
52800+ }
52801+ /* too much */
52802+ warning("nikita-630",
52803+ "Too many extensions in %llu",
52804+ (unsigned long long)
52805+ get_inode_oid(inode));
52806+
52807+ result = RETERR(-EINVAL);
52808+ break;
52809+ }
52810+ } else
52811+ /* bitmask exhausted */
52812+ break;
52813+ }
52814+ state->extmask = bigmask;
52815+ /* common initialisations */
52816+ if (len - (bit / 16 * sizeof(d16)) > 0) {
52817+ /* alignment in save_len_static_sd() is taken into account
52818+ -edward */
52819+ warning("nikita-631", "unused space in inode %llu",
52820+ (unsigned long long)get_inode_oid(inode));
52821+ }
52822+
52823+ return result;
52824+}
52825+
52826+/* estimates size of stat-data required to store inode.
52827+ Installed as ->save_len() method of
52828+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
52829+/* was sd_len */
52830+int save_len_static_sd(struct inode *inode /* object being processed */ )
52831+{
52832+ unsigned int result;
52833+ __u64 mask;
52834+ int bit;
52835+
52836+ assert("nikita-632", inode != NULL);
52837+
52838+ result = sizeof(reiser4_stat_data_base);
52839+ mask = reiser4_inode_data(inode)->extmask;
52840+ for (bit = 0; mask != 0; ++bit, mask >>= 1) {
52841+ if (mask & 1) {
52842+ sd_ext_plugin *sdplug;
52843+
52844+ sdplug = sd_ext_plugin_by_id(bit);
52845+ assert("nikita-633", sdplug != NULL);
52846+ /* no alignment support
52847+ result +=
52848+ round_up( result, sdplug -> alignment ) - result; */
52849+ result += sdplug->save_len(inode);
52850+ }
52851+ }
52852+ result += bit / 16 * sizeof(d16);
52853+ return result;
52854+}
52855+
52856+/* saves inode into stat-data.
52857+ Installed as ->save() method of
52858+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
52859+/* was sd_save */
52860+int save_static_sd(struct inode *inode /* object being processed */ ,
52861+ char **area /* where to save stat-data */ )
52862+{
52863+ int result;
52864+ __u64 emask;
52865+ int bit;
52866+ unsigned int len;
52867+ reiser4_stat_data_base *sd_base;
52868+
52869+ assert("nikita-634", inode != NULL);
52870+ assert("nikita-635", area != NULL);
52871+
52872+ result = 0;
52873+ emask = reiser4_inode_data(inode)->extmask;
52874+ sd_base = (reiser4_stat_data_base *) * area;
52875+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
52876+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
52877+
52878+ *area += sizeof *sd_base;
52879+ len = 0xffffffffu;
52880+ for (bit = 0; emask != 0; ++bit, emask >>= 1) {
52881+ if (emask & 1) {
52882+ if ((bit + 1) % 16 != 0) {
52883+ sd_ext_plugin *sdplug;
52884+ sdplug = sd_ext_plugin_by_id(bit);
52885+ assert("nikita-636", sdplug != NULL);
52886+ /* no alignment support yet
52887+ align( inode, &len, area,
52888+ sdplug -> alignment ); */
52889+ result = sdplug->save(inode, area);
52890+ if (result)
52891+ break;
52892+ } else {
52893+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
52894+ (d16 *)(*area));
52895+ /*cputod16((unsigned)(emask & 0xffff),
52896+ (d16 *) * area);*/
52897+ *area += sizeof(d16);
52898+ }
52899+ }
52900+ }
52901+ return result;
52902+}
52903+
52904+/* stat-data extension handling functions. */
52905+
52906+static int present_lw_sd(struct inode *inode /* object being processed */ ,
52907+ char **area /* position in stat-data */ ,
52908+ int *len /* remaining length */ )
52909+{
52910+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
52911+ reiser4_light_weight_stat *sd_lw;
52912+
52913+ sd_lw = (reiser4_light_weight_stat *) * area;
52914+
52915+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
52916+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
52917+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
52918+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
52919+ inode->i_mode &= ~S_IFIFO;
52920+ warning("", "partially converted file is encountered");
52921+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
52922+ }
52923+ move_on(len, area, sizeof *sd_lw);
52924+ return 0;
52925+ } else
52926+ return not_enough_space(inode, "lw sd");
52927+}
52928+
52929+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
52930+ * processed */ )
52931+{
52932+ return sizeof(reiser4_light_weight_stat);
52933+}
52934+
52935+static int save_lw_sd(struct inode *inode /* object being processed */ ,
52936+ char **area /* position in stat-data */ )
52937+{
52938+ reiser4_light_weight_stat *sd;
52939+ mode_t delta;
52940+
52941+ assert("nikita-2705", inode != NULL);
52942+ assert("nikita-2706", area != NULL);
52943+ assert("nikita-2707", *area != NULL);
52944+
52945+ sd = (reiser4_light_weight_stat *) * area;
52946+
52947+ delta = (reiser4_inode_get_flag(inode,
52948+ REISER4_PART_MIXED) ? S_IFIFO : 0);
52949+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
52950+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
52951+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
52952+ *area += sizeof *sd;
52953+ return 0;
52954+}
52955+
52956+static int present_unix_sd(struct inode *inode /* object being processed */ ,
52957+ char **area /* position in stat-data */ ,
52958+ int *len /* remaining length */ )
52959+{
52960+ assert("nikita-637", inode != NULL);
52961+ assert("nikita-638", area != NULL);
52962+ assert("nikita-639", *area != NULL);
52963+ assert("nikita-640", len != NULL);
52964+ assert("nikita-641", *len > 0);
52965+
52966+ if (*len >= (int)sizeof(reiser4_unix_stat)) {
52967+ reiser4_unix_stat *sd;
52968+
52969+ sd = (reiser4_unix_stat *) * area;
52970+
52971+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
52972+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
52973+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
52974+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
52975+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
52976+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
52977+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
52978+ else
52979+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
52980+ move_on(len, area, sizeof *sd);
52981+ return 0;
52982+ } else
52983+ return not_enough_space(inode, "unix sd");
52984+}
52985+
52986+static int absent_unix_sd(struct inode *inode /* object being processed */ )
52987+{
52988+ inode->i_uid = get_super_private(inode->i_sb)->default_uid;
52989+ inode->i_gid = get_super_private(inode->i_sb)->default_gid;
52990+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
52991+ inode_set_bytes(inode, inode->i_size);
52992+ /* mark inode as lightweight, so that caller (lookup_common) will
52993+ complete initialisation by copying [ug]id from a parent. */
52994+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
52995+ return 0;
52996+}
52997+
52998+/* Audited by: green(2002.06.14) */
52999+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53000+ * processed */ )
53001+{
53002+ return sizeof(reiser4_unix_stat);
53003+}
53004+
53005+static int save_unix_sd(struct inode *inode /* object being processed */ ,
53006+ char **area /* position in stat-data */ )
53007+{
53008+ reiser4_unix_stat *sd;
53009+
53010+ assert("nikita-642", inode != NULL);
53011+ assert("nikita-643", area != NULL);
53012+ assert("nikita-644", *area != NULL);
53013+
53014+ sd = (reiser4_unix_stat *) * area;
53015+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53016+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53017+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53018+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53019+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53020+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53021+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53022+ else
53023+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53024+ *area += sizeof *sd;
53025+ return 0;
53026+}
53027+
53028+static int
53029+present_large_times_sd(struct inode *inode /* object being processed */ ,
53030+ char **area /* position in stat-data */ ,
53031+ int *len /* remaining length */ )
53032+{
53033+ if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53034+ reiser4_large_times_stat *sd_lt;
53035+
53036+ sd_lt = (reiser4_large_times_stat *) * area;
53037+
53038+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53039+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53040+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53041+
53042+ move_on(len, area, sizeof *sd_lt);
53043+ return 0;
53044+ } else
53045+ return not_enough_space(inode, "large times sd");
53046+}
53047+
53048+static int
53049+save_len_large_times_sd(struct inode *inode UNUSED_ARG
53050+ /* object being processed */ )
53051+{
53052+ return sizeof(reiser4_large_times_stat);
53053+}
53054+
53055+static int
53056+save_large_times_sd(struct inode *inode /* object being processed */ ,
53057+ char **area /* position in stat-data */ )
53058+{
53059+ reiser4_large_times_stat *sd;
53060+
53061+ assert("nikita-2817", inode != NULL);
53062+ assert("nikita-2818", area != NULL);
53063+ assert("nikita-2819", *area != NULL);
53064+
53065+ sd = (reiser4_large_times_stat *) * area;
53066+
53067+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53068+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53069+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53070+
53071+ *area += sizeof *sd;
53072+ return 0;
53073+}
53074+
53075+/* symlink stat data extension */
53076+
53077+/* allocate memory for symlink target and attach it to inode->i_private */
53078+static int
53079+symlink_target_to_inode(struct inode *inode, const char *target, int len)
53080+{
53081+ assert("vs-845", inode->i_private == NULL);
53082+ assert("vs-846", !reiser4_inode_get_flag(inode,
53083+ REISER4_GENERIC_PTR_USED));
53084+ /* FIXME-VS: this is prone to deadlock. Not more than other similar
53085+ places, though */
53086+ inode->i_private = kmalloc((size_t) len + 1,
53087+ reiser4_ctx_gfp_mask_get());
53088+ if (!inode->i_private)
53089+ return RETERR(-ENOMEM);
53090+
53091+ memcpy((char *)(inode->i_private), target, (size_t) len);
53092+ ((char *)(inode->i_private))[len] = 0;
53093+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53094+ return 0;
53095+}
53096+
53097+/* this is called on read_inode. There is nothing to do actually, but some
53098+ sanity checks */
53099+static int present_symlink_sd(struct inode *inode, char **area, int *len)
53100+{
53101+ int result;
53102+ int length;
53103+ reiser4_symlink_stat *sd;
53104+
53105+ length = (int)inode->i_size;
53106+ /*
53107+ * *len is number of bytes in stat data item from *area to the end of
53108+ * item. It must be not less than size of symlink + 1 for ending 0
53109+ */
53110+ if (length > *len)
53111+ return not_enough_space(inode, "symlink");
53112+
53113+ if (*(*area + length) != 0) {
53114+ warning("vs-840", "Symlink is not zero terminated");
53115+ return RETERR(-EIO);
53116+ }
53117+
53118+ sd = (reiser4_symlink_stat *) * area;
53119+ result = symlink_target_to_inode(inode, sd->body, length);
53120+
53121+ move_on(len, area, length + 1);
53122+ return result;
53123+}
53124+
53125+static int save_len_symlink_sd(struct inode *inode)
53126+{
53127+ return inode->i_size + 1;
53128+}
53129+
53130+/* this is called on create and update stat data. Do nothing on update but
53131+ update @area */
53132+static int save_symlink_sd(struct inode *inode, char **area)
53133+{
53134+ int result;
53135+ int length;
53136+ reiser4_symlink_stat *sd;
53137+
53138+ length = (int)inode->i_size;
53139+ /* inode->i_size must be set already */
53140+ assert("vs-841", length);
53141+
53142+ result = 0;
53143+ sd = (reiser4_symlink_stat *) * area;
53144+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53145+ const char *target;
53146+
53147+ target = (const char *)(inode->i_private);
53148+ inode->i_private = NULL;
53149+
53150+ result = symlink_target_to_inode(inode, target, length);
53151+
53152+ /* copy symlink to stat data */
53153+ memcpy(sd->body, target, (size_t) length);
53154+ (*area)[length] = 0;
53155+ } else {
53156+ /* there is nothing to do in update but move area */
53157+ assert("vs-844",
53158+ !memcmp(inode->i_private, sd->body,
53159+ (size_t) length + 1));
53160+ }
53161+
53162+ *area += (length + 1);
53163+ return result;
53164+}
53165+
53166+static int present_flags_sd(struct inode *inode /* object being processed */ ,
53167+ char **area /* position in stat-data */ ,
53168+ int *len /* remaining length */ )
53169+{
53170+ assert("nikita-645", inode != NULL);
53171+ assert("nikita-646", area != NULL);
53172+ assert("nikita-647", *area != NULL);
53173+ assert("nikita-648", len != NULL);
53174+ assert("nikita-649", *len > 0);
53175+
53176+ if (*len >= (int)sizeof(reiser4_flags_stat)) {
53177+ reiser4_flags_stat *sd;
53178+
53179+ sd = (reiser4_flags_stat *) * area;
53180+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53181+ move_on(len, area, sizeof *sd);
53182+ return 0;
53183+ } else
53184+ return not_enough_space(inode, "generation and attrs");
53185+}
53186+
53187+/* Audited by: green(2002.06.14) */
53188+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53189+ * processed */ )
53190+{
53191+ return sizeof(reiser4_flags_stat);
53192+}
53193+
53194+static int save_flags_sd(struct inode *inode /* object being processed */ ,
53195+ char **area /* position in stat-data */ )
53196+{
53197+ reiser4_flags_stat *sd;
53198+
53199+ assert("nikita-650", inode != NULL);
53200+ assert("nikita-651", area != NULL);
53201+ assert("nikita-652", *area != NULL);
53202+
53203+ sd = (reiser4_flags_stat *) * area;
53204+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53205+ *area += sizeof *sd;
53206+ return 0;
53207+}
53208+
53209+static int absent_plugin_sd(struct inode *inode);
53210+static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53211+ char **area /* position in stat-data */ ,
53212+ int *len /* remaining length */,
53213+ int is_pset /* 1 if plugin set, 0 if heir set. */)
53214+{
53215+ reiser4_plugin_stat *sd;
53216+ reiser4_plugin *plugin;
53217+ reiser4_inode *info;
53218+ int i;
53219+ __u16 mask;
53220+ int result;
53221+ int num_of_plugins;
53222+
53223+ assert("nikita-653", inode != NULL);
53224+ assert("nikita-654", area != NULL);
53225+ assert("nikita-655", *area != NULL);
53226+ assert("nikita-656", len != NULL);
53227+ assert("nikita-657", *len > 0);
53228+
53229+ if (*len < (int)sizeof(reiser4_plugin_stat))
53230+ return not_enough_space(inode, "plugin");
53231+
53232+ sd = (reiser4_plugin_stat *) * area;
53233+ info = reiser4_inode_data(inode);
53234+
53235+ mask = 0;
53236+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
53237+ move_on(len, area, sizeof *sd);
53238+ result = 0;
53239+ for (i = 0; i < num_of_plugins; ++i) {
53240+ reiser4_plugin_slot *slot;
53241+ reiser4_plugin_type type;
53242+ pset_member memb;
53243+
53244+ slot = (reiser4_plugin_slot *) * area;
53245+ if (*len < (int)sizeof *slot)
53246+ return not_enough_space(inode, "additional plugin");
53247+
53248+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
53249+ type = aset_member_to_type_unsafe(memb);
53250+
53251+ if (type == REISER4_PLUGIN_TYPES) {
53252+ warning("nikita-3502",
53253+ "wrong %s member (%i) for %llu", is_pset ?
53254+ "pset" : "hset", memb,
53255+ (unsigned long long)get_inode_oid(inode));
53256+ return RETERR(-EINVAL);
53257+ }
53258+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
53259+ type, &slot->id);
53260+ if (plugin == NULL)
53261+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
53262+
53263+ /* plugin is loaded into inode, mark this into inode's
53264+ bitmask of loaded non-standard plugins */
53265+ if (!(mask & (1 << memb))) {
53266+ mask |= (1 << memb);
53267+ } else {
53268+ warning("nikita-658", "duplicate plugin for %llu",
53269+ (unsigned long long)get_inode_oid(inode));
53270+ return RETERR(-EINVAL);
53271+ }
53272+ move_on(len, area, sizeof *slot);
53273+ /* load plugin data, if any */
53274+ if (plugin->h.pops != NULL && plugin->h.pops->load)
53275+ result = plugin->h.pops->load(inode, plugin, area, len);
53276+ else
53277+ result = aset_set_unsafe(is_pset ? &info->pset :
53278+ &info->hset, memb, plugin);
53279+ if (result)
53280+ return result;
53281+ }
53282+ if (is_pset) {
53283+ /* if object plugin wasn't loaded from stat-data, guess it by
53284+ mode bits */
53285+ plugin = file_plugin_to_plugin(inode_file_plugin(inode));
53286+ if (plugin == NULL)
53287+ result = absent_plugin_sd(inode);
53288+ info->plugin_mask = mask;
53289+ } else
53290+ info->heir_mask = mask;
53291+
53292+ return result;
53293+}
53294+
53295+static int present_pset_sd(struct inode *inode, char **area, int *len) {
53296+ return present_plugin_sd(inode, area, len, 1 /* pset */);
53297+}
53298+
53299+/* Determine object plugin for @inode based on i_mode.
53300+
53301+ Many objects in reiser4 file system are controlled by standard object
53302+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
53303+
53304+ For such files we don't explicitly store plugin id in object stat
53305+ data. Rather required plugin is guessed from mode bits, where file "type"
53306+ is encoded (see stat(2)).
53307+*/
53308+static int
53309+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
53310+{
53311+ int fplug_id;
53312+ int dplug_id;
53313+ reiser4_inode *info;
53314+
53315+ assert("nikita-736", inode != NULL);
53316+
53317+ dplug_id = fplug_id = -1;
53318+
53319+ switch (inode->i_mode & S_IFMT) {
53320+ case S_IFSOCK:
53321+ case S_IFBLK:
53322+ case S_IFCHR:
53323+ case S_IFIFO:
53324+ fplug_id = SPECIAL_FILE_PLUGIN_ID;
53325+ break;
53326+ case S_IFLNK:
53327+ fplug_id = SYMLINK_FILE_PLUGIN_ID;
53328+ break;
53329+ case S_IFDIR:
53330+ fplug_id = DIRECTORY_FILE_PLUGIN_ID;
53331+ dplug_id = HASHED_DIR_PLUGIN_ID;
53332+ break;
53333+ default:
53334+ warning("nikita-737", "wrong file mode: %o", inode->i_mode);
53335+ return RETERR(-EIO);
53336+ case S_IFREG:
53337+ fplug_id = UNIX_FILE_PLUGIN_ID;
53338+ break;
53339+ }
53340+ info = reiser4_inode_data(inode);
53341+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
53342+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
53343+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
53344+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
53345+ return 0;
53346+}
53347+
53348+/* Audited by: green(2002.06.14) */
53349+static int absent_plugin_sd(struct inode *inode /* object being processed */ )
53350+{
53351+ int result;
53352+
53353+ assert("nikita-659", inode != NULL);
53354+
53355+ result = guess_plugin_by_mode(inode);
53356+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
53357+ but setup_inode_ops() will call make_bad_inode().
53358+ Another, more logical but a bit more complex solution is to add
53359+ "bad-file plugin". */
53360+ /* FIXME-VS: activate was called here */
53361+ return result;
53362+}
53363+
53364+/* helper function for plugin_sd_save_len(): calculate how much space
53365+ required to save state of given plugin */
53366+/* Audited by: green(2002.06.14) */
53367+static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
53368+ struct inode *inode /* object being processed */ ,
53369+ pset_member memb,
53370+ int len, int is_pset)
53371+{
53372+ reiser4_inode *info;
53373+ assert("nikita-661", inode != NULL);
53374+
53375+ if (plugin == NULL)
53376+ return len;
53377+
53378+ info = reiser4_inode_data(inode);
53379+ if (is_pset ?
53380+ info->plugin_mask & (1 << memb) :
53381+ info->heir_mask & (1 << memb)) {
53382+ len += sizeof(reiser4_plugin_slot);
53383+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
53384+ /* non-standard plugin, call method */
53385+ /* commented as it is incompatible with alignment
53386+ * policy in save_plug() -edward */
53387+ /* len = round_up(len, plugin->h.pops->alignment); */
53388+ len += plugin->h.pops->save_len(inode, plugin);
53389+ }
53390+ }
53391+ return len;
53392+}
53393+
53394+/* calculate how much space is required to save state of all plugins,
53395+ associated with inode */
53396+static int save_len_plugin_sd(struct inode *inode /* object being processed */,
53397+ int is_pset)
53398+{
53399+ int len;
53400+ int last;
53401+ reiser4_inode *state;
53402+ pset_member memb;
53403+
53404+ assert("nikita-663", inode != NULL);
53405+
53406+ state = reiser4_inode_data(inode);
53407+
53408+ /* common case: no non-standard plugins */
53409+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53410+ return 0;
53411+ len = sizeof(reiser4_plugin_stat);
53412+ last = PSET_LAST;
53413+
53414+ for (memb = 0; memb < last; ++memb) {
53415+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
53416+ inode, memb, len, is_pset);
53417+ }
53418+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
53419+ return len;
53420+}
53421+
53422+static int save_len_pset_sd(struct inode *inode) {
53423+ return save_len_plugin_sd(inode, 1 /* pset */);
53424+}
53425+
53426+/* helper function for plugin_sd_save(): save plugin, associated with
53427+ inode. */
53428+static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
53429+ struct inode *inode /* object being processed */ ,
53430+ int memb /* what element of pset is saved */ ,
53431+ char **area /* position in stat-data */ ,
53432+ int *count /* incremented if plugin were actually saved. */,
53433+ int is_pset /* 1 for plugin set, 0 for heir set */)
53434+{
53435+ reiser4_plugin_slot *slot;
53436+ int fake_len;
53437+ int result;
53438+
53439+ assert("nikita-665", inode != NULL);
53440+ assert("nikita-666", area != NULL);
53441+ assert("nikita-667", *area != NULL);
53442+
53443+ if (plugin == NULL)
53444+ return 0;
53445+
53446+ if (is_pset ?
53447+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
53448+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
53449+ return 0;
53450+ slot = (reiser4_plugin_slot *) * area;
53451+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
53452+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
53453+ fake_len = (int)0xffff;
53454+ move_on(&fake_len, area, sizeof *slot);
53455+ ++*count;
53456+ result = 0;
53457+ if (plugin->h.pops != NULL) {
53458+ if (plugin->h.pops->save != NULL)
53459+ result = plugin->h.pops->save(inode, plugin, area);
53460+ }
53461+ return result;
53462+}
53463+
53464+/* save state of all non-standard plugins associated with inode */
53465+static int save_plugin_sd(struct inode *inode /* object being processed */ ,
53466+ char **area /* position in stat-data */,
53467+ int is_pset /* 1 for pset, 0 for hset */)
53468+{
53469+ int fake_len;
53470+ int result = 0;
53471+ int num_of_plugins;
53472+ reiser4_plugin_stat *sd;
53473+ reiser4_inode *state;
53474+ pset_member memb;
53475+
53476+ assert("nikita-669", inode != NULL);
53477+ assert("nikita-670", area != NULL);
53478+ assert("nikita-671", *area != NULL);
53479+
53480+ state = reiser4_inode_data(inode);
53481+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
53482+ return 0;
53483+ sd = (reiser4_plugin_stat *) * area;
53484+ fake_len = (int)0xffff;
53485+ move_on(&fake_len, area, sizeof *sd);
53486+
53487+ num_of_plugins = 0;
53488+ for (memb = 0; memb < PSET_LAST; ++memb) {
53489+ result = save_plug(aset_get(is_pset ? state->pset : state->hset,
53490+ memb),
53491+ inode, memb, area, &num_of_plugins, is_pset);
53492+ if (result != 0)
53493+ break;
53494+ }
53495+
53496+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
53497+ return result;
53498+}
53499+
53500+static int save_pset_sd(struct inode *inode, char **area) {
53501+ return save_plugin_sd(inode, area, 1 /* pset */);
53502+}
53503+
53504+static int present_hset_sd(struct inode *inode, char **area, int *len) {
53505+ return present_plugin_sd(inode, area, len, 0 /* hset */);
53506+}
53507+
53508+static int save_len_hset_sd(struct inode *inode) {
53509+ return save_len_plugin_sd(inode, 0 /* hset */);
53510+}
53511+
53512+static int save_hset_sd(struct inode *inode, char **area) {
53513+ return save_plugin_sd(inode, area, 0 /* hset */);
53514+}
53515+
53516+/* helper function for crypto_sd_present(), crypto_sd_save.
53517+ Extract crypto info from stat-data and attach it to inode */
53518+static int extract_crypto_info (struct inode * inode,
53519+ reiser4_crypto_stat * sd)
53520+{
53521+ struct reiser4_crypto_info * info;
53522+ assert("edward-11", !inode_crypto_info(inode));
53523+ assert("edward-1413",
53524+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
53525+ /* create and attach a crypto-stat without secret key loaded */
53526+ info = reiser4_alloc_crypto_info(inode);
53527+ if (IS_ERR(info))
53528+ return PTR_ERR(info);
53529+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
53530+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
53531+ reiser4_attach_crypto_info(inode, info);
53532+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53533+ return 0;
53534+}
53535+
53536+/* crypto stat-data extension */
53537+
53538+static int present_crypto_sd(struct inode *inode, char **area, int *len)
53539+{
53540+ int result;
53541+ reiser4_crypto_stat *sd;
53542+ digest_plugin *dplug = inode_digest_plugin(inode);
53543+
53544+ assert("edward-06", dplug != NULL);
53545+ assert("edward-684", dplug->fipsize);
53546+ assert("edward-07", area != NULL);
53547+ assert("edward-08", *area != NULL);
53548+ assert("edward-09", len != NULL);
53549+ assert("edward-10", *len > 0);
53550+
53551+ if (*len < (int)sizeof(reiser4_crypto_stat)) {
53552+ return not_enough_space(inode, "crypto-sd");
53553+ }
53554+ /* *len is number of bytes in stat data item from *area to the end of
53555+ item. It must be at least the size of this extension */
53556+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
53557+
53558+ sd = (reiser4_crypto_stat *) * area;
53559+ result = extract_crypto_info(inode, sd);
53560+ move_on(len, area, sizeof(*sd) + dplug->fipsize);
53561+
53562+ return result;
53563+}
53564+
53565+static int save_len_crypto_sd(struct inode *inode)
53566+{
53567+ return sizeof(reiser4_crypto_stat) +
53568+ inode_digest_plugin(inode)->fipsize;
53569+}
53570+
53571+static int save_crypto_sd(struct inode *inode, char **area)
53572+{
53573+ int result = 0;
53574+ reiser4_crypto_stat *sd;
53575+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
53576+ digest_plugin *dplug = inode_digest_plugin(inode);
53577+
53578+ assert("edward-12", dplug != NULL);
53579+ assert("edward-13", area != NULL);
53580+ assert("edward-14", *area != NULL);
53581+ assert("edward-15", info != NULL);
53582+ assert("edward-1414", info->keyid != NULL);
53583+ assert("edward-1415", info->keysize != 0);
53584+ assert("edward-76", reiser4_inode_data(inode) != NULL);
53585+
53586+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
53587+ /* file is just created */
53588+ sd = (reiser4_crypto_stat *) *area;
53589+ /* copy everything but private key to the disk stat-data */
53590+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
53591+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
53592+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
53593+ }
53594+ *area += (sizeof(*sd) + dplug->fipsize);
53595+ return result;
53596+}
53597+
53598+static int eio(struct inode *inode, char **area, int *len)
53599+{
53600+ return RETERR(-EIO);
53601+}
53602+
53603+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
53604+ [LIGHT_WEIGHT_STAT] = {
53605+ .h = {
53606+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53607+ .id = LIGHT_WEIGHT_STAT,
53608+ .pops = NULL,
53609+ .label = "light-weight sd",
53610+ .desc = "sd for light-weight files",
53611+ .linkage = {NULL,NULL}
53612+ },
53613+ .present = present_lw_sd,
53614+ .absent = NULL,
53615+ .save_len = save_len_lw_sd,
53616+ .save = save_lw_sd,
53617+ .alignment = 8
53618+ },
53619+ [UNIX_STAT] = {
53620+ .h = {
53621+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53622+ .id = UNIX_STAT,
53623+ .pops = NULL,
53624+ .label = "unix-sd",
53625+ .desc = "unix stat-data fields",
53626+ .linkage = {NULL,NULL}
53627+ },
53628+ .present = present_unix_sd,
53629+ .absent = absent_unix_sd,
53630+ .save_len = save_len_unix_sd,
53631+ .save = save_unix_sd,
53632+ .alignment = 8
53633+ },
53634+ [LARGE_TIMES_STAT] = {
53635+ .h = {
53636+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53637+ .id = LARGE_TIMES_STAT,
53638+ .pops = NULL,
53639+ .label = "64time-sd",
53640+ .desc = "nanosecond resolution for times",
53641+ .linkage = {NULL,NULL}
53642+ },
53643+ .present = present_large_times_sd,
53644+ .absent = NULL,
53645+ .save_len = save_len_large_times_sd,
53646+ .save = save_large_times_sd,
53647+ .alignment = 8
53648+ },
53649+ [SYMLINK_STAT] = {
53650+ /* stat data of symlink has this extension */
53651+ .h = {
53652+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53653+ .id = SYMLINK_STAT,
53654+ .pops = NULL,
53655+ .label = "symlink-sd",
53656+ .desc =
53657+ "stat data is appended with symlink name",
53658+ .linkage = {NULL,NULL}
53659+ },
53660+ .present = present_symlink_sd,
53661+ .absent = NULL,
53662+ .save_len = save_len_symlink_sd,
53663+ .save = save_symlink_sd,
53664+ .alignment = 8
53665+ },
53666+ [PLUGIN_STAT] = {
53667+ .h = {
53668+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53669+ .id = PLUGIN_STAT,
53670+ .pops = NULL,
53671+ .label = "plugin-sd",
53672+ .desc = "plugin stat-data fields",
53673+ .linkage = {NULL,NULL}
53674+ },
53675+ .present = present_pset_sd,
53676+ .absent = absent_plugin_sd,
53677+ .save_len = save_len_pset_sd,
53678+ .save = save_pset_sd,
53679+ .alignment = 8
53680+ },
53681+ [HEIR_STAT] = {
53682+ .h = {
53683+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53684+ .id = HEIR_STAT,
53685+ .pops = NULL,
53686+ .label = "heir-plugin-sd",
53687+ .desc = "heir plugin stat-data fields",
53688+ .linkage = {NULL,NULL}
53689+ },
53690+ .present = present_hset_sd,
53691+ .absent = NULL,
53692+ .save_len = save_len_hset_sd,
53693+ .save = save_hset_sd,
53694+ .alignment = 8
53695+ },
53696+ [FLAGS_STAT] = {
53697+ .h = {
53698+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53699+ .id = FLAGS_STAT,
53700+ .pops = NULL,
53701+ .label = "flags-sd",
53702+ .desc = "inode bit flags",
53703+ .linkage = {NULL, NULL}
53704+ },
53705+ .present = present_flags_sd,
53706+ .absent = NULL,
53707+ .save_len = save_len_flags_sd,
53708+ .save = save_flags_sd,
53709+ .alignment = 8
53710+ },
53711+ [CAPABILITIES_STAT] = {
53712+ .h = {
53713+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53714+ .id = CAPABILITIES_STAT,
53715+ .pops = NULL,
53716+ .label = "capabilities-sd",
53717+ .desc = "capabilities",
53718+ .linkage = {NULL, NULL}
53719+ },
53720+ .present = eio,
53721+ .absent = NULL,
53722+ .save_len = save_len_flags_sd,
53723+ .save = save_flags_sd,
53724+ .alignment = 8
53725+ },
53726+ [CRYPTO_STAT] = {
53727+ .h = {
53728+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
53729+ .id = CRYPTO_STAT,
53730+ .pops = NULL,
53731+ .label = "crypto-sd",
53732+ .desc = "secret key size and id",
53733+ .linkage = {NULL, NULL}
53734+ },
53735+ .present = present_crypto_sd,
53736+ .absent = NULL,
53737+ .save_len = save_len_crypto_sd,
53738+ .save = save_crypto_sd,
53739+ .alignment = 8
53740+ }
53741+};
53742+
53743+/* Make Linus happy.
53744+ Local variables:
53745+ c-indentation-style: "K&R"
53746+ mode-name: "LC"
53747+ c-basic-offset: 8
53748+ tab-width: 8
53749+ fill-column: 120
53750+ End:
53751+*/
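
Editorial note: the subtlest part of static_stat.c is the extension-mask walk
in init_inode_static_sd(): the mask is stored as up to four little-endian
16-bit chunks, and in each chunk the top bit means "another chunk follows".
Below is a simplified, self-contained decoder of that chaining; unlike the
kernel loop (which interleaves the walk with ->present()/->absent() calls and
keeps the continuation bits inside bigmask) it strips the continuation bits
and returns only the payload. All names are hypothetical:

#include <stdint.h>
#include <stddef.h>

static uint64_t toy_read_extmask(const unsigned char *p, size_t *consumed)
{
        uint64_t mask = 0;
        size_t off = 0;
        int chunk;

        for (chunk = 0; chunk < 4; chunk++) {
                /* little-endian 16-bit load, alignment-safe */
                uint16_t v = (uint16_t)(p[off] | (p[off + 1] << 8));

                off += 2;
                mask |= (uint64_t)(v & 0x7fff) << (15 * chunk);
                if (!(v & 0x8000))      /* no continuation flag: done */
                        break;
        }
        *consumed = off;
        return mask;
}

The chunk == 3 check in init_inode_static_sd() enforces the same four-chunk
bound, and its ((bit + 1) % 16) test is the kernel loop's way of treating
every sixteenth bit position as the continuation flag rather than as an
extension bit.
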
53752diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.22/fs/reiser4/plugin/item/static_stat.h
53753--- linux-2.6.22.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 03:00:00.000000000 +0300
53754+++ linux-2.6.22/fs/reiser4/plugin/item/static_stat.h 2007-07-29 00:25:34.976722360 +0400
53755@@ -0,0 +1,224 @@
53756+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53757+
53758+/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
53759+
53760+In the case where each file has not less than the fields needed by the
53761+stat() syscall, it is more compact to store those fields in this
53762+struct.
53763+
53764+If this item does not exist, then all stats are dynamically resolved.
53765+At the moment, we either resolve all stats dynamically or all of them
53766+statically. If you think this is not fully optimal, and the rest of
53767+reiser4 is working, then fix it...:-)
53768+
53769+*/
53770+
53771+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
53772+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
53773+
53774+#include "../../forward.h"
53775+#include "../../dformat.h"
53776+
53777+#include <linux/fs.h> /* for struct inode */
53778+
53779+/* Stat data layout: goals and implementation.
53780+
53781+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
53782+ them, including not having semantic metadata attached to them.
53783+
53784+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
53785+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
53786+ sized structure because the statically sized structure knows without recording it what the names and lengths of the
53787+ attributes are.
53788+
53789+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file
53790+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
53791+ file in their use of file attributes.
53792+
53793+ Yet this compromise deserves to be compromised a little.
53794+
53795+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension
53796+ bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
53797+
53798+ If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
53799+ from parent directory (as uid, gid) or initialised to some sane values.
53800+
53801+ To capitalize on existing code infrastructure, extensions are
53802+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
53803+ Each stat-data extension plugin implements four methods:
53804+
53805+ ->present() called by sd_load() when this extension is found in stat-data
53806+ ->absent() called by sd_load() when this extension is not found in stat-data
53807+ ->save_len() called by sd_len() to calculate total length of stat-data
53808+ ->save() called by sd_save() to store extension data into stat-data
53809+
53810+ Implementation is in fs/reiser4/plugin/item/static_stat.c
53811+*/
53812+
53813+/* stat-data extension. Please order this by presumed frequency of use */
53814+typedef enum {
53815+ /* support for light-weight files */
53816+ LIGHT_WEIGHT_STAT,
53817+ /* data required to implement unix stat(2) call. Layout is in
53818+ reiser4_unix_stat. If this is not present, file is light-weight */
53819+ UNIX_STAT,
53820+ /* this contains additional set of 32bit [anc]time fields to implement
53821+ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
53822+ of this extension is governed by the 32bittimes mount option. */
53823+ LARGE_TIMES_STAT,
53824+ /* stat data has link name included */
53825+ SYMLINK_STAT,
53826+ /* on-disk slots of non-standard plugins for main plugin table
53827+ (@reiser4_inode->pset), that is, plugins that cannot be deduced
53828+ from file mode bits), for example, aggregation, interpolation etc. */
53829+ PLUGIN_STAT,
53830+ /* this extension contains persistent inode flags. These flags are
53831+ single bits: immutable, append, only, etc. Layout is in
53832+ reiser4_flags_stat. */
53833+ FLAGS_STAT,
53834+ /* this extension contains capabilities sets, associated with this
53835+ file. Layout is in reiser4_capabilities_stat */
53836+ CAPABILITIES_STAT,
53837+ /* this extension contains size and public id of the secret key.
53838+ Layout is in reiser4_crypto_stat */
53839+ CRYPTO_STAT,
53840+ /* on-disk slots of non-default plugins for inheritance, which
53841+ are extracted to special plugin table (@reiser4_inode->hset).
53842+ By default, children of the object will inherit plugins from
53843+ its main plugin table (pset). */
53844+ HEIR_STAT,
53845+ LAST_SD_EXTENSION,
53846+ /*
53847+ * init_inode_static_sd() iterates over extension mask until all
53848+ * non-zero bits are processed. This means, that neither ->present(),
53849+ * nor ->absent() methods will be called for stat-data extensions that
53850+ * go after last present extension. But some basic extensions, we want
53851+ * either ->absent() or ->present() method to be called, because these
53852+ * extensions set up something in inode even when they are not
53853+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
53854+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
53855+ * ->present(), or ->absent() method will be called, independently of
53856+ * what other extensions are present.
53857+ */
53858+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
53859+} sd_ext_bits;
53860+
53861+/* minimal stat-data. This allows to support light-weight files. */
53862+typedef struct reiser4_stat_data_base {
53863+ /* 0 */ __le16 extmask;
53864+ /* 2 */
53865+} PACKED reiser4_stat_data_base;
53866+
53867+typedef struct reiser4_light_weight_stat {
53868+ /* 0 */ __le16 mode;
53869+ /* 2 */ __le32 nlink;
53870+ /* 6 */ __le64 size;
53871+ /* size in bytes */
53872+ /* 14 */
53873+} PACKED reiser4_light_weight_stat;
53874+
53875+typedef struct reiser4_unix_stat {
53876+ /* owner id */
53877+ /* 0 */ __le32 uid;
53878+ /* group id */
53879+ /* 4 */ __le32 gid;
53880+ /* access time */
53881+ /* 8 */ __le32 atime;
53882+ /* modification time */
53883+ /* 12 */ __le32 mtime;
53884+ /* change time */
53885+ /* 16 */ __le32 ctime;
53886+ union {
53887+ /* minor:major for device files */
53888+ /* 20 */ __le64 rdev;
53889+ /* bytes used by file */
53890+ /* 20 */ __le64 bytes;
53891+ } u;
53892+ /* 28 */
53893+} PACKED reiser4_unix_stat;
53894+
53895+/* symlink stored as part of inode */
53896+typedef struct reiser4_symlink_stat {
53897+ char body[0];
53898+} PACKED reiser4_symlink_stat;
53899+
53900+typedef struct reiser4_plugin_slot {
53901+ /* 0 */ __le16 pset_memb;
53902+ /* 2 */ __le16 id;
53903+ /* 4 *//* here plugin stores its persistent state */
53904+} PACKED reiser4_plugin_slot;
53905+
53906+/* stat-data extension for files with non-standard plugin. */
53907+typedef struct reiser4_plugin_stat {
53908+ /* number of additional plugins, associated with this object */
53909+ /* 0 */ __le16 plugins_no;
53910+ /* 2 */ reiser4_plugin_slot slot[0];
53911+ /* 2 */
53912+} PACKED reiser4_plugin_stat;
53913+
53914+/* stat-data extension for inode flags. Currently it is just fixed-width 32
53915+ * bit mask. If need arise, this can be replaced with variable width
53916+ * bitmask. */
53917+typedef struct reiser4_flags_stat {
53918+ /* 0 */ __le32 flags;
53919+ /* 4 */
53920+} PACKED reiser4_flags_stat;
53921+
53922+typedef struct reiser4_capabilities_stat {
53923+ /* 0 */ __le32 effective;
53924+ /* 4 */ __le32 permitted;
53925+ /* 8 */
53926+} PACKED reiser4_capabilities_stat;
53927+
53928+typedef struct reiser4_cluster_stat {
53929+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
53930+ /* 0 */ d8 cluster_shift;
53931+ /* 1 */
53932+} PACKED reiser4_cluster_stat;
53933+
53934+typedef struct reiser4_crypto_stat {
53935+ /* secret key size, bits */
53936+ /* 0 */ d16 keysize;
53937+ /* secret key id */
53938+ /* 2 */ d8 keyid[0];
53939+ /* 2 */
53940+} PACKED reiser4_crypto_stat;
53941+
53942+typedef struct reiser4_large_times_stat {
53943+ /* access time */
53944+ /* 0 */ d32 atime;
53945+ /* modification time */
53946+ /* 4 */ d32 mtime;
53947+ /* change time */
53948+ /* 8 */ d32 ctime;
53949+ /* 12 */
53950+} PACKED reiser4_large_times_stat;
53951+
53952+/* this structure is filled by sd_item_stat */
53953+typedef struct sd_stat {
53954+ int dirs;
53955+ int files;
53956+ int others;
53957+} sd_stat;
53958+
53959+/* plugin->item.common.* */
53960+extern void print_sd(const char *prefix, coord_t * coord);
53961+extern void item_stat_static_sd(const coord_t * coord, void *vp);
53962+
53963+/* plugin->item.s.sd.* */
53964+extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
53965+extern int save_len_static_sd(struct inode *inode);
53966+extern int save_static_sd(struct inode *inode, char **area);
53967+
53968+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
53969+#endif
53970+
53971+/* Make Linus happy.
53972+ Local variables:
53973+ c-indentation-style: "K&R"
53974+ mode-name: "LC"
53975+ c-basic-offset: 8
53976+ tab-width: 8
53977+ fill-column: 120
53978+ End:
53979+*/
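
Editorial note: every on-disk struct above is PACKED and is read and written
through get_unaligned()/put_unaligned() plus cpu_to_le*/le*_to_cpu, because
extensions are byte-packed at arbitrary offsets inside the stat-data item.
A portable sketch of what such an accessor must do, in plain C rather than
the kernel helpers (toy_* names are hypothetical):

#include <stdint.h>

/* read a little-endian 32-bit field at an arbitrary, possibly unaligned
   address; byte-wise assembly avoids alignment faults and is independent
   of host byte order */
static uint32_t toy_le32_get(const unsigned char *p)
{
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static void toy_le32_put(unsigned char *p, uint32_t v)
{
        p[0] = (unsigned char)v;
        p[1] = (unsigned char)(v >> 8);
        p[2] = (unsigned char)(v >> 16);
        p[3] = (unsigned char)(v >> 24);
}

Note that PACKED only removes padding; it does not make misaligned loads safe
on strict-alignment architectures, which is why the accessors, not the struct
attribute, carry the portability burden.
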
53980diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/tail.c linux-2.6.22/fs/reiser4/plugin/item/tail.c
53981--- linux-2.6.22.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 03:00:00.000000000 +0300
53982+++ linux-2.6.22/fs/reiser4/plugin/item/tail.c 2007-07-29 00:25:34.980723395 +0400
53983@@ -0,0 +1,809 @@
53984+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53985+
53986+#include "item.h"
53987+#include "../../inode.h"
53988+#include "../../page_cache.h"
53989+#include "../../carry.h"
53990+#include "../../vfs_ops.h"
53991+
53992+#include <linux/quotaops.h>
53993+#include <asm/uaccess.h>
53994+#include <linux/swap.h>
53995+#include <linux/writeback.h>
53996+
53997+/* plugin->u.item.b.max_key_inside */
53998+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
53999+{
54000+ item_key_by_coord(coord, key);
54001+ set_key_offset(key, get_key_offset(reiser4_max_key()));
54002+ return key;
54003+}
54004+
54005+/* plugin->u.item.b.can_contain_key */
54006+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54007+ const reiser4_item_data *data)
54008+{
54009+ reiser4_key item_key;
54010+
54011+ if (item_plugin_by_coord(coord) != data->iplug)
54012+ return 0;
54013+
54014+ item_key_by_coord(coord, &item_key);
54015+ if (get_key_locality(key) != get_key_locality(&item_key) ||
54016+ get_key_objectid(key) != get_key_objectid(&item_key))
54017+ return 0;
54018+
54019+ return 1;
54020+}
54021+
54022+/* plugin->u.item.b.mergeable
54023+ first item is of tail type */
54024+/* Audited by: green(2002.06.14) */
54025+int mergeable_tail(const coord_t *p1, const coord_t *p2)
54026+{
54027+ reiser4_key key1, key2;
54028+
54029+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
54030+ UNIX_FILE_METADATA_ITEM_TYPE));
54031+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54032+
54033+ if (item_id_by_coord(p2) != FORMATTING_ID) {
54034+ /* second item is of another type */
54035+ return 0;
54036+ }
54037+
54038+ item_key_by_coord(p1, &key1);
54039+ item_key_by_coord(p2, &key2);
54040+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
54041+ get_key_objectid(&key1) != get_key_objectid(&key2)
54042+ || get_key_type(&key1) != get_key_type(&key2)) {
54043+ /* items of different objects */
54044+ return 0;
54045+ }
54046+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54047+ /* not adjacent items */
54048+ return 0;
54049+ }
54050+ return 1;
54051+}
54052+
54053+/* plugin->u.item.b.print
54054+ plugin->u.item.b.check */
54055+
54056+/* plugin->u.item.b.nr_units */
54057+pos_in_node_t nr_units_tail(const coord_t * coord)
54058+{
54059+ return item_length_by_coord(coord);
54060+}
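+/* note: a unit of a tail item is a single byte of file body, so the unit
+ * count equals the item length and unit positions double as byte offsets;
+ * lookup_tail() and paste_tail() below rely on this */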
54061+
54062+/* plugin->u.item.b.lookup */
54063+lookup_result
54064+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54065+{
54066+ reiser4_key item_key;
54067+ __u64 lookuped, offset;
54068+ unsigned nr_units;
54069+
54070+ item_key_by_coord(coord, &item_key);
54071+ offset = get_key_offset(item_key_by_coord(coord, &item_key));
54072+ nr_units = nr_units_tail(coord);
54073+
54074+ /* key we are looking for must be greater than key of item @coord */
54075+ assert("vs-416", keygt(key, &item_key));
54076+
54077+ /* offset we are looking for */
54078+ lookuped = get_key_offset(key);
54079+
54080+ if (lookuped >= offset && lookuped < offset + nr_units) {
54081+ /* byte we are looking for is in this item */
54082+ coord->unit_pos = lookuped - offset;
54083+ coord->between = AT_UNIT;
54084+ return CBK_COORD_FOUND;
54085+ }
54086+
54087+ /* set coord after last unit */
54088+ coord->unit_pos = nr_units - 1;
54089+ coord->between = AFTER_UNIT;
54090+ return bias ==
54091+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54092+}
54093+
54094+/* plugin->u.item.b.paste */
54095+int
54096+paste_tail(coord_t *coord, reiser4_item_data *data,
54097+ carry_plugin_info *info UNUSED_ARG)
54098+{
54099+ unsigned old_item_length;
54100+ char *item;
54101+
54102+ /* length the item had before resizing has been performed */
54103+ old_item_length = item_length_by_coord(coord) - data->length;
54104+
54105+ /* tail items never get pasted in the middle */
54106+ assert("vs-363",
54107+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54108+ (coord->unit_pos == old_item_length - 1 &&
54109+ coord->between == AFTER_UNIT) ||
54110+ (coord->unit_pos == 0 && old_item_length == 0
54111+ && coord->between == AT_UNIT));
54112+
54113+ item = item_body_by_coord(coord);
54114+ if (coord->unit_pos == 0)
54115+ /* make space for pasted data when pasting at the beginning of
54116+ the item */
54117+ memmove(item + data->length, item, old_item_length);
54118+
54119+ if (coord->between == AFTER_UNIT)
54120+ coord->unit_pos++;
54121+
54122+ if (data->data) {
54123+ assert("vs-554", data->user == 0 || data->user == 1);
54124+ if (data->user) {
54125+ assert("nikita-3035", reiser4_schedulable());
54126+ /* copy from user space */
54127+ if (__copy_from_user(item + coord->unit_pos,
54128+ (const char __user *)data->data,
54129+ (unsigned)data->length))
54130+ return RETERR(-EFAULT);
54131+ } else
54132+ /* copy from kernel space */
54133+ memcpy(item + coord->unit_pos, data->data,
54134+ (unsigned)data->length);
54135+ } else {
54136+ memset(item + coord->unit_pos, 0, (unsigned)data->length);
54137+ }
54138+ return 0;
54139+}
54140+
54141+/* plugin->u.item.b.fast_paste */
54142+
54143+/* plugin->u.item.b.can_shift
54144+ number of units is returned via return value, number of bytes via @size. For
54145+ tail items they coincide */
54146+int
54147+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54148+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54149+ unsigned *size, unsigned want)
54150+{
54151+ /* make sure that we do not want to shift more than we have */
54152+ assert("vs-364", want > 0
54153+ && want <= (unsigned)item_length_by_coord(source));
54154+
54155+ *size = min(want, free_space);
54156+ return *size;
54157+}
54158+
54159+/* plugin->u.item.b.copy_units */
54160+void
54161+copy_units_tail(coord_t * target, coord_t * source,
54162+ unsigned from, unsigned count,
54163+ shift_direction where_is_free_space,
54164+ unsigned free_space UNUSED_ARG)
54165+{
54166+ /* make sure that item @target is expanded already */
54167+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54168+ assert("vs-370", free_space >= count);
54169+
54170+ if (where_is_free_space == SHIFT_LEFT) {
54171+ /* append item @target with @count first bytes of @source */
54172+ assert("vs-365", from == 0);
54173+
54174+ memcpy((char *)item_body_by_coord(target) +
54175+ item_length_by_coord(target) - count,
54176+ (char *)item_body_by_coord(source), count);
54177+ } else {
54178+ /* target item is moved to right already */
54179+ reiser4_key key;
54180+
54181+ assert("vs-367",
54182+ (unsigned)item_length_by_coord(source) == from + count);
54183+
54184+ memcpy((char *)item_body_by_coord(target),
54185+ (char *)item_body_by_coord(source) + from, count);
54186+
54187+ /* new units are inserted before first unit in an item,
54188+ therefore, we have to update item key */
54189+ item_key_by_coord(source, &key);
54190+ set_key_offset(&key, get_key_offset(&key) + from);
54191+
54192+ node_plugin_by_node(target->node)->update_item_key(target, &key,
54193+ NULL /*info */);
54194+ }
54195+}
54196+
54197+/* plugin->u.item.b.create_hook */
54198+
54199+/* item_plugin->b.kill_hook
54200+ this is called when @count units starting from @from-th one are going to be removed
54201+ */
54202+int
54203+kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54204+ pos_in_node_t count, struct carry_kill_data *kdata)
54205+{
54206+ reiser4_key key;
54207+ loff_t start, end;
54208+
54209+ assert("vs-1577", kdata);
54210+ assert("vs-1579", kdata->inode);
54211+
54212+ item_key_by_coord(coord, &key);
54213+ start = get_key_offset(&key) + from;
54214+ end = start + count;
54215+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54216+ return 0;
54217+}
54218+
54219+/* plugin->u.item.b.shift_hook */
54220+
54221+/* helper for kill_units_tail and cut_units_tail */
54222+static int
54223+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54224+ reiser4_key * smallest_removed, reiser4_key * new_first)
54225+{
54226+ pos_in_node_t count;
54227+
54228+ /* this method is only called to remove part of item */
54229+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54230+ /* tail items are never cut from the middle of an item */
54231+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
54232+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
54233+
54234+ count = to - from + 1;
54235+
54236+ if (smallest_removed) {
54237+ /* store smallest key removed */
54238+ item_key_by_coord(coord, smallest_removed);
54239+ set_key_offset(smallest_removed,
54240+ get_key_offset(smallest_removed) + from);
54241+ }
54242+ if (new_first) {
54243+ /* head of item is cut */
54244+ assert("vs-1529", from == 0);
54245+
54246+ item_key_by_coord(coord, new_first);
54247+ set_key_offset(new_first,
54248+ get_key_offset(new_first) + from + count);
54249+ }
54250+
54251+ if (REISER4_DEBUG)
54252+ memset((char *)item_body_by_coord(coord) + from, 0, count);
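+ /* the debug-only memset poisons the region being removed so that
+ * stale uses are easier to spot; the bytes themselves are cut out
+ * of the node by the caller */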
54253+ return count;
54254+}
54255+
54256+/* plugin->u.item.b.cut_units */
54257+int
54258+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54259+ struct carry_cut_data *cdata UNUSED_ARG,
54260+ reiser4_key * smallest_removed, reiser4_key * new_first)
54261+{
54262+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54263+}
54264+
54265+/* plugin->u.item.b.kill_units */
54266+int
54267+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54268+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
54269+ reiser4_key * new_first)
54270+{
54271+ kill_hook_tail(coord, from, to - from + 1, kdata);
54272+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54273+}
54274+
54275+/* plugin->u.item.b.unit_key */
54276+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
54277+{
54278+ assert("vs-375", coord_is_existing_unit(coord));
54279+
54280+ item_key_by_coord(coord, key);
54281+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
54282+
54283+ return key;
54284+}
54285+
54286+/* plugin->u.item.b.estimate
54287+ plugin->u.item.b.item_data_by_flow */
54288+
54289+/* tail readpage function. It is called from readpage_tail(). */
54290+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
54291+{
54292+ tap_t tap;
54293+ int result;
54294+ coord_t coord;
54295+ lock_handle lh;
54296+ int count, mapped;
54297+ struct inode *inode;
54298+ char *pagedata;
54299+
54300+ /* save the passed coord so that the tap does not move it. */
54301+ init_lh(&lh);
54302+ copy_lh(&lh, uf_coord->lh);
54303+ inode = page->mapping->host;
54304+ coord_dup(&coord, &uf_coord->coord);
54305+
54306+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
54307+
54308+ if ((result = reiser4_tap_load(&tap)))
54309+ goto out_tap_done;
54310+
54311+ /* lookup until page is filled up. */
54312+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
54313+ /* number of bytes to be copied to page */
54314+ count = item_length_by_coord(&coord) - coord.unit_pos;
54315+ if (count > PAGE_CACHE_SIZE - mapped)
54316+ count = PAGE_CACHE_SIZE - mapped;
54317+
54318+ /* map @page into kernel address space and get data address */
54319+ pagedata = kmap_atomic(page, KM_USER0);
54320+
54321+ /* copy tail item to page */
54322+ memcpy(pagedata + mapped,
54323+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
54324+ count);
54325+ mapped += count;
54326+
54327+ flush_dcache_page(page);
54328+
54329+ /* unmap page from kernel address space */
54330+ kunmap_atomic(pagedata, KM_USER0);
54331+
54332+ /* Getting next tail item. */
54333+ if (mapped < PAGE_CACHE_SIZE) {
54334+ /*
54335+ * unlock page in order to avoid keeping it locked
54336+ * during tree lookup, which takes long term locks
54337+ */
54338+ unlock_page(page);
54339+
54340+ /* getting right neighbour. */
54341+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
54342+
54343+ /* lock page back */
54344+ lock_page(page);
54345+ if (PageUptodate(page)) {
54346+ /*
54347+ * another thread read the page, we have
54348+ * nothing to do
54349+ */
54350+ result = 0;
54351+ goto out_unlock_page;
54352+ }
54353+
54354+ if (result) {
54355+ if (result == -E_NO_NEIGHBOR) {
54356+ /*
54357+ * right neighbor is not a formatted
54358+ * node
54359+ */
54360+ result = 0;
54361+ goto done;
54362+ } else {
54363+ goto out_tap_relse;
54364+ }
54365+ } else {
54366+ if (!inode_file_plugin(inode)->
54367+ owns_item(inode, &coord)) {
54368+ /* item of another file is found */
54369+ result = 0;
54370+ goto done;
54371+ }
54372+ }
54373+ }
54374+ }
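+ /*
+ * we get here either with the page completely filled, or early
+ * (via "done") when the file has no more tail items: no right
+ * neighbor, or the next item belongs to another file; in the
+ * latter cases the rest of the page is zeroed below
+ */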
54375+
54376+ done:
54377+ if (mapped != PAGE_CACHE_SIZE)
54378+ zero_user_page(page, mapped, PAGE_CACHE_SIZE - mapped,
54379+ KM_USER0);
54380+ SetPageUptodate(page);
54381+ out_unlock_page:
54382+ unlock_page(page);
54383+ out_tap_relse:
54384+ reiser4_tap_relse(&tap);
54385+ out_tap_done:
54386+ reiser4_tap_done(&tap);
54387+ return result;
54388+}
54389+
54390+/*
54391+ plugin->s.file.readpage
54392+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
54393+ or
54394+ filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
54395+
54396+ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
54397+ item. */
54398+int readpage_tail(void *vp, struct page *page)
54399+{
54400+ uf_coord_t *uf_coord = vp;
54401+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
54402+ ON_DEBUG(reiser4_key key);
54403+
54404+ assert("umka-2515", PageLocked(page));
54405+ assert("umka-2516", !PageUptodate(page));
54406+ assert("umka-2517", !jprivate(page) && !PagePrivate(page));
54407+ assert("umka-2518", page->mapping && page->mapping->host);
54408+
54409+ assert("umka-2519", znode_is_loaded(coord->node));
54410+ assert("umka-2520", item_is_tail(coord));
54411+ assert("umka-2521", coord_is_existing_unit(coord));
54412+ assert("umka-2522", znode_is_rlocked(coord->node));
54413+ assert("umka-2523",
54414+ page->mapping->host->i_ino ==
54415+ get_key_objectid(item_key_by_coord(coord, &key)));
54416+
54417+ return do_readpage_tail(uf_coord, page);
54418+}
54419+
54420+/**
54421+ * overwrite_tail
54422+ * @flow:
54423+ * @coord:
54424+ *
54425+ * Overwrites tail item or its part by user data. Returns number of bytes
54426+ * written or error code.
54427+ */
54428+static int overwrite_tail(flow_t *flow, coord_t *coord)
54429+{
54430+ unsigned count;
54431+
54432+ assert("vs-570", flow->user == 1);
54433+ assert("vs-946", flow->data);
54434+ assert("vs-947", coord_is_existing_unit(coord));
54435+ assert("vs-948", znode_is_write_locked(coord->node));
54436+ assert("nikita-3036", reiser4_schedulable());
54437+
54438+ count = item_length_by_coord(coord) - coord->unit_pos;
54439+ if (count > flow->length)
54440+ count = flow->length;
54441+
54442+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
54443+ (const char __user *)flow->data, count))
54444+ return RETERR(-EFAULT);
54445+
54446+ znode_make_dirty(coord->node);
54447+ return count;
54448+}
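+/* note that overwrite_tail() touches at most one item, so it can return
+ * less than flow->length; presumably the caller comes back for the rest */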
54449+
54450+/**
54451+ * insert_first_tail
54452+ * @inode:
54453+ * @flow:
54454+ * @coord:
54455+ * @lh:
54456+ *
54457+ * Returns number of bytes written or error code.
54458+ */
54459+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
54460+ coord_t *coord, lock_handle *lh)
54461+{
54462+ int result;
54463+ loff_t to_write;
54464+ struct unix_file_info *uf_info;
54465+
54466+ if (get_key_offset(&flow->key) != 0) {
54467+ /*
54468+ * file is empty and the write does not start at the beginning
54469+ * of the file. Create a hole at the beginning. On success
54470+ * insert_flow returns 0 as the number of written bytes, which
54471+ * is what we have to return when padding a file with holes
54472+ */
54473+ flow->data = NULL;
54474+ flow->length = get_key_offset(&flow->key);
54475+ set_key_offset(&flow->key, 0);
54476+ /*
54477+ * holes in files built of tails are stored just as if there
54478+ * were real data consisting of all zeros. Therefore we have to
54479+ * allocate quota here as well
54480+ */
54481+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54482+ return RETERR(-EDQUOT);
54483+ result = reiser4_insert_flow(coord, lh, flow);
54484+ if (flow->length)
54485+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54486+
54487+ uf_info = unix_file_inode_data(inode);
54488+
54489+ /*
54490+ * first item insertion is only possible when writing to empty
54491+ * file or performing tail conversion
54492+ */
54493+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
54494+ (reiser4_inode_get_flag(inode,
54495+ REISER4_PART_MIXED) &&
54496+ reiser4_inode_get_flag(inode,
54497+ REISER4_PART_IN_CONV))));
54498+ /* if file was empty - update its state */
54499+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
54500+ uf_info->container = UF_CONTAINER_TAILS;
54501+ return result;
54502+ }
54503+
54504+ /* check quota before appending data */
54505+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54506+ return RETERR(-EDQUOT);
54507+
54508+ to_write = flow->length;
54509+ result = reiser4_insert_flow(coord, lh, flow);
54510+ if (flow->length)
54511+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
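+ /* if any bytes were consumed, report them; otherwise propagate the
+ * status of reiser4_insert_flow() (0 or an error) */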
54512+ return (to_write - flow->length) ? (to_write - flow->length) : result;
54513+}
54514+
54515+/**
54516+ * append_tail
54517+ * @inode:
54518+ * @flow:
54519+ * @coord:
54520+ * @lh:
54521+ *
54522+ * Returns number of bytes written or error code.
54523+ */
54524+static ssize_t append_tail(struct inode *inode,
54525+ flow_t *flow, coord_t *coord, lock_handle *lh)
54526+{
54527+ int result;
54528+ reiser4_key append_key;
54529+ loff_t to_write;
54530+
54531+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
54532+ flow->data = NULL;
54533+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
54534+ set_key_offset(&flow->key, get_key_offset(&append_key));
54535+ /*
54536+ * holes in files built of tails are stored just as if there
54537+ * were real data consisting of all zeros. Therefore we have to
54538+ * allocate quota here as well
54539+ */
54540+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54541+ return RETERR(-EDQUOT);
54542+ result = reiser4_insert_flow(coord, lh, flow);
54543+ if (flow->length)
54544+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54545+ return result;
54546+ }
54547+
54548+ /* check quota before appending data */
54549+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
54550+ return RETERR(-EDQUOT);
54551+
54552+ to_write = flow->length;
54553+ result = reiser4_insert_flow(coord, lh, flow);
54554+ if (flow->length)
54555+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
54556+ return (to_write - flow->length) ? (to_write - flow->length) : result;
54557+}
54558+
54559+/**
54560+ * write_extent_reserve_space - reserve space for tail write operation
54561+ * @inode:
54562+ *
54563+ * Estimates and reserves space which may be required for writing one flow to a
54564+ * file
54565+ */
54566+static int write_extent_reserve_space(struct inode *inode)
54567+{
54568+ __u64 count;
54569+ reiser4_tree *tree;
54570+
54571+ /*
54572+ * to write one flow to a file by tails we have to reserve disk space for:
54573+
54574+ * 1. find_file_item may have to insert an empty node into the tree (an
54575+ * empty leaf node between two extent items). This requires 1 block plus
54576+ * the number of blocks necessary to insert an internal item into the
54577+ * twig level.
54578+ *
54579+ * 2. flow insertion
54580+ *
54581+ * 3. stat data update
54582+ */
54583+ tree = reiser4_tree_by_inode(inode);
54584+ count = estimate_one_insert_item(tree) +
54585+ estimate_insert_flow(tree->height) +
54586+ estimate_one_insert_item(tree);
54587+ grab_space_enable();
54588+ return reiser4_grab_space(count, 0 /* flags */);
54589+}
54590+
54591+#define PAGE_PER_FLOW 4
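+/* PAGE_PER_FLOW bounds how much of the user buffer is faulted in per call:
+ * faultin_user_pages() below clamps the flow to at most 4 pages */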
54592+
54593+static loff_t faultin_user_pages(const char __user *buf, size_t count)
54594+{
54595+ loff_t faulted;
54596+ int to_fault;
54597+
54598+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
54599+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
54600+ faulted = 0;
54601+ while (count > 0) {
54602+ to_fault = PAGE_CACHE_SIZE;
54603+ if (count < to_fault)
54604+ to_fault = count;
54605+ fault_in_pages_readable(buf + faulted, to_fault);
54606+ count -= to_fault;
54607+ faulted += to_fault;
54608+ }
54609+ return faulted;
54610+}
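+/* pre-faulting the user pages above means that the later __copy_from_user(),
+ * done while long term znode locks are held, should not take a major page
+ * fault; compare the matching note in reiser4_read_tail() */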
54611+
54612+/**
54613+ * reiser4_write_tail - write method of tail item plugin
54614+ * @file: file to write to
54615+ * @buf: address of user-space buffer
54616+ * @count: number of bytes to write
54617+ * @pos: position in file to write to
54618+ *
54619+ * Returns number of written bytes or error code.
54620+ */
54621+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
54622+ size_t count, loff_t *pos)
54623+{
54624+ struct inode *inode;
54625+ struct hint hint;
54626+ int result;
54627+ flow_t flow;
54628+ coord_t *coord;
54629+ lock_handle *lh;
54630+ znode *loaded;
54631+
54632+ inode = file->f_dentry->d_inode;
54633+
54634+ if (write_extent_reserve_space(inode))
54635+ return RETERR(-ENOSPC);
54636+
54637+ result = load_file_hint(file, &hint);
54638+ BUG_ON(result != 0);
54639+
54640+ flow.length = faultin_user_pages(buf, count);
54641+ flow.user = 1;
54642+ memcpy(&flow.data, &buf, sizeof(buf));
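+ /* flow.data is filled with a byte copy of the pointer, apparently to
+ * sidestep a cast from the __user-annotated @buf; the data is only
+ * dereferenced later via __copy_from_user() */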
54643+ flow.op = WRITE_OP;
54644+ key_by_inode_and_offset_common(inode, *pos, &flow.key);
54645+
54646+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
54647+ if (IS_CBKERR(result))
54648+ return result;
54649+
54650+ coord = &hint.ext_coord.coord;
54651+ lh = hint.ext_coord.lh;
54652+
54653+ result = zload(coord->node);
54654+ BUG_ON(result != 0);
54655+ loaded = coord->node;
54656+
54657+ if (coord->between == AFTER_UNIT) {
54658+ /* append with data or hole */
54659+ result = append_tail(inode, &flow, coord, lh);
54660+ } else if (coord->between == AT_UNIT) {
54661+ /* overwrite */
54662+ result = overwrite_tail(&flow, coord);
54663+ } else {
54664+ /* no items of this file yet. insert data or hole */
54665+ result = insert_first_tail(inode, &flow, coord, lh);
54666+ }
54667+ zrelse(loaded);
54668+ if (result < 0) {
54669+ done_lh(lh);
54670+ return result;
54671+ }
54672+
54673+ /* seal and unlock znode */
54674+ hint.ext_coord.valid = 0;
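+ /* valid was just cleared unconditionally, so the branch below always
+ * takes reiser4_unset_hint() */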
54675+ if (hint.ext_coord.valid)
54676+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
54677+ else
54678+ reiser4_unset_hint(&hint);
54679+
54680+ save_file_hint(file, &hint);
54681+ return result;
54682+}
54683+
54684+#if REISER4_DEBUG
54685+
54686+static int
54687+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
54688+{
54689+ reiser4_key item_key;
54690+
54691+ assert("vs-1356", coord_is_existing_unit(coord));
54692+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
54693+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
54694+ return get_key_offset(key) ==
54695+ get_key_offset(&item_key) + coord->unit_pos;
54696+
54697+}
54698+
54699+#endif
54700+
54701+/* plugin->u.item.s.file.read */
54702+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
54703+{
54704+ unsigned count;
54705+ int item_length;
54706+ coord_t *coord;
54707+ uf_coord_t *uf_coord;
54708+
54709+ uf_coord = &hint->ext_coord;
54710+ coord = &uf_coord->coord;
54711+
54712+ assert("vs-571", f->user == 1);
54713+ assert("vs-571", f->data);
54714+ assert("vs-967", coord && coord->node);
54715+ assert("vs-1117", znode_is_rlocked(coord->node));
54716+ assert("vs-1118", znode_is_loaded(coord->node));
54717+
54718+ assert("nikita-3037", reiser4_schedulable());
54719+ assert("vs-1357", coord_matches_key_tail(coord, &f->key));
54720+
54721+ /* calculate number of bytes to read off the item */
54722+ item_length = item_length_by_coord(coord);
54723+ count = item_length_by_coord(coord) - coord->unit_pos;
54724+ if (count > f->length)
54725+ count = f->length;
54726+
54727+ /* user page has to be brought in so that a major page fault does not
54728+ * occur here while a long term lock is held */
54729+ if (__copy_to_user((char __user *)f->data,
54730+ ((char *)item_body_by_coord(coord) + coord->unit_pos),
54731+ count))
54732+ return RETERR(-EFAULT);
54733+
54734+ /* probably mark_page_accessed() should only be called if
54735+ * coord->unit_pos is zero. */
54736+ mark_page_accessed(znode_page(coord->node));
54737+ move_flow_forward(f, count);
54738+
54739+ coord->unit_pos += count;
54740+ if (item_length == coord->unit_pos) {
54741+ coord->unit_pos--;
54742+ coord->between = AFTER_UNIT;
54743+ }
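+ /* when the item was consumed to its end, step back to the last unit
+ * and mark the coord AFTER_UNIT so it remains valid for the hint */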
54744+ reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
54745+ return 0;
54746+}
54747+
54748+/*
54749+ plugin->u.item.s.file.append_key
54750+ key of the first byte after the last byte addressed by this item
54751+*/
54752+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
54753+{
54754+ item_key_by_coord(coord, key);
54755+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
54756+ return key;
54757+}
54758+
54759+/* plugin->u.item.s.file.init_coord_extension */
54760+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
54761+{
54762+ uf_coord->valid = 1;
54763+}
54764+
54765+/*
54766+ plugin->u.item.s.file.get_block
54767+*/
54768+int
54769+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
54770+{
54771+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
54772+
54773+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
54774+ /* if node hasn't obtained its block number yet, return 0.
54775+ * Let's avoid upsetting users with cosmic numbers beyond
54776+ * the device capacity. */
54777+ *block = 0;
54778+ else
54779+ *block = *znode_get_block(coord->node);
54780+ return 0;
54781+}
54782+
54783+/*
54784+ * Local variables:
54785+ * c-indentation-style: "K&R"
54786+ * mode-name: "LC"
54787+ * c-basic-offset: 8
54788+ * tab-width: 8
54789+ * fill-column: 79
54790+ * scroll-step: 1
54791+ * End:
54792+ */
54793diff -urN linux-2.6.22.orig/fs/reiser4/plugin/item/tail.h linux-2.6.22/fs/reiser4/plugin/item/tail.h
54794--- linux-2.6.22.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 03:00:00.000000000 +0300
54795+++ linux-2.6.22/fs/reiser4/plugin/item/tail.h 2007-07-29 00:25:34.980723395 +0400
54796@@ -0,0 +1,58 @@
54797+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54798+
54799+#if !defined( __REISER4_TAIL_H__ )
54800+#define __REISER4_TAIL_H__
54801+
54802+struct tail_coord_extension {
54803+ int not_used;
54804+};
54805+
54806+struct cut_list;
54807+
54808+/* plugin->u.item.b.* */
54809+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
54810+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
54811+ const reiser4_item_data *);
54812+int mergeable_tail(const coord_t * p1, const coord_t * p2);
54813+pos_in_node_t nr_units_tail(const coord_t *);
54814+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
54815+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
54816+int can_shift_tail(unsigned free_space, coord_t * source,
54817+ znode * target, shift_direction, unsigned *size,
54818+ unsigned want);
54819+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
54820+ unsigned count, shift_direction, unsigned free_space);
54821+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
54822+ struct carry_kill_data *);
54823+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
54824+ struct carry_cut_data *, reiser4_key * smallest_removed,
54825+ reiser4_key * new_first);
54826+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
54827+ struct carry_kill_data *, reiser4_key * smallest_removed,
54828+ reiser4_key * new_first);
54829+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
54830+
54831+/* plugin->u.item.s.* */
54832+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
54833+ size_t count, loff_t *pos);
54834+int reiser4_read_tail(struct file *, flow_t *, hint_t *);
54835+int readpage_tail(void *vp, struct page *page);
54836+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
54837+void init_coord_extension_tail(uf_coord_t *, loff_t offset);
54838+int get_block_address_tail(const coord_t *, sector_t, sector_t *);
54839+int item_balance_dirty_pages(struct address_space *, const flow_t *,
54840+ hint_t *, int back_to_dirty, int set_hint);
54841+
54842+/* __REISER4_TAIL_H__ */
54843+#endif
54844+
54845+/* Make Linus happy.
54846+ Local variables:
54847+ c-indentation-style: "K&R"
54848+ mode-name: "LC"
54849+ c-basic-offset: 8
54850+ tab-width: 8
54851+ fill-column: 120
54852+ scroll-step: 1
54853+ End:
54854+*/
54855diff -urN linux-2.6.22.orig/fs/reiser4/plugin/Makefile linux-2.6.22/fs/reiser4/plugin/Makefile
54856--- linux-2.6.22.orig/fs/reiser4/plugin/Makefile 1970-01-01 03:00:00.000000000 +0300
54857+++ linux-2.6.22/fs/reiser4/plugin/Makefile 2007-07-29 00:25:34.980723395 +0400
54858@@ -0,0 +1,26 @@
54859+obj-$(CONFIG_REISER4_FS) += plugins.o
54860+
54861+plugins-objs := \
54862+ plugin.o \
54863+ plugin_set.o \
54864+ object.o \
54865+ inode_ops.o \
54866+ inode_ops_rename.o \
54867+ file_ops.o \
54868+ file_ops_readdir.o \
54869+ file_plugin_common.o \
54870+ dir_plugin_common.o \
54871+ digest.o \
54872+ hash.o \
54873+ fibration.o \
54874+ tail_policy.o \
54875+ regular.o
54876+
54877+obj-$(CONFIG_REISER4_FS) += item/
54878+obj-$(CONFIG_REISER4_FS) += file/
54879+obj-$(CONFIG_REISER4_FS) += dir/
54880+obj-$(CONFIG_REISER4_FS) += node/
54881+obj-$(CONFIG_REISER4_FS) += compress/
54882+obj-$(CONFIG_REISER4_FS) += space/
54883+obj-$(CONFIG_REISER4_FS) += disk_format/
54884+obj-$(CONFIG_REISER4_FS) += security/
54885diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/Makefile linux-2.6.22/fs/reiser4/plugin/node/Makefile
54886--- linux-2.6.22.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 03:00:00.000000000 +0300
54887+++ linux-2.6.22/fs/reiser4/plugin/node/Makefile 2007-07-29 00:25:34.980723395 +0400
54888@@ -0,0 +1,5 @@
54889+obj-$(CONFIG_REISER4_FS) += node_plugins.o
54890+
54891+node_plugins-objs := \
54892+ node.o \
54893+ node40.o
54894diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node40.c linux-2.6.22/fs/reiser4/plugin/node/node40.c
54895--- linux-2.6.22.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 03:00:00.000000000 +0300
54896+++ linux-2.6.22/fs/reiser4/plugin/node/node40.c 2007-07-29 00:25:34.988725466 +0400
54897@@ -0,0 +1,2924 @@
54898+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54899+
54900+#include "../../debug.h"
54901+#include "../../key.h"
54902+#include "../../coord.h"
54903+#include "../plugin_header.h"
54904+#include "../item/item.h"
54905+#include "node.h"
54906+#include "node40.h"
54907+#include "../plugin.h"
54908+#include "../../jnode.h"
54909+#include "../../znode.h"
54910+#include "../../pool.h"
54911+#include "../../carry.h"
54912+#include "../../tap.h"
54913+#include "../../tree.h"
54914+#include "../../super.h"
54915+#include "../../reiser4.h"
54916+
54917+#include <asm/uaccess.h>
54918+#include <linux/types.h>
54919+#include <linux/prefetch.h>
54920+
54921+/* leaf 40 format:
54922+
54923+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
54924+ plugin_id (16) key
54925+ free_space (16) pluginid (16)
54926+ free_space_start (16) offset (16)
54927+ level (8)
54928+ num_items (16)
54929+ magic (32)
54930+ flush_time (32)
54931+*/
54932+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
54933+/* magic number that is stored in ->magic field of node header */
54934+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
54935+
54936+static int prepare_for_update(znode * left, znode * right,
54937+ carry_plugin_info * info);
54938+
54939+/* header of node of reiser40 format is at the beginning of node */
54940+static inline node40_header *node40_node_header(const znode * node /* node to
54941+ * query */ )
54942+{
54943+ assert("nikita-567", node != NULL);
54944+ assert("nikita-568", znode_page(node) != NULL);
54945+ assert("nikita-569", zdata(node) != NULL);
54946+ return (node40_header *) zdata(node);
54947+}
54948+
54949+/* functions to get/set fields of node40_header */
54950+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
54951+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
54952+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
54953+#define nh40_get_level(nh) get_unaligned(&(nh)->level)
54954+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
54955+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
54956+
54957+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
54958+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
54959+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
54960+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
54961+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
54962+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
54963+
54964+/* plugin field of node header should be read/set by
54965+ plugin_by_disk_id/save_disk_plugin */
54966+
54967+/* array of item headers is at the end of node */
54968+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
54969+{
54970+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
54971+}
54972+
54973+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
54974+ */
54975+static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
54976+{
54977+ return (item_header40 *) (zdata(coord->node) +
54978+ znode_size(coord->node)) - (coord->item_pos) -
54979+ 1;
54980+}
54981+
54982+/* functions to get/set fields of item_header40 */
54983+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
54984+
54985+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
54986+
54987+/* plugin field of item header should be read/set by
54988+ plugin_by_disk_id/save_disk_plugin */
54989+
54990+/* plugin methods */
54991+
54992+/* plugin->u.node.item_overhead
54993+ look for description of this method in plugin/node/node.h */
54994+size_t
54995+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
54996+{
54997+ return sizeof(item_header40);
54998+}
54999+
55000+/* plugin->u.node.free_space
55001+ look for description of this method in plugin/node/node.h */
55002+size_t free_space_node40(znode * node)
55003+{
55004+ assert("nikita-577", node != NULL);
55005+ assert("nikita-578", znode_is_loaded(node));
55006+ assert("nikita-579", zdata(node) != NULL);
55007+
55008+ return nh40_get_free_space(node40_node_header(node));
55009+}
55010+
55011+/* private inline version of node40_num_of_items() for use in this file. This
55012+ is necessary, because address of node40_num_of_items() is taken and it is
55013+ never inlined as a result. */
55014+static inline short node40_num_of_items_internal(const znode * node)
55015+{
55016+ return nh40_get_num_items(node40_node_header(node));
55017+}
55018+
55019+#if REISER4_DEBUG
55020+static inline void check_num_items(const znode * node)
55021+{
55022+ assert("nikita-2749",
55023+ node40_num_of_items_internal(node) == node->nr_items);
55024+ assert("nikita-2746", znode_is_write_locked(node));
55025+}
55026+#else
55027+#define check_num_items(node) noop
55028+#endif
55029+
55030+/* plugin->u.node.num_of_items
55031+ look for description of this method in plugin/node/node.h */
55032+int num_of_items_node40(const znode * node)
55033+{
55034+ return node40_num_of_items_internal(node);
55035+}
55036+
55037+static void
55038+node40_set_num_items(znode * node, node40_header * nh, unsigned value)
55039+{
55040+ assert("nikita-2751", node != NULL);
55041+ assert("nikita-2750", nh == node40_node_header(node));
55042+
55043+ check_num_items(node);
55044+ nh40_set_num_items(nh, value);
55045+ node->nr_items = value;
55046+ check_num_items(node);
55047+}
55048+
55049+/* plugin->u.node.item_by_coord
55050+ look for description of this method in plugin/node/node.h */
55051+char *item_by_coord_node40(const coord_t * coord)
55052+{
55053+ item_header40 *ih;
55054+ char *p;
55055+
55056+ /* @coord is set to existing item */
55057+ assert("nikita-596", coord != NULL);
55058+ assert("vs-255", coord_is_existing_item(coord));
55059+
55060+ ih = node40_ih_at_coord(coord);
55061+ p = zdata(coord->node) + ih40_get_offset(ih);
55062+ return p;
55063+}
55064+
55065+/* plugin->u.node.length_by_coord
55066+ look for description of this method in plugin/node/node.h */
55067+int length_by_coord_node40(const coord_t * coord)
55068+{
55069+ item_header40 *ih;
55070+ int result;
55071+
55072+ /* @coord is set to existing item */
55073+ assert("vs-256", coord != NULL);
55074+ assert("vs-257", coord_is_existing_item(coord));
55075+
55076+ ih = node40_ih_at_coord(coord);
55077+ if ((int)coord->item_pos ==
55078+ node40_num_of_items_internal(coord->node) - 1)
55079+ result =
55080+ nh40_get_free_space_start(node40_node_header(coord->node)) -
55081+ ih40_get_offset(ih);
55082+ else
55083+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55084+
55085+ return result;
55086+}
55087+
55088+static pos_in_node_t
55089+node40_item_length(const znode * node, pos_in_node_t item_pos)
55090+{
55091+ item_header40 *ih;
55092+ pos_in_node_t result;
55093+
55094+ /* @coord is set to existing item */
55095+ assert("vs-256", node != NULL);
55096+ assert("vs-257", node40_num_of_items_internal(node) > item_pos);
55097+
55098+ ih = node40_ih_at(node, item_pos);
55099+ if (item_pos == node40_num_of_items_internal(node) - 1)
55100+ result =
55101+ nh40_get_free_space_start(node40_node_header(node)) -
55102+ ih40_get_offset(ih);
55103+ else
55104+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
55105+
55106+ return result;
55107+}
55108+
55109+/* plugin->u.node.plugin_by_coord
55110+ look for description of this method in plugin/node/node.h */
55111+item_plugin *plugin_by_coord_node40(const coord_t * coord)
55112+{
55113+ item_header40 *ih;
55114+ item_plugin *result;
55115+
55116+ /* @coord is set to existing item */
55117+ assert("vs-258", coord != NULL);
55118+ assert("vs-259", coord_is_existing_item(coord));
55119+
55120+ ih = node40_ih_at_coord(coord);
55121+ /* pass NULL instead of the current tree. This is a time-critical call. */
55122+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
55123+ return result;
55124+}
55125+
55126+/* plugin->u.node.key_at
55127+ look for description of this method in plugin/node/node.h */
55128+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
55129+{
55130+ item_header40 *ih;
55131+
55132+ assert("nikita-1765", coord_is_existing_item(coord));
55133+
55134+ /* @coord is set to existing item */
55135+ ih = node40_ih_at_coord(coord);
55136+ memcpy(key, &ih->key, sizeof(reiser4_key));
55137+ return key;
55138+}
55139+
55140+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
55141+
55142+#define NODE_INCSTAT(n, counter) \
55143+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
55144+
55145+#define NODE_ADDSTAT(n, counter, val) \
55146+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
55147+
55148+/* plugin->u.node.lookup
55149+ look for description of this method in plugin/node/node.h */
55150+node_search_result lookup_node40(znode * node /* node to query */ ,
55151+ const reiser4_key * key /* key to look for */ ,
55152+ lookup_bias bias /* search bias */ ,
55153+ coord_t * coord /* resulting coord */ )
55154+{
55155+ int left;
55156+ int right;
55157+ int found;
55158+ int items;
55159+
55160+ item_header40 *lefth;
55161+ item_header40 *righth;
55162+
55163+ item_plugin *iplug;
55164+ item_header40 *bstop;
55165+ item_header40 *ih;
55166+ cmp_t order;
55167+
55168+ assert("nikita-583", node != NULL);
55169+ assert("nikita-584", key != NULL);
55170+ assert("nikita-585", coord != NULL);
55171+ assert("nikita-2693", znode_is_any_locked(node));
55172+ cassert(REISER4_SEQ_SEARCH_BREAK > 2);
55173+
55174+ items = node_num_items(node);
55175+
55176+ if (unlikely(items == 0)) {
55177+ coord_init_first_unit(coord, node);
55178+ return NS_NOT_FOUND;
55179+ }
55180+
55181+ /* binary search for item that can contain given key */
55182+ left = 0;
55183+ right = items - 1;
55184+ coord->node = node;
55185+ coord_clear_iplug(coord);
55186+ found = 0;
55187+
55188+ lefth = node40_ih_at(node, left);
55189+ righth = node40_ih_at(node, right);
55190+
55191+ /* It is known that for small arrays sequential search is on average
55192+ more efficient than binary. This is because sequential search is
55193+ coded as tight loop that can be better optimized by compilers and
55194+ for small array size gain from this optimization makes sequential
55195+ search the winner. Another, maybe more important, reason for this,
55196+ is that sequential array is more CPU cache friendly, whereas binary
55197+ search effectively destroys CPU caching.
55198+
55199+ Critical here is the notion of "smallness". Reasonable value of
55200+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
55201+ fs/reiser4/ulevel/ulevel.c:test_search().
55202+
55203+ Don't try to further optimize sequential search by scanning from
55204+ right to left in attempt to use more efficient loop termination
55205+ condition (comparison with 0). This doesn't work.
55206+
55207+ */
55208+
55209+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
55210+ int median;
55211+ item_header40 *medianh;
55212+
55213+ median = (left + right) / 2;
55214+ medianh = node40_ih_at(node, median);
55215+
55216+ assert("nikita-1084", median >= 0);
55217+ assert("nikita-1085", median < items);
55218+ switch (keycmp(key, &medianh->key)) {
55219+ case LESS_THAN:
55220+ right = median;
55221+ righth = medianh;
55222+ break;
55223+ default:
55224+ wrong_return_value("nikita-586", "keycmp");
55225+ case GREATER_THAN:
55226+ left = median;
55227+ lefth = medianh;
55228+ break;
55229+ case EQUAL_TO:
55230+ do {
55231+ --median;
55232+ /* headers are ordered from right to left */
55233+ ++medianh;
55234+ } while (median >= 0 && keyeq(key, &medianh->key));
55235+ right = left = median + 1;
55236+ ih = lefth = righth = medianh - 1;
55237+ found = 1;
55238+ break;
55239+ }
55240+ }
55241+ /* sequential scan. Item headers, and, therefore, keys are stored at
55242+ the rightmost part of a node from right to left. We are trying to
55243+ access memory from left to right, and hence, scan in _descending_
55244+ order of item numbers.
55245+ */
55246+ if (!found) {
55247+ for (left = right, ih = righth; left >= 0; ++ih, --left) {
55248+ cmp_t comparison;
55249+
55250+ prefetchkey(&(ih + 1)->key);
55251+ comparison = keycmp(&ih->key, key);
55252+ if (comparison == GREATER_THAN)
55253+ continue;
55254+ if (comparison == EQUAL_TO) {
55255+ found = 1;
55256+ do {
55257+ --left;
55258+ ++ih;
55259+ } while (left >= 0 && keyeq(&ih->key, key));
55260+ ++left;
55261+ --ih;
55262+ } else {
55263+ assert("nikita-1256", comparison == LESS_THAN);
55264+ }
55265+ break;
55266+ }
55267+ if (unlikely(left < 0))
55268+ left = 0;
55269+ }
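+ /*
+ * post-condition: @left indexes the rightmost item whose key does not
+ * exceed @key (clamped to 0), and @found is set iff that item's header
+ * key matches @key exactly
+ */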
55270+
55271+ assert("nikita-3212", right >= left);
55272+ assert("nikita-3214",
55273+ equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
55274+
55275+ coord_set_item_pos(coord, left);
55276+ coord->unit_pos = 0;
55277+ coord->between = AT_UNIT;
55278+
55279+ /* key < leftmost key in a node, or node is corrupted and keys
55280+ are not sorted */
55281+ bstop = node40_ih_at(node, (unsigned)left);
55282+ order = keycmp(&bstop->key, key);
55283+ if (unlikely(order == GREATER_THAN)) {
55284+ if (unlikely(left != 0)) {
55285+ /* screw up */
55286+ warning("nikita-587", "Key less than %i key in a node",
55287+ left);
55288+ reiser4_print_key("key", key);
55289+ reiser4_print_key("min", &bstop->key);
55290+ print_coord_content("coord", coord);
55291+ return RETERR(-EIO);
55292+ } else {
55293+ coord->between = BEFORE_UNIT;
55294+ return NS_NOT_FOUND;
55295+ }
55296+ }
55297+ /* left <= key, ok */
55298+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
55299+
55300+ if (unlikely(iplug == NULL)) {
55301+ warning("nikita-588", "Unknown plugin %i",
55302+ le16_to_cpu(get_unaligned(&bstop->plugin_id)));
55303+ reiser4_print_key("key", key);
55304+ print_coord_content("coord", coord);
55305+ return RETERR(-EIO);
55306+ }
55307+
55308+ coord_set_iplug(coord, iplug);
55309+
55310+ /* if exact key from item header was found by binary search, no
55311+ further checks are necessary. */
55312+ if (found) {
55313+ assert("nikita-1259", order == EQUAL_TO);
55314+ return NS_FOUND;
55315+ }
55316+ if (iplug->b.max_key_inside != NULL) {
55317+ reiser4_key max_item_key;
55318+
55319+ /* key > max_item_key --- outside of an item */
55320+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
55321+ coord->unit_pos = 0;
55322+ coord->between = AFTER_ITEM;
55323+ /* FIXME-VS: key we are looking for does not fit into
55324+ found item. Return NS_NOT_FOUND then. Without that
55325+ the following case does not work: there is extent of
55326+ file 10000, 10001. File 10000, 10002 has been just
55327+ created. When writing to position 0 in that file -
55328+ traverse_tree will stop here on twig level. When we
55329+ want it to go down to leaf level
55330+ */
55331+ return NS_NOT_FOUND;
55332+ }
55333+ }
55334+
55335+ if (iplug->b.lookup != NULL) {
55336+ return iplug->b.lookup(key, bias, coord);
55337+ } else {
55338+ assert("nikita-1260", order == LESS_THAN);
55339+ coord->between = AFTER_UNIT;
55340+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
55341+ }
55342+}
55343+
55344+#undef NODE_ADDSTAT
55345+#undef NODE_INCSTAT
55346+
55347+/* plugin->u.node.estimate
55348+ look for description of this method in plugin/node/node.h */
55349+size_t estimate_node40(znode * node)
55350+{
55351+ size_t result;
55352+
55353+ assert("nikita-597", node != NULL);
55354+
55355+ result = free_space_node40(node) - sizeof(item_header40);
55356+
55357+ return (result > 0) ? result : 0;
55358+}
55359+
55360+/* plugin->u.node.check
55361+ look for description of this method in plugin/node/node.h */
55362+int check_node40(const znode * node /* node to check */ ,
55363+ __u32 flags /* check flags */ ,
55364+ const char **error /* where to store error message */ )
55365+{
55366+ int nr_items;
55367+ int i;
55368+ reiser4_key prev;
55369+ unsigned old_offset;
55370+ tree_level level;
55371+ coord_t coord;
55372+ int result;
55373+
55374+ assert("nikita-580", node != NULL);
55375+ assert("nikita-581", error != NULL);
55376+ assert("nikita-2948", znode_is_loaded(node));
55377+
55378+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
55379+ return 0;
55380+
55381+ assert("nikita-582", zdata(node) != NULL);
55382+
55383+ nr_items = node40_num_of_items_internal(node);
55384+ if (nr_items < 0) {
55385+ *error = "Negative number of items";
55386+ return -1;
55387+ }
55388+
55389+ if (flags & REISER4_NODE_DKEYS)
55390+ prev = *znode_get_ld_key((znode *) node);
55391+ else
55392+ prev = *reiser4_min_key();
55393+
55394+ old_offset = 0;
55395+ coord_init_zero(&coord);
55396+ coord.node = (znode *) node;
55397+ coord.unit_pos = 0;
55398+ coord.between = AT_UNIT;
55399+ level = znode_get_level(node);
55400+ for (i = 0; i < nr_items; i++) {
55401+ item_header40 *ih;
55402+ reiser4_key unit_key;
55403+ unsigned j;
55404+
55405+ ih = node40_ih_at(node, (unsigned)i);
55406+ coord_set_item_pos(&coord, i);
55407+ if ((ih40_get_offset(ih) >=
55408+ znode_size(node) - nr_items * sizeof(item_header40)) ||
55409+ (ih40_get_offset(ih) < sizeof(node40_header))) {
55410+ *error = "Offset is out of bounds";
55411+ return -1;
55412+ }
55413+ if (ih40_get_offset(ih) <= old_offset) {
55414+ *error = "Offsets are in wrong order";
55415+ return -1;
55416+ }
55417+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
55418+ *error = "Wrong offset of first item";
55419+ return -1;
55420+ }
55421+ old_offset = ih40_get_offset(ih);
55422+
55423+ if (keygt(&prev, &ih->key)) {
55424+ *error = "Keys are in wrong order";
55425+ return -1;
55426+ }
55427+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
55428+ *error = "Wrong key of first unit";
55429+ return -1;
55430+ }
55431+ prev = ih->key;
55432+ for (j = 0; j < coord_num_units(&coord); ++j) {
55433+ coord.unit_pos = j;
55434+ unit_key_by_coord(&coord, &unit_key);
55435+ if (keygt(&prev, &unit_key)) {
55436+ *error = "Unit keys are in wrong order";
55437+ return -1;
55438+ }
55439+ prev = unit_key;
55440+ }
55441+ coord.unit_pos = 0;
55442+ if (level != TWIG_LEVEL && item_is_extent(&coord)) {
55443+ *error = "extent on the wrong level";
55444+ return -1;
55445+ }
55446+ if (level == LEAF_LEVEL && item_is_internal(&coord)) {
55447+ *error = "internal item on the wrong level";
55448+ return -1;
55449+ }
55450+ if (level != LEAF_LEVEL &&
55451+ !item_is_internal(&coord) && !item_is_extent(&coord)) {
55452+ *error = "wrong item on the internal level";
55453+ return -1;
55454+ }
55455+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
55456+ *error = "non-internal item on the internal level";
55457+ return -1;
55458+ }
55459+#if REISER4_DEBUG
55460+ if (item_plugin_by_coord(&coord)->b.check
55461+ && item_plugin_by_coord(&coord)->b.check(&coord, error))
55462+ return -1;
55463+#endif
55464+ if (i) {
55465+ coord_t prev_coord;
55466+ /* two neighboring items can not be mergeable */
55467+ coord_dup(&prev_coord, &coord);
55468+ coord_prev_item(&prev_coord);
55469+ if (are_items_mergeable(&prev_coord, &coord)) {
55470+ *error = "mergeable items in one node";
55471+ return -1;
55472+ }
55473+
55474+ }
55475+ }
55476+
55477+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
55478+ coord_t coord;
55479+ item_plugin *iplug;
55480+
55481+ coord_init_last_unit(&coord, node);
55482+ iplug = item_plugin_by_coord(&coord);
55483+ if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
55484+ iplug->s.file.append_key != NULL) {
55485+ reiser4_key mkey;
55486+
55487+ iplug->s.file.append_key(&coord, &mkey);
55488+ set_key_offset(&mkey, get_key_offset(&mkey) - 1);
55489+ read_lock_dk(current_tree);
55490+ result = keygt(&mkey, znode_get_rd_key((znode *) node));
55491+ read_unlock_dk(current_tree);
55492+ if (result) {
55493+ *error = "key of rightmost item is too large";
55494+ return -1;
55495+ }
55496+ }
55497+ }
55498+ if (flags & REISER4_NODE_DKEYS) {
55499+ read_lock_tree(current_tree);
55500+ read_lock_dk(current_tree);
55501+
55502+ flags |= REISER4_NODE_TREE_STABLE;
55503+
55504+ if (keygt(&prev, znode_get_rd_key((znode *) node))) {
55505+ if (flags & REISER4_NODE_TREE_STABLE) {
55506+ *error = "Last key is greater than rdkey";
55507+ read_unlock_dk(current_tree);
55508+ read_unlock_tree(current_tree);
55509+ return -1;
55510+ }
55511+ }
55512+ if (keygt
55513+ (znode_get_ld_key((znode *) node),
55514+ znode_get_rd_key((znode *) node))) {
55515+ *error = "ldkey is greater than rdkey";
55516+ read_unlock_dk(current_tree);
55517+ read_unlock_tree(current_tree);
55518+ return -1;
55519+ }
55520+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
55521+ (node->left != NULL) &&
55522+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
55523+ ergo(flags & REISER4_NODE_TREE_STABLE,
55524+ !keyeq(znode_get_rd_key(node->left),
55525+ znode_get_ld_key((znode *) node)))
55526+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55527+ keygt(znode_get_rd_key(node->left),
55528+ znode_get_ld_key((znode *) node)))) {
55529+ *error = "left rdkey or ldkey is wrong";
55530+ read_unlock_dk(current_tree);
55531+ read_unlock_tree(current_tree);
55532+ return -1;
55533+ }
55534+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
55535+ (node->right != NULL) &&
55536+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
55537+ ergo(flags & REISER4_NODE_TREE_STABLE,
55538+ !keyeq(znode_get_rd_key((znode *) node),
55539+ znode_get_ld_key(node->right)))
55540+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55541+ keygt(znode_get_rd_key((znode *) node),
55542+ znode_get_ld_key(node->right)))) {
55543+ *error = "rdkey or right ldkey is wrong";
55544+ read_unlock_dk(current_tree);
55545+ read_unlock_tree(current_tree);
55546+ return -1;
55547+ }
55548+
55549+ read_unlock_dk(current_tree);
55550+ read_unlock_tree(current_tree);
55551+ }
55552+
55553+ return 0;
55554+}
55555+
55556+/* plugin->u.node.parse
55557+ look for description of this method in plugin/node/node.h */
55558+int parse_node40(znode * node /* node to parse */ )
55559+{
55560+ node40_header *header;
55561+ int result;
55562+ d8 level;
55563+
55564+ header = node40_node_header((znode *) node);
55565+ result = -EIO;
55566+ level = nh40_get_level(header);
55567+ if (unlikely(((__u8) znode_get_level(node)) != level))
55568+ warning("nikita-494", "Wrong level found in node: %i != %i",
55569+ znode_get_level(node), level);
55570+ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
55571+ warning("nikita-495",
55572+ "Wrong magic in tree node: want %x, got %x",
55573+ REISER4_NODE_MAGIC, nh40_get_magic(header));
55574+ else {
55575+ node->nr_items = node40_num_of_items_internal(node);
55576+ result = 0;
55577+ }
55578+ return RETERR(result);
55579+}
55580+
55581+/* plugin->u.node.init
55582+ look for description of this method in plugin/node/node.h */
55583+int init_node40(znode * node /* node to initialise */ )
55584+{
55585+ node40_header *header;
55586+
55587+ assert("nikita-570", node != NULL);
55588+ assert("nikita-572", zdata(node) != NULL);
55589+
55590+ header = node40_node_header(node);
55591+ memset(header, 0, sizeof(node40_header));
55592+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
55593+ nh40_set_free_space_start(header, sizeof(node40_header));
55594+ /* sane hypothesis: 0 in CPU format is 0 in disk format */
55595+ /* items: 0 */
55596+ save_plugin_id(node_plugin_to_plugin(node->nplug),
55597+ &header->common_header.plugin_id);
55598+ nh40_set_level(header, znode_get_level(node));
55599+ nh40_set_magic(header, REISER4_NODE_MAGIC);
55600+ node->nr_items = 0;
55601+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
55602+
55603+ /* flags: 0 */
55604+ return 0;
55605+}
55606+
55607+#ifdef GUESS_EXISTS
55608+int guess_node40(const znode * node /* node to guess plugin of */ )
55609+{
55610+ node40_header *nethack;
55611+
55612+ assert("nikita-1058", node != NULL);
55613+ nethack = node40_node_header(node);
55614+ return
55615+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
55616+ (plugin_by_disk_id(znode_get_tree(node),
55617+ REISER4_NODE_PLUGIN_TYPE,
55618+ &nethack->common_header.plugin_id)->h.id ==
55619+ NODE40_ID);
55620+}
55621+#endif
55622+
55623+/* plugin->u.node.change_item_size
55624+ look for description of this method in plugin/node/node.h */
55625+void change_item_size_node40(coord_t * coord, int by)
55626+{
55627+ node40_header *nh;
55628+ item_header40 *ih;
55629+ char *item_data;
55630+ int item_length;
55631+ unsigned i;
55632+
55633+ /* make sure that @item is coord of existing item */
55634+ assert("vs-210", coord_is_existing_item(coord));
55635+
55636+ nh = node40_node_header(coord->node);
55637+
55638+ item_data = item_by_coord_node40(coord);
55639+ item_length = length_by_coord_node40(coord);
55640+
55641+ /* move item bodies */
55642+ ih = node40_ih_at_coord(coord);
55643+ memmove(item_data + item_length + by, item_data + item_length,
55644+ nh40_get_free_space_start(node40_node_header(coord->node)) -
55645+ (ih40_get_offset(ih) + item_length));
55646+
55647+ /* update offsets of moved items */
55648+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
55649+ ih = node40_ih_at(coord->node, i);
55650+ ih40_set_offset(ih, ih40_get_offset(ih) + by);
55651+ }
55652+
55653+ /* update node header */
55654+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
55655+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
55656+}
55657+
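(For orientation, the node40 functions in this patch all manipulate the same block layout: item bodies grow up from just past the node header, item headers grow down from the end of the block, and free space is the gap between the two. A minimal stand-alone sketch of that accounting follows; the sizes are illustrative, not the real on-disk values.)

	#include <stdio.h>

	/* Illustrative sizes only; the real node40_header and
	   item_header40 layouts are defined elsewhere in the patch. */
	enum { BLOCK_SIZE = 4096, NODE_HDR = 40, ITEM_HDR = 24 };

	/* free space = gap between the end of the last item body and
	   the item-header array growing down from the block end */
	static unsigned free_space(unsigned free_space_start,
				   unsigned nr_items)
	{
		return BLOCK_SIZE - nr_items * ITEM_HDR - free_space_start;
	}

	int main(void)
	{
		unsigned fss = NODE_HDR; /* empty node: bodies start after header */
		unsigned nr = 0;

		printf("empty: %u bytes free\n", free_space(fss, nr));

		/* inserting a 100-byte item consumes 100 bytes of body
		   space plus one item header, the same accounting that
		   create_item_node40 performs */
		fss += 100;
		nr++;
		printf("after one 100-byte item: %u bytes free\n",
		       free_space(fss, nr));
		return 0;
	}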
55658+static int should_notify_parent(const znode * node)
55659+{
55660+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
55661+ return !disk_addr_eq(znode_get_block(node),
55662+ &znode_get_tree(node)->root_block);
55663+}
55664+
55665+/* plugin->u.node.create_item
55666+ look for description of this method in plugin/node/node.h */
55667+int
55668+create_item_node40(coord_t *target, const reiser4_key *key,
55669+ reiser4_item_data *data, carry_plugin_info *info)
55670+{
55671+ node40_header *nh;
55672+ item_header40 *ih;
55673+ unsigned offset;
55674+ unsigned i;
55675+
55676+ nh = node40_node_header(target->node);
55677+
55678+ assert("vs-212", coord_is_between_items(target));
55679+ /* node must have enough free space */
55680+ assert("vs-254",
55681+ free_space_node40(target->node) >=
55682+ data->length + sizeof(item_header40));
55683+ assert("vs-1410", data->length >= 0);
55684+
55685+ if (coord_set_to_right(target))
55686+		/* there are no items to the right of @target, so the new
55687+		   item will be inserted after the last one */
55688+ coord_set_item_pos(target, nh40_get_num_items(nh));
55689+
55690+ if (target->item_pos < nh40_get_num_items(nh)) {
55691+ /* there are items to be moved to prepare space for new
55692+ item */
55693+ ih = node40_ih_at_coord(target);
55694+ /* new item will start at this offset */
55695+ offset = ih40_get_offset(ih);
55696+
55697+ memmove(zdata(target->node) + offset + data->length,
55698+ zdata(target->node) + offset,
55699+ nh40_get_free_space_start(nh) - offset);
55700+ /* update headers of moved items */
55701+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
55702+ ih = node40_ih_at(target->node, i);
55703+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
55704+ }
55705+
55706+ /* @ih is set to item header of the last item, move item headers */
55707+ memmove(ih - 1, ih,
55708+ sizeof(item_header40) * (nh40_get_num_items(nh) -
55709+ target->item_pos));
55710+ } else {
55711+ /* new item will start at this offset */
55712+ offset = nh40_get_free_space_start(nh);
55713+ }
55714+
55715+ /* make item header for the new item */
55716+ ih = node40_ih_at_coord(target);
55717+ memcpy(&ih->key, key, sizeof(reiser4_key));
55718+ ih40_set_offset(ih, offset);
55719+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
55720+
55721+ /* update node header */
55722+ nh40_set_free_space(nh,
55723+ nh40_get_free_space(nh) - data->length -
55724+ sizeof(item_header40));
55725+ nh40_set_free_space_start(nh,
55726+ nh40_get_free_space_start(nh) + data->length);
55727+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
55728+
55729+	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
55730+ target->unit_pos = 0;
55731+ target->between = AT_UNIT;
55732+ coord_clear_iplug(target);
55733+
55734+ /* initialize item */
55735+ if (data->iplug->b.init != NULL) {
55736+ data->iplug->b.init(target, NULL, data);
55737+ }
55738+ /* copy item body */
55739+ if (data->iplug->b.paste != NULL) {
55740+ data->iplug->b.paste(target, data, info);
55741+ } else if (data->data != NULL) {
55742+ if (data->user) {
55743+		/* AUDIT: Should we really not check that the pointer
55744+		   from userspace is valid and the data bytes are
55745+		   available? How would we return -EFAULT of some kind
55746+		   without this check? */
55747+			assert("nikita-3038", reiser4_schedulable());
55748+ /* copy data from user space */
55749+ __copy_from_user(zdata(target->node) + offset,
55750+ (const char __user *)data->data,
55751+ (unsigned)data->length);
55752+ } else
55753+ /* copy from kernel space */
55754+ memcpy(zdata(target->node) + offset, data->data,
55755+ (unsigned)data->length);
55756+ }
55757+
55758+ if (target->item_pos == 0) {
55759+ /* left delimiting key has to be updated */
55760+ prepare_for_update(NULL, target->node, info);
55761+ }
55762+
55763+ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
55764+ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
55765+ }
55766+
55767+ return 0;
55768+}
55769+
55770+/* plugin->u.node.update_item_key
55771+ look for description of this method in plugin/node/node.h */
55772+void
55773+update_item_key_node40(coord_t * target, const reiser4_key * key,
55774+ carry_plugin_info * info)
55775+{
55776+ item_header40 *ih;
55777+
55778+ ih = node40_ih_at_coord(target);
55779+ memcpy(&ih->key, key, sizeof(reiser4_key));
55780+
55781+ if (target->item_pos == 0) {
55782+ prepare_for_update(NULL, target->node, info);
55783+ }
55784+}
55785+
55786+/* these bits encode the cut mode */
55787+#define CMODE_TAIL 1
55788+#define CMODE_WHOLE 2
55789+#define CMODE_HEAD 4
55790+
55791+struct cut40_info {
55792+ int mode;
55793+ pos_in_node_t tail_removed; /* position of item which gets tail removed */
55794+	pos_in_node_t first_removed;	/* position of the first (leftmost) item among items removed completely */
55795+ pos_in_node_t removed_count; /* number of items removed completely */
55796+ pos_in_node_t head_removed; /* position of item which gets head removed */
55797+
55798+ pos_in_node_t freed_space_start;
55799+ pos_in_node_t freed_space_end;
55800+ pos_in_node_t first_moved;
55801+ pos_in_node_t head_removed_location;
55802+};
55803+
55804+static void init_cinfo(struct cut40_info *cinfo)
55805+{
55806+ cinfo->mode = 0;
55807+ cinfo->tail_removed = MAX_POS_IN_NODE;
55808+ cinfo->first_removed = MAX_POS_IN_NODE;
55809+ cinfo->removed_count = MAX_POS_IN_NODE;
55810+ cinfo->head_removed = MAX_POS_IN_NODE;
55811+ cinfo->freed_space_start = MAX_POS_IN_NODE;
55812+ cinfo->freed_space_end = MAX_POS_IN_NODE;
55813+ cinfo->first_moved = MAX_POS_IN_NODE;
55814+ cinfo->head_removed_location = MAX_POS_IN_NODE;
55815+}
55816+
55817+/* complete cut_node40/kill_node40: remove the gap created by the cut/kill */
55818+static void compact(znode * node, struct cut40_info *cinfo)
55819+{
55820+ node40_header *nh;
55821+ item_header40 *ih;
55822+ pos_in_node_t freed;
55823+ pos_in_node_t pos, nr_items;
55824+
55825+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
55826+ cinfo->freed_space_end != MAX_POS_IN_NODE &&
55827+ cinfo->first_moved != MAX_POS_IN_NODE));
55828+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
55829+
55830+ nh = node40_node_header(node);
55831+ nr_items = nh40_get_num_items(nh);
55832+
55833+	/* close the gap created by the removal */
55834+ memmove(zdata(node) + cinfo->freed_space_start,
55835+ zdata(node) + cinfo->freed_space_end,
55836+ nh40_get_free_space_start(nh) - cinfo->freed_space_end);
55837+
55838+ /* update item headers of moved items - change their locations */
55839+ pos = cinfo->first_moved;
55840+ ih = node40_ih_at(node, pos);
55841+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
55842+ assert("vs-1580", pos == cinfo->head_removed);
55843+ ih40_set_offset(ih, cinfo->head_removed_location);
55844+ pos++;
55845+ ih--;
55846+ }
55847+
55848+ freed = cinfo->freed_space_end - cinfo->freed_space_start;
55849+ for (; pos < nr_items; pos++, ih--) {
55850+ assert("vs-1581", ih == node40_ih_at(node, pos));
55851+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
55852+ }
55853+
55854+ /* free space start moved to right */
55855+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
55856+
55857+ if (cinfo->removed_count != MAX_POS_IN_NODE) {
55858+ /* number of items changed. Remove item headers of those items */
55859+ ih = node40_ih_at(node, nr_items - 1);
55860+ memmove(ih + cinfo->removed_count, ih,
55861+ sizeof(item_header40) * (nr_items -
55862+ cinfo->removed_count -
55863+ cinfo->first_removed));
55864+ freed += sizeof(item_header40) * cinfo->removed_count;
55865+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
55866+ }
55867+
55868+ /* total amount of free space increased */
55869+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
55870+}
55871+
55872+int shrink_item_node40(coord_t * coord, int delta)
55873+{
55874+ node40_header *nh;
55875+ item_header40 *ih;
55876+ pos_in_node_t pos;
55877+ pos_in_node_t nr_items;
55878+ char *end;
55879+ znode *node;
55880+ int off;
55881+
55882+ assert("nikita-3487", coord != NULL);
55883+ assert("nikita-3488", delta >= 0);
55884+
55885+ node = coord->node;
55886+ nh = node40_node_header(node);
55887+ nr_items = nh40_get_num_items(nh);
55888+
55889+ ih = node40_ih_at_coord(coord);
55890+ assert("nikita-3489", delta <= length_by_coord_node40(coord));
55891+ off = ih40_get_offset(ih) + length_by_coord_node40(coord);
55892+ end = zdata(node) + off;
55893+
55894+	/* close the gap created by the removal */
55895+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
55896+
55897+ /* update item headers of moved items - change their locations */
55898+ pos = coord->item_pos + 1;
55899+ ih = node40_ih_at(node, pos);
55900+ for (; pos < nr_items; pos++, ih--) {
55901+ assert("nikita-3490", ih == node40_ih_at(node, pos));
55902+ ih40_set_offset(ih, ih40_get_offset(ih) - delta);
55903+ }
55904+
55905+ /* free space start moved to left */
55906+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
55907+ /* total amount of free space increased */
55908+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
55909+ /*
55910+	 * This method does _not_ change the number of items. Hence, it cannot
55911+	 * make the node empty. Also, it doesn't remove items at all, which means
55912+	 * that no keys have to be updated either.
55913+ */
55914+ return 0;
55915+}
55916+
55917+/* this is used by cut_node40 and kill_node40. It analyses the input parameters and calculates the cut mode. There are
55918+   2 types of cut. The first is when units are removed from the middle of an item; in this case the function returns 1.
55919+   Everything else falls into the second case: 0 or 1 items getting their tail cut, 0 or more items removed completely,
55920+   and 0 or 1 item getting its head cut. The function returns 0 in this case */
55921+static int
55922+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
55923+{
55924+ reiser4_key left_key, right_key;
55925+ reiser4_key min_from_key, max_to_key;
55926+ const reiser4_key *from_key, *to_key;
55927+
55928+ init_cinfo(cinfo);
55929+
55930+ /* calculate minimal key stored in first item of items to be cut (params->from) */
55931+ item_key_by_coord(params->from, &min_from_key);
55932+ /* and max key stored in last item of items to be cut (params->to) */
55933+ max_item_key_by_coord(params->to, &max_to_key);
55934+
55935+ /* if cut key range is not defined in input parameters - define it using cut coord range */
55936+ if (params->from_key == NULL) {
55937+ assert("vs-1513", params->to_key == NULL);
55938+ unit_key_by_coord(params->from, &left_key);
55939+ from_key = &left_key;
55940+ max_unit_key_by_coord(params->to, &right_key);
55941+ to_key = &right_key;
55942+ } else {
55943+ from_key = params->from_key;
55944+ to_key = params->to_key;
55945+ }
55946+
55947+ if (params->from->item_pos == params->to->item_pos) {
55948+ if (keylt(&min_from_key, from_key)
55949+ && keylt(to_key, &max_to_key))
55950+ return 1;
55951+
55952+ if (keygt(from_key, &min_from_key)) {
55953+			/* tail of item is to be cut */
55954+ cinfo->tail_removed = params->from->item_pos;
55955+ cinfo->mode |= CMODE_TAIL;
55956+ } else if (keylt(to_key, &max_to_key)) {
55957+ /* head of item is to be cut */
55958+ cinfo->head_removed = params->from->item_pos;
55959+ cinfo->mode |= CMODE_HEAD;
55960+ } else {
55961+ /* item is removed completely */
55962+ cinfo->first_removed = params->from->item_pos;
55963+ cinfo->removed_count = 1;
55964+ cinfo->mode |= CMODE_WHOLE;
55965+ }
55966+ } else {
55967+ cinfo->first_removed = params->from->item_pos + 1;
55968+ cinfo->removed_count =
55969+ params->to->item_pos - params->from->item_pos - 1;
55970+
55971+ if (keygt(from_key, &min_from_key)) {
55972+ /* first item is not cut completely */
55973+ cinfo->tail_removed = params->from->item_pos;
55974+ cinfo->mode |= CMODE_TAIL;
55975+ } else {
55976+ cinfo->first_removed--;
55977+ cinfo->removed_count++;
55978+ }
55979+ if (keylt(to_key, &max_to_key)) {
55980+ /* last item is not cut completely */
55981+ cinfo->head_removed = params->to->item_pos;
55982+ cinfo->mode |= CMODE_HEAD;
55983+ } else {
55984+ cinfo->removed_count++;
55985+ }
55986+ if (cinfo->removed_count)
55987+ cinfo->mode |= CMODE_WHOLE;
55988+ }
55989+
55990+ return 0;
55991+}
55992+
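(The mode bits computed by parse_cut() combine for ranges that span several items. A stand-alone sketch, with hypothetical item positions, of a common multi-item case: the first item loses its tail and the items after it are removed whole.)

	#include <stdio.h>

	#define CMODE_TAIL  1
	#define CMODE_WHOLE 2
	#define CMODE_HEAD  4

	int main(void)
	{
		/* hypothetical cut from the middle of item 2 through the
		   end of item 4: item 2 gets its tail cut, items 3 and 4
		   are removed completely */
		int mode = 0;
		int tail_removed = 2, first_removed = 3, removed_count = 2;

		mode |= CMODE_TAIL;
		mode |= CMODE_WHOLE;

		printf("mode=%d tail@%d whole@%d..%d\n", mode, tail_removed,
		       first_removed, first_removed + removed_count - 1);
		/* prepare_for_compact() dispatches on exactly this
		   combination in its CMODE_TAIL | CMODE_WHOLE case */
		return 0;
	}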
55993+static void
55994+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
55995+ carry_kill_data * kdata)
55996+{
55997+ coord_t coord;
55998+ item_plugin *iplug;
55999+ pos_in_node_t pos;
56000+
56001+ coord.node = node;
56002+ coord.unit_pos = 0;
56003+ coord.between = AT_UNIT;
56004+ for (pos = 0; pos < count; pos++) {
56005+ coord_set_item_pos(&coord, from + pos);
56006+ coord.unit_pos = 0;
56007+ coord.between = AT_UNIT;
56008+ iplug = item_plugin_by_coord(&coord);
56009+ if (iplug->b.kill_hook) {
56010+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
56011+ kdata);
56012+ }
56013+ }
56014+}
56015+
56016+/* this is used to kill item partially */
56017+static pos_in_node_t
56018+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56019+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
56020+{
56021+ struct carry_kill_data *kdata;
56022+ item_plugin *iplug;
56023+
56024+ kdata = data;
56025+ iplug = item_plugin_by_coord(coord);
56026+
56027+ assert("vs-1524", iplug->b.kill_units);
56028+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
56029+ new_first_key);
56030+}
56031+
56032+/* call item plugin to cut tail of file */
56033+static pos_in_node_t
56034+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56035+{
56036+ struct carry_kill_data *kdata;
56037+ pos_in_node_t to;
56038+
56039+ kdata = data;
56040+ to = coord_last_unit_pos(coord);
56041+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
56042+ NULL);
56043+}
56044+
56045+/* call item plugin to cut head of item */
56046+static pos_in_node_t
56047+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56048+ reiser4_key * new_first_key)
56049+{
56050+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
56051+ new_first_key);
56052+}
56053+
56054+/* this is used to cut item partially */
56055+static pos_in_node_t
56056+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
56057+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
56058+{
56059+ carry_cut_data *cdata;
56060+ item_plugin *iplug;
56061+
56062+ cdata = data;
56063+ iplug = item_plugin_by_coord(coord);
56064+ assert("vs-302", iplug->b.cut_units);
56065+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
56066+ new_first_key);
56067+}
56068+
56069+/* call item plugin to cut tail of file */
56070+static pos_in_node_t
56071+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
56072+{
56073+ carry_cut_data *cdata;
56074+ pos_in_node_t to;
56075+
56076+ cdata = data;
56077+ to = coord_last_unit_pos(cdata->params.from);
56078+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
56079+}
56080+
56081+/* call item plugin to cut head of item */
56082+static pos_in_node_t
56083+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
56084+ reiser4_key * new_first_key)
56085+{
56086+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
56087+ new_first_key);
56088+}
56089+
56090+/* this returns 1 if the key of the first item changed, 0 if it did not */
56091+static int
56092+prepare_for_compact(struct cut40_info *cinfo,
56093+ const struct cut_kill_params *params, int is_cut,
56094+ void *data, carry_plugin_info * info)
56095+{
56096+ znode *node;
56097+ item_header40 *ih;
56098+ pos_in_node_t freed;
56099+ pos_in_node_t item_pos;
56100+ coord_t coord;
56101+ reiser4_key new_first_key;
56102+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
56103+ void *, reiser4_key *, reiser4_key *);
56104+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
56105+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
56106+ reiser4_key *);
56107+ int retval;
56108+
56109+ retval = 0;
56110+
56111+ node = params->from->node;
56112+
56113+ assert("vs-184", node == params->to->node);
56114+ assert("vs-312", !node_is_empty(node));
56115+ assert("vs-297",
56116+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
56117+
56118+ if (is_cut) {
56119+ kill_units_f = cut_units;
56120+ kill_tail_f = cut_tail;
56121+ kill_head_f = cut_head;
56122+ } else {
56123+ kill_units_f = kill_units;
56124+ kill_tail_f = kill_tail;
56125+ kill_head_f = kill_head;
56126+ }
56127+
56128+ if (parse_cut(cinfo, params) == 1) {
56129+ /* cut from the middle of item */
56130+ freed =
56131+ kill_units_f(params->from, params->from->unit_pos,
56132+ params->to->unit_pos, data,
56133+ params->smallest_removed, NULL);
56134+
56135+ item_pos = params->from->item_pos;
56136+ ih = node40_ih_at(node, item_pos);
56137+ cinfo->freed_space_start =
56138+ ih40_get_offset(ih) + node40_item_length(node,
56139+ item_pos) - freed;
56140+ cinfo->freed_space_end = cinfo->freed_space_start + freed;
56141+ cinfo->first_moved = item_pos + 1;
56142+ } else {
56143+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
56144+ cinfo->first_removed != MAX_POS_IN_NODE ||
56145+ cinfo->head_removed != MAX_POS_IN_NODE));
56146+
56147+ switch (cinfo->mode) {
56148+ case CMODE_TAIL:
56149+ /* one item gets cut partially from its end */
56150+ assert("vs-1562",
56151+ cinfo->tail_removed == params->from->item_pos);
56152+
56153+ freed =
56154+ kill_tail_f(params->from, data,
56155+ params->smallest_removed);
56156+
56157+ item_pos = cinfo->tail_removed;
56158+ ih = node40_ih_at(node, item_pos);
56159+ cinfo->freed_space_start =
56160+ ih40_get_offset(ih) + node40_item_length(node,
56161+ item_pos) -
56162+ freed;
56163+ cinfo->freed_space_end =
56164+ cinfo->freed_space_start + freed;
56165+ cinfo->first_moved = cinfo->tail_removed + 1;
56166+ break;
56167+
56168+ case CMODE_WHOLE:
56169+ /* one or more items get removed completely */
56170+ assert("vs-1563",
56171+ cinfo->first_removed == params->from->item_pos);
56172+ assert("vs-1564", cinfo->removed_count > 0
56173+ && cinfo->removed_count != MAX_POS_IN_NODE);
56174+
56175+ /* call kill hook for all items removed completely */
56176+ if (is_cut == 0)
56177+ call_kill_hooks(node, cinfo->first_removed,
56178+ cinfo->removed_count, data);
56179+
56180+ item_pos = cinfo->first_removed;
56181+ ih = node40_ih_at(node, item_pos);
56182+
56183+ if (params->smallest_removed)
56184+ memcpy(params->smallest_removed, &ih->key,
56185+ sizeof(reiser4_key));
56186+
56187+ cinfo->freed_space_start = ih40_get_offset(ih);
56188+
56189+ item_pos += (cinfo->removed_count - 1);
56190+ ih -= (cinfo->removed_count - 1);
56191+ cinfo->freed_space_end =
56192+ ih40_get_offset(ih) + node40_item_length(node,
56193+ item_pos);
56194+ cinfo->first_moved = item_pos + 1;
56195+ if (cinfo->first_removed == 0)
56196+ /* key of first item of the node changes */
56197+ retval = 1;
56198+ break;
56199+
56200+ case CMODE_HEAD:
56201+ /* one item gets cut partially from its head */
56202+ assert("vs-1565",
56203+ cinfo->head_removed == params->from->item_pos);
56204+
56205+ freed =
56206+ kill_head_f(params->to, data,
56207+ params->smallest_removed,
56208+ &new_first_key);
56209+
56210+ item_pos = cinfo->head_removed;
56211+ ih = node40_ih_at(node, item_pos);
56212+ cinfo->freed_space_start = ih40_get_offset(ih);
56213+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56214+ cinfo->first_moved = cinfo->head_removed + 1;
56215+
56216+ /* item head is removed, therefore, item key changed */
56217+ coord.node = node;
56218+ coord_set_item_pos(&coord, item_pos);
56219+ coord.unit_pos = 0;
56220+ coord.between = AT_UNIT;
56221+ update_item_key_node40(&coord, &new_first_key, NULL);
56222+ if (item_pos == 0)
56223+ /* key of first item of the node changes */
56224+ retval = 1;
56225+ break;
56226+
56227+ case CMODE_TAIL | CMODE_WHOLE:
56228+ /* one item gets cut from its end and one or more items get removed completely */
56229+ assert("vs-1566",
56230+ cinfo->tail_removed == params->from->item_pos);
56231+ assert("vs-1567",
56232+ cinfo->first_removed == cinfo->tail_removed + 1);
56233+ assert("vs-1564", cinfo->removed_count > 0
56234+ && cinfo->removed_count != MAX_POS_IN_NODE);
56235+
56236+ freed =
56237+ kill_tail_f(params->from, data,
56238+ params->smallest_removed);
56239+
56240+ item_pos = cinfo->tail_removed;
56241+ ih = node40_ih_at(node, item_pos);
56242+ cinfo->freed_space_start =
56243+ ih40_get_offset(ih) + node40_item_length(node,
56244+ item_pos) -
56245+ freed;
56246+
56247+ /* call kill hook for all items removed completely */
56248+ if (is_cut == 0)
56249+ call_kill_hooks(node, cinfo->first_removed,
56250+ cinfo->removed_count, data);
56251+
56252+ item_pos += cinfo->removed_count;
56253+ ih -= cinfo->removed_count;
56254+ cinfo->freed_space_end =
56255+ ih40_get_offset(ih) + node40_item_length(node,
56256+ item_pos);
56257+ cinfo->first_moved = item_pos + 1;
56258+ break;
56259+
56260+ case CMODE_WHOLE | CMODE_HEAD:
56261+ /* one or more items get removed completely and one item gets cut partially from its head */
56262+ assert("vs-1568",
56263+ cinfo->first_removed == params->from->item_pos);
56264+ assert("vs-1564", cinfo->removed_count > 0
56265+ && cinfo->removed_count != MAX_POS_IN_NODE);
56266+ assert("vs-1569",
56267+ cinfo->head_removed ==
56268+ cinfo->first_removed + cinfo->removed_count);
56269+
56270+ /* call kill hook for all items removed completely */
56271+ if (is_cut == 0)
56272+ call_kill_hooks(node, cinfo->first_removed,
56273+ cinfo->removed_count, data);
56274+
56275+ item_pos = cinfo->first_removed;
56276+ ih = node40_ih_at(node, item_pos);
56277+
56278+ if (params->smallest_removed)
56279+ memcpy(params->smallest_removed, &ih->key,
56280+ sizeof(reiser4_key));
56281+
56282+ freed =
56283+ kill_head_f(params->to, data, NULL, &new_first_key);
56284+
56285+ cinfo->freed_space_start = ih40_get_offset(ih);
56286+
56287+ ih = node40_ih_at(node, cinfo->head_removed);
56288+			/* this is the most complex case. The item which got its head removed and the items which are to
56289+			   be moved intact change their locations differently. */
56290+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
56291+ cinfo->first_moved = cinfo->head_removed;
56292+ cinfo->head_removed_location = cinfo->freed_space_start;
56293+
56294+ /* item head is removed, therefore, item key changed */
56295+ coord.node = node;
56296+ coord_set_item_pos(&coord, cinfo->head_removed);
56297+ coord.unit_pos = 0;
56298+ coord.between = AT_UNIT;
56299+ update_item_key_node40(&coord, &new_first_key, NULL);
56300+
56301+ assert("vs-1579", cinfo->first_removed == 0);
56302+ /* key of first item of the node changes */
56303+ retval = 1;
56304+ break;
56305+
56306+ case CMODE_TAIL | CMODE_HEAD:
56307+			/* one item gets cut from its end and its neighbor gets cut from its head */
56308+ impossible("vs-1576", "this can not happen currently");
56309+ break;
56310+
56311+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
56312+ impossible("vs-1577", "this can not happen currently");
56313+ break;
56314+ default:
56315+ impossible("vs-1578", "unexpected cut mode");
56316+ break;
56317+ }
56318+ }
56319+ return retval;
56320+}
56321+
56322+/* plugin->u.node.kill
56323+ return value is number of items removed completely */
56324+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
56325+{
56326+ znode *node;
56327+ struct cut40_info cinfo;
56328+ int first_key_changed;
56329+
56330+ node = kdata->params.from->node;
56331+
56332+ first_key_changed =
56333+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
56334+ info);
56335+ compact(node, &cinfo);
56336+
56337+ if (info) {
56338+ /* it is not called by node40_shift, so we have to take care
56339+ of changes on upper levels */
56340+ if (node_is_empty(node)
56341+ && !(kdata->flags & DELETE_RETAIN_EMPTY))
56342+			/* all contents of the node are deleted */
56343+ prepare_removal_node40(node, info);
56344+ else if (first_key_changed) {
56345+ prepare_for_update(NULL, node, info);
56346+ }
56347+ }
56348+
56349+ coord_clear_iplug(kdata->params.from);
56350+ coord_clear_iplug(kdata->params.to);
56351+
56352+ znode_make_dirty(node);
56353+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56354+}
56355+
56356+/* plugin->u.node.cut
56357+ return value is number of items removed completely */
56358+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
56359+{
56360+ znode *node;
56361+ struct cut40_info cinfo;
56362+ int first_key_changed;
56363+
56364+ node = cdata->params.from->node;
56365+
56366+ first_key_changed =
56367+	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
56368+ info);
56369+ compact(node, &cinfo);
56370+
56371+ if (info) {
56372+ /* it is not called by node40_shift, so we have to take care
56373+ of changes on upper levels */
56374+ if (node_is_empty(node))
56375+			/* all contents of the node are deleted */
56376+ prepare_removal_node40(node, info);
56377+ else if (first_key_changed) {
56378+ prepare_for_update(NULL, node, info);
56379+ }
56380+ }
56381+
56382+ coord_clear_iplug(cdata->params.from);
56383+ coord_clear_iplug(cdata->params.to);
56384+
56385+ znode_make_dirty(node);
56386+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
56387+}
56388+
56389+/* this structure is used by shift method of node40 plugin */
56390+struct shift_params {
56391+ shift_direction pend; /* when @pend == append - we are shifting to
56392+ left, when @pend == prepend - to right */
56393+ coord_t wish_stop; /* when shifting to left this is last unit we
56394+ want shifted, when shifting to right - this
56395+ is set to unit we want to start shifting
56396+ from */
56397+ znode *target;
56398+ int everything; /* it is set to 1 if everything we have to shift is
56399+ shifted, 0 - otherwise */
56400+
56401+ /* FIXME-VS: get rid of read_stop */
56402+
56403+ /* these are set by estimate_shift */
56404+ coord_t real_stop; /* this will be set to last unit which will be
56405+ really shifted */
56406+
56407+ /* coordinate in source node before operation of unit which becomes
56408+ first after shift to left of last after shift to right */
56409+ union {
56410+ coord_t future_first;
56411+ coord_t future_last;
56412+ } u;
56413+
56414+ unsigned merging_units; /* number of units of first item which have to
56415+ be merged with last item of target node */
56416+ unsigned merging_bytes; /* number of bytes in those units */
56417+
56418+ unsigned entire; /* items shifted in their entirety */
56419+ unsigned entire_bytes; /* number of bytes in those items */
56420+
56421+ unsigned part_units; /* number of units of partially copied item */
56422+ unsigned part_bytes; /* number of bytes in those units */
56423+
56424+ unsigned shift_bytes; /* total number of bytes in items shifted (item
56425+ headers not included) */
56426+
56427+};
56428+
56429+static int item_creation_overhead(coord_t *item)
56430+{
56431+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
56432+}
56433+
56434+/* how many units are there in @source starting from source->unit_pos
56435+ but not further than @stop_coord */
56436+static int
56437+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
56438+{
56439+ if (pend == SHIFT_LEFT) {
56440+ assert("vs-181", source->unit_pos == 0);
56441+ } else {
56442+ assert("vs-182",
56443+ source->unit_pos == coord_last_unit_pos(source));
56444+ }
56445+
56446+ if (source->item_pos != stop_coord->item_pos) {
56447+ /* @source and @stop_coord are different items */
56448+ return coord_last_unit_pos(source) + 1;
56449+ }
56450+
56451+ if (pend == SHIFT_LEFT) {
56452+ return stop_coord->unit_pos + 1;
56453+ } else {
56454+ return source->unit_pos - stop_coord->unit_pos + 1;
56455+ }
56456+}
56457+
56458+/* this calculates what can be copied from @shift->wish_stop.node to
56459+ @shift->target */
56460+static void
56461+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
56462+{
56463+ unsigned target_free_space, size;
56464+ pos_in_node_t stop_item; /* item which estimating should not consider */
56465+ unsigned want; /* number of units of item we want shifted */
56466+ coord_t source; /* item being estimated */
56467+ item_plugin *iplug;
56468+
56469+ /* shifting to left/right starts from first/last units of
56470+ @shift->wish_stop.node */
56471+ if (shift->pend == SHIFT_LEFT) {
56472+ coord_init_first_unit(&source, shift->wish_stop.node);
56473+ } else {
56474+ coord_init_last_unit(&source, shift->wish_stop.node);
56475+ }
56476+ shift->real_stop = source;
56477+
56478+ /* free space in target node and number of items in source */
56479+ target_free_space = znode_free_space(shift->target);
56480+
56481+ shift->everything = 0;
56482+ if (!node_is_empty(shift->target)) {
56483+ /* target node is not empty, check for boundary items
56484+ mergeability */
56485+ coord_t to;
56486+
56487+ /* item we try to merge @source with */
56488+ if (shift->pend == SHIFT_LEFT) {
56489+ coord_init_last_unit(&to, shift->target);
56490+ } else {
56491+ coord_init_first_unit(&to, shift->target);
56492+ }
56493+
56494+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
56495+ &source) :
56496+ are_items_mergeable(&source, &to)) {
56497+ /* how many units of @source do we want to merge to
56498+ item @to */
56499+ want =
56500+ wanted_units(&source, &shift->wish_stop,
56501+ shift->pend);
56502+
56503+ /* how many units of @source we can merge to item
56504+ @to */
56505+ iplug = item_plugin_by_coord(&source);
56506+ if (iplug->b.can_shift != NULL)
56507+ shift->merging_units =
56508+ iplug->b.can_shift(target_free_space,
56509+ &source, shift->target,
56510+ shift->pend, &size,
56511+ want);
56512+ else {
56513+ shift->merging_units = 0;
56514+ size = 0;
56515+ }
56516+ shift->merging_bytes = size;
56517+ shift->shift_bytes += size;
56518+ /* update stop coord to be set to last unit of @source
56519+ we can merge to @target */
56520+ if (shift->merging_units)
56521+ /* at least one unit can be shifted */
56522+ shift->real_stop.unit_pos =
56523+ (shift->merging_units - source.unit_pos -
56524+ 1) * shift->pend;
56525+ else {
56526+ /* nothing can be shifted */
56527+ if (shift->pend == SHIFT_LEFT)
56528+ coord_init_before_first_item(&shift->
56529+ real_stop,
56530+ source.
56531+ node);
56532+ else
56533+ coord_init_after_last_item(&shift->
56534+ real_stop,
56535+ source.node);
56536+ }
56537+ assert("nikita-2081", shift->real_stop.unit_pos + 1);
56538+
56539+ if (shift->merging_units != want) {
56540+				/* we could not copy as many as we wanted,
56541+				   so there is no reason to estimate any
56542+				   further */
56543+ return;
56544+ }
56545+
56546+ target_free_space -= size;
56547+ coord_add_item_pos(&source, shift->pend);
56548+ }
56549+ }
56550+
56551+	/* position of the first item, no part of which we want to shift */
56552+ stop_item = shift->wish_stop.item_pos + shift->pend;
56553+
56554+	/* calculate how many items can be copied whole into the given
56555+	   free space */
56556+ for (; source.item_pos != stop_item;
56557+ coord_add_item_pos(&source, shift->pend)) {
56558+ if (shift->pend == SHIFT_RIGHT)
56559+ source.unit_pos = coord_last_unit_pos(&source);
56560+
56561+ /* how many units of @source do we want to copy */
56562+ want = wanted_units(&source, &shift->wish_stop, shift->pend);
56563+
56564+ if (want == coord_last_unit_pos(&source) + 1) {
56565+ /* we want this item to be copied entirely */
56566+ size =
56567+ item_length_by_coord(&source) +
56568+ item_creation_overhead(&source);
56569+ if (size <= target_free_space) {
56570+ /* item fits into target node as whole */
56571+ target_free_space -= size;
56572+ shift->shift_bytes +=
56573+ size - item_creation_overhead(&source);
56574+ shift->entire_bytes +=
56575+ size - item_creation_overhead(&source);
56576+ shift->entire++;
56577+
56578+ /* update shift->real_stop coord to be set to
56579+ last unit of @source we can merge to
56580+ @target */
56581+ shift->real_stop = source;
56582+ if (shift->pend == SHIFT_LEFT)
56583+ shift->real_stop.unit_pos =
56584+ coord_last_unit_pos(&shift->
56585+ real_stop);
56586+ else
56587+ shift->real_stop.unit_pos = 0;
56588+ continue;
56589+ }
56590+ }
56591+
56592+	/* we reach here only for an item which does not fit into the
56593+	   target node in its entirety. This item may be either
56594+	   partially shifted, or not shifted at all. We will have to
56595+	   create a new item in the target node, so decrease the amount
56596+	   of free space by the item creation overhead. We can also
56597+	   reach here if the stop coord is in this item */
56598+ if (target_free_space >=
56599+ (unsigned)item_creation_overhead(&source)) {
56600+ target_free_space -= item_creation_overhead(&source);
56601+ iplug = item_plugin_by_coord(&source);
56602+ if (iplug->b.can_shift) {
56603+ shift->part_units = iplug->b.can_shift(target_free_space,
56604+ &source,
56605+ NULL, /* target */
56606+ shift->pend,
56607+ &size,
56608+ want);
56609+ } else {
56610+ target_free_space = 0;
56611+ shift->part_units = 0;
56612+ size = 0;
56613+ }
56614+ } else {
56615+ target_free_space = 0;
56616+ shift->part_units = 0;
56617+ size = 0;
56618+ }
56619+ shift->part_bytes = size;
56620+ shift->shift_bytes += size;
56621+
56622+ /* set @shift->real_stop to last unit of @source we can merge
56623+ to @shift->target */
56624+ if (shift->part_units) {
56625+ shift->real_stop = source;
56626+ shift->real_stop.unit_pos =
56627+ (shift->part_units - source.unit_pos -
56628+ 1) * shift->pend;
56629+ assert("nikita-2082", shift->real_stop.unit_pos + 1);
56630+ }
56631+
56632+ if (want != shift->part_units)
56633+			/* not everything wanted was shifted */
56634+ return;
56635+ break;
56636+ }
56637+
56638+ shift->everything = 1;
56639+}
56640+
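(estimate_shift() is a greedy pass: merge what fits into the boundary item, then take whole items while each fits together with its creation overhead, then take part of one more item. A stand-alone sketch of the whole-item loop's free-space accounting; the item sizes and overhead here are made up.)

	#include <stdio.h>

	int main(void)
	{
		unsigned target_free_space = 1000;
		unsigned overhead = 24;	/* assumed per-item creation overhead */
		unsigned items[] = { 300, 500, 400 }; /* hypothetical lengths */
		unsigned i, entire = 0, shift_bytes = 0;

		for (i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
			unsigned size = items[i] + overhead;

			if (size > target_free_space)
				break; /* could only be partially shifted */
			/* item fits whole: charge body plus header */
			target_free_space -= size;
			shift_bytes += items[i];
			entire++;
		}
		printf("%u items moved whole, %u body bytes, %u bytes left\n",
		       entire, shift_bytes, target_free_space);
		return 0;
	}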
56641+static void
56642+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
56643+ shift_direction dir, unsigned free_space)
56644+{
56645+ item_plugin *iplug;
56646+
56647+ assert("nikita-1463", target != NULL);
56648+ assert("nikita-1464", source != NULL);
56649+ assert("nikita-1465", from + count <= coord_num_units(source));
56650+
56651+ iplug = item_plugin_by_coord(source);
56652+ assert("nikita-1468", iplug == item_plugin_by_coord(target));
56653+ iplug->b.copy_units(target, source, from, count, dir, free_space);
56654+
56655+ if (dir == SHIFT_RIGHT) {
56656+		/* FIXME-VS: this looks unnecessary. update_item_key was
56657+		   already called by the copy_units method */
56658+ reiser4_key split_key;
56659+
56660+ assert("nikita-1469", target->unit_pos == 0);
56661+
56662+ unit_key_by_coord(target, &split_key);
56663+ node_plugin_by_coord(target)->update_item_key(target,
56664+ &split_key, NULL);
56665+ }
56666+}
56667+
56668+/* copy part of @shift->real_stop.node starting either from its beginning or
56669+ from its end and ending at @shift->real_stop to either the end or the
56670+ beginning of @shift->target */
56671+static void copy(struct shift_params *shift)
56672+{
56673+ node40_header *nh;
56674+ coord_t from;
56675+ coord_t to;
56676+ item_header40 *from_ih, *to_ih;
56677+ int free_space_start;
56678+ int new_items;
56679+ unsigned old_items;
56680+ int old_offset;
56681+ unsigned i;
56682+
56683+ nh = node40_node_header(shift->target);
56684+ free_space_start = nh40_get_free_space_start(nh);
56685+ old_items = nh40_get_num_items(nh);
56686+ new_items = shift->entire + (shift->part_units ? 1 : 0);
56687+ assert("vs-185",
56688+ shift->shift_bytes ==
56689+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
56690+
56691+ from = shift->wish_stop;
56692+
56693+ coord_init_first_unit(&to, shift->target);
56694+
56695+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
56696+ hence to.between is set to EMPTY_NODE above. Looks like we want it
56697+ to be AT_UNIT.
56698+
56699+ Oh, wonders of ->betweeness...
56700+
56701+ */
56702+ to.between = AT_UNIT;
56703+
56704+ if (shift->pend == SHIFT_LEFT) {
56705+ /* copying to left */
56706+
56707+ coord_set_item_pos(&from, 0);
56708+ from_ih = node40_ih_at(from.node, 0);
56709+
56710+ coord_set_item_pos(&to,
56711+ node40_num_of_items_internal(to.node) - 1);
56712+ if (shift->merging_units) {
56713+ /* expand last item, so that plugin methods will see
56714+ correct data */
56715+ free_space_start += shift->merging_bytes;
56716+ nh40_set_free_space_start(nh,
56717+ (unsigned)free_space_start);
56718+ nh40_set_free_space(nh,
56719+ nh40_get_free_space(nh) -
56720+ shift->merging_bytes);
56721+
56722+ /* appending last item of @target */
56723+ copy_units(&to, &from, 0, /* starting from 0-th unit */
56724+ shift->merging_units, SHIFT_LEFT,
56725+ shift->merging_bytes);
56726+ coord_inc_item_pos(&from);
56727+ from_ih--;
56728+ coord_inc_item_pos(&to);
56729+ }
56730+
56731+ to_ih = node40_ih_at(shift->target, old_items);
56732+ if (shift->entire) {
56733+ /* copy @entire items entirely */
56734+
56735+ /* copy item headers */
56736+ memcpy(to_ih - shift->entire + 1,
56737+ from_ih - shift->entire + 1,
56738+ shift->entire * sizeof(item_header40));
56739+ /* update item header offset */
56740+ old_offset = ih40_get_offset(from_ih);
56741+			/* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation fewer per iteration */
56742+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
56743+ ih40_set_offset(to_ih,
56744+ ih40_get_offset(from_ih) -
56745+ old_offset + free_space_start);
56746+
56747+ /* copy item bodies */
56748+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
56749+ shift->entire_bytes);
56750+
56751+ coord_add_item_pos(&from, (int)shift->entire);
56752+ coord_add_item_pos(&to, (int)shift->entire);
56753+ }
56754+
56755+ nh40_set_free_space_start(nh,
56756+ free_space_start +
56757+ shift->shift_bytes -
56758+ shift->merging_bytes);
56759+ nh40_set_free_space(nh,
56760+ nh40_get_free_space(nh) -
56761+ (shift->shift_bytes - shift->merging_bytes +
56762+ sizeof(item_header40) * new_items));
56763+
56764+ /* update node header */
56765+ node40_set_num_items(shift->target, nh, old_items + new_items);
56766+ assert("vs-170",
56767+ nh40_get_free_space(nh) < znode_size(shift->target));
56768+
56769+ if (shift->part_units) {
56770+ /* copy heading part (@part units) of @source item as
56771+ a new item into @target->node */
56772+
56773+ /* copy item header of partially copied item */
56774+ coord_set_item_pos(&to,
56775+ node40_num_of_items_internal(to.node)
56776+ - 1);
56777+ memcpy(to_ih, from_ih, sizeof(item_header40));
56778+ ih40_set_offset(to_ih,
56779+ nh40_get_free_space_start(nh) -
56780+ shift->part_bytes);
56781+ if (item_plugin_by_coord(&to)->b.init)
56782+ item_plugin_by_coord(&to)->b.init(&to, &from,
56783+ NULL);
56784+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
56785+ shift->part_bytes);
56786+ }
56787+
56788+ } else {
56789+ /* copying to right */
56790+
56791+ coord_set_item_pos(&from,
56792+ node40_num_of_items_internal(from.node) - 1);
56793+ from_ih = node40_ih_at_coord(&from);
56794+
56795+ coord_set_item_pos(&to, 0);
56796+
56797+ /* prepare space for new items */
56798+ memmove(zdata(to.node) + sizeof(node40_header) +
56799+ shift->shift_bytes,
56800+ zdata(to.node) + sizeof(node40_header),
56801+ free_space_start - sizeof(node40_header));
56802+ /* update item headers of moved items */
56803+ to_ih = node40_ih_at(to.node, 0);
56804+ /* first item gets @merging_bytes longer. free space appears
56805+ at its beginning */
56806+ if (!node_is_empty(to.node))
56807+ ih40_set_offset(to_ih,
56808+ ih40_get_offset(to_ih) +
56809+ shift->shift_bytes -
56810+ shift->merging_bytes);
56811+
56812+ for (i = 1; i < old_items; i++)
56813+ ih40_set_offset(to_ih - i,
56814+ ih40_get_offset(to_ih - i) +
56815+ shift->shift_bytes);
56816+
56817+ /* move item headers to make space for new items */
56818+ memmove(to_ih - old_items + 1 - new_items,
56819+ to_ih - old_items + 1,
56820+ sizeof(item_header40) * old_items);
56821+ to_ih -= (new_items - 1);
56822+
56823+ nh40_set_free_space_start(nh,
56824+ free_space_start +
56825+ shift->shift_bytes);
56826+ nh40_set_free_space(nh,
56827+ nh40_get_free_space(nh) -
56828+ (shift->shift_bytes +
56829+ sizeof(item_header40) * new_items));
56830+
56831+ /* update node header */
56832+ node40_set_num_items(shift->target, nh, old_items + new_items);
56833+ assert("vs-170",
56834+ nh40_get_free_space(nh) < znode_size(shift->target));
56835+
56836+ if (shift->merging_units) {
56837+ coord_add_item_pos(&to, new_items);
56838+ to.unit_pos = 0;
56839+ to.between = AT_UNIT;
56840+ /* prepend first item of @to */
56841+ copy_units(&to, &from,
56842+ coord_last_unit_pos(&from) -
56843+ shift->merging_units + 1,
56844+ shift->merging_units, SHIFT_RIGHT,
56845+ shift->merging_bytes);
56846+ coord_dec_item_pos(&from);
56847+ from_ih++;
56848+ }
56849+
56850+ if (shift->entire) {
56851+ /* copy @entire items entirely */
56852+
56853+ /* copy item headers */
56854+ memcpy(to_ih, from_ih,
56855+ shift->entire * sizeof(item_header40));
56856+
56857+ /* update item header offset */
56858+ old_offset =
56859+ ih40_get_offset(from_ih + shift->entire - 1);
56860+ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
56861+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
56862+ ih40_set_offset(to_ih,
56863+ ih40_get_offset(from_ih) -
56864+ old_offset +
56865+ sizeof(node40_header) +
56866+ shift->part_bytes);
56867+ /* copy item bodies */
56868+ coord_add_item_pos(&from, -(int)(shift->entire - 1));
56869+ memcpy(zdata(to.node) + sizeof(node40_header) +
56870+ shift->part_bytes, item_by_coord_node40(&from),
56871+ shift->entire_bytes);
56872+ coord_dec_item_pos(&from);
56873+ }
56874+
56875+ if (shift->part_units) {
56876+ coord_set_item_pos(&to, 0);
56877+ to.unit_pos = 0;
56878+ to.between = AT_UNIT;
56879+ /* copy heading part (@part units) of @source item as
56880+ a new item into @target->node */
56881+
56882+ /* copy item header of partially copied item */
56883+ memcpy(to_ih, from_ih, sizeof(item_header40));
56884+ ih40_set_offset(to_ih, sizeof(node40_header));
56885+ if (item_plugin_by_coord(&to)->b.init)
56886+ item_plugin_by_coord(&to)->b.init(&to, &from,
56887+ NULL);
56888+ copy_units(&to, &from,
56889+ coord_last_unit_pos(&from) -
56890+ shift->part_units + 1, shift->part_units,
56891+ SHIFT_RIGHT, shift->part_bytes);
56892+ }
56893+ }
56894+}
56895+
56896+/* remove everything either before or after @shift->real_stop. The number of
56897+   items removed completely is returned */
56898+static int delete_copied(struct shift_params *shift)
56899+{
56900+ coord_t from;
56901+ coord_t to;
56902+ struct carry_cut_data cdata;
56903+
56904+ if (shift->pend == SHIFT_LEFT) {
56905+		/* we were shifting to left, remove everything from the
56906+		   beginning of @shift->real_stop.node up to
56907+		   @shift->real_stop */
56908+ coord_init_first_unit(&from, shift->real_stop.node);
56909+ to = shift->real_stop;
56910+
56911+ /* store old coordinate of unit which will be first after
56912+ shift to left */
56913+ shift->u.future_first = to;
56914+ coord_next_unit(&shift->u.future_first);
56915+ } else {
56916+		/* we were shifting to right, remove everything from
56917+		   @shift->real_stop up to the end of
56918+		   @shift->real_stop.node */
56919+ from = shift->real_stop;
56920+ coord_init_last_unit(&to, from.node);
56921+
56922+ /* store old coordinate of unit which will be last after
56923+ shift to right */
56924+ shift->u.future_last = from;
56925+ coord_prev_unit(&shift->u.future_last);
56926+ }
56927+
56928+ cdata.params.from = &from;
56929+ cdata.params.to = &to;
56930+ cdata.params.from_key = NULL;
56931+ cdata.params.to_key = NULL;
56932+ cdata.params.smallest_removed = NULL;
56933+ return cut_node40(&cdata, NULL);
56934+}
56935+
56936+/* something was moved between @left and @right. Add a carry operation to the
56937+   @info list to have carry update the delimiting key between them */
56938+static int
56939+prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
56940+{
56941+ carry_op *op;
56942+ carry_node *cn;
56943+
56944+ if (info == NULL)
56945+ /* nowhere to send operation to. */
56946+ return 0;
56947+
56948+ if (!should_notify_parent(right))
56949+ return 0;
56950+
56951+ op = node_post_carry(info, COP_UPDATE, right, 1);
56952+ if (IS_ERR(op) || op == NULL)
56953+ return op ? PTR_ERR(op) : -EIO;
56954+
56955+ if (left != NULL) {
56956+ carry_node *reference;
56957+
56958+ if (info->doing)
56959+ reference = insert_carry_node(info->doing,
56960+ info->todo, left);
56961+ else
56962+ reference = op->node;
56963+ assert("nikita-2992", reference != NULL);
56964+		cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
56965+ if (IS_ERR(cn))
56966+ return PTR_ERR(cn);
56967+ cn->parent = 1;
56968+ cn->node = left;
56969+ if (ZF_ISSET(left, JNODE_ORPHAN))
56970+ cn->left_before = 1;
56971+ op->u.update.left = cn;
56972+ } else
56973+ op->u.update.left = NULL;
56974+ return 0;
56975+}
56976+
56977+/* plugin->u.node.prepare_removal
56978+ to delete a pointer to @empty from the tree add corresponding carry
56979+ operation (delete) to @info list */
56980+int prepare_removal_node40(znode * empty, carry_plugin_info * info)
56981+{
56982+ carry_op *op;
56983+ reiser4_tree *tree;
56984+
56985+ if (!should_notify_parent(empty))
56986+ return 0;
56987+ /* already on a road to Styx */
56988+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
56989+ return 0;
56990+ op = node_post_carry(info, COP_DELETE, empty, 1);
56991+ if (IS_ERR(op) || op == NULL)
56992+ return RETERR(op ? PTR_ERR(op) : -EIO);
56993+
56994+ op->u.delete.child = NULL;
56995+ op->u.delete.flags = 0;
56996+
56997+ /* fare thee well */
56998+ tree = znode_get_tree(empty);
56999+ read_lock_tree(tree);
57000+ write_lock_dk(tree);
57001+ znode_set_ld_key(empty, znode_get_rd_key(empty));
57002+ if (znode_is_left_connected(empty) && empty->left)
57003+ znode_set_rd_key(empty->left, znode_get_rd_key(empty));
57004+ write_unlock_dk(tree);
57005+ read_unlock_tree(tree);
57006+
57007+ ZF_SET(empty, JNODE_HEARD_BANSHEE);
57008+ return 0;
57009+}
57010+
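(The delimiting-key manipulation in prepare_removal_node40() shrinks the emptied node's key range to nothing and hands the vacated range to the left neighbor. A sketch with plain integers standing in for reiser4 keys:)

	#include <stdio.h>

	int main(void)
	{
		/* integer stand-ins for left-delimiting (ld) and
		   right-delimiting (rd) keys */
		int left_ld = 10, left_rd = 20;   /* left neighbor: [10, 20) */
		int empty_ld = 20, empty_rd = 30; /* emptied node:  [20, 30) */

		/* znode_set_ld_key(empty, znode_get_rd_key(empty)):
		   the empty node now covers the empty interval [30, 30) */
		empty_ld = empty_rd;

		/* znode_set_rd_key(empty->left, ...): the left neighbor
		   absorbs the vacated range */
		left_rd = empty_rd;

		printf("left [%d,%d)  empty [%d,%d)\n",
		       left_ld, left_rd, empty_ld, empty_rd);
		return 0;
	}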
57011+/* something was shifted from @insert_coord->node to @shift->target, update
57012+   @insert_coord correspondingly */
57013+static void
57014+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
57015+ int including_insert_coord)
57016+{
57017+ /* item plugin was invalidated by shifting */
57018+ coord_clear_iplug(insert_coord);
57019+
57020+ if (node_is_empty(shift->wish_stop.node)) {
57021+ assert("vs-242", shift->everything);
57022+ if (including_insert_coord) {
57023+ if (shift->pend == SHIFT_RIGHT) {
57024+ /* set @insert_coord before first unit of
57025+ @shift->target node */
57026+ coord_init_before_first_item(insert_coord,
57027+ shift->target);
57028+ } else {
57029+				/* set @insert_coord after the last item in the target node */
57030+ coord_init_after_last_item(insert_coord,
57031+ shift->target);
57032+ }
57033+ } else {
57034+ /* set @insert_coord inside of empty node. There is
57035+ only one possible coord within an empty
57036+ node. init_first_unit will set that coord */
57037+ coord_init_first_unit(insert_coord,
57038+ shift->wish_stop.node);
57039+ }
57040+ return;
57041+ }
57042+
57043+ if (shift->pend == SHIFT_RIGHT) {
57044+ /* there was shifting to right */
57045+ if (shift->everything) {
57046+ /* everything wanted was shifted */
57047+ if (including_insert_coord) {
57048+ /* @insert_coord is set before first unit of
57049+ @to node */
57050+ coord_init_before_first_item(insert_coord,
57051+ shift->target);
57052+ insert_coord->between = BEFORE_UNIT;
57053+ } else {
57054+ /* @insert_coord is set after last unit of
57055+ @insert->node */
57056+ coord_init_last_unit(insert_coord,
57057+ shift->wish_stop.node);
57058+ insert_coord->between = AFTER_UNIT;
57059+ }
57060+ }
57061+ return;
57062+ }
57063+
57064+ /* there was shifting to left */
57065+ if (shift->everything) {
57066+ /* everything wanted was shifted */
57067+ if (including_insert_coord) {
57068+ /* @insert_coord is set after last unit in @to node */
57069+ coord_init_after_last_item(insert_coord, shift->target);
57070+ } else {
57071+ /* @insert_coord is set before first unit in the same
57072+ node */
57073+ coord_init_before_first_item(insert_coord,
57074+ shift->wish_stop.node);
57075+ }
57076+ return;
57077+ }
57078+
57079+ /* FIXME-VS: the code below is complicated because with between ==
57080+ AFTER_ITEM unit_pos is set to 0 */
57081+
57082+ if (!removed) {
57083+ /* no items were shifted entirely */
57084+ assert("vs-195", shift->merging_units == 0
57085+ || shift->part_units == 0);
57086+
57087+ if (shift->real_stop.item_pos == insert_coord->item_pos) {
57088+ if (shift->merging_units) {
57089+ if (insert_coord->between == AFTER_UNIT) {
57090+ assert("nikita-1441",
57091+ insert_coord->unit_pos >=
57092+ shift->merging_units);
57093+ insert_coord->unit_pos -=
57094+ shift->merging_units;
57095+ } else if (insert_coord->between == BEFORE_UNIT) {
57096+ assert("nikita-2090",
57097+ insert_coord->unit_pos >
57098+ shift->merging_units);
57099+ insert_coord->unit_pos -=
57100+ shift->merging_units;
57101+ }
57102+
57103+ assert("nikita-2083",
57104+ insert_coord->unit_pos + 1);
57105+ } else {
57106+ if (insert_coord->between == AFTER_UNIT) {
57107+ assert("nikita-1442",
57108+ insert_coord->unit_pos >=
57109+ shift->part_units);
57110+ insert_coord->unit_pos -=
57111+ shift->part_units;
57112+ } else if (insert_coord->between == BEFORE_UNIT) {
57113+ assert("nikita-2089",
57114+ insert_coord->unit_pos >
57115+ shift->part_units);
57116+ insert_coord->unit_pos -=
57117+ shift->part_units;
57118+ }
57119+
57120+ assert("nikita-2084",
57121+ insert_coord->unit_pos + 1);
57122+ }
57123+ }
57124+ return;
57125+ }
57126+
57127+	/* we shifted to left and there was not enough space for everything */
57128+ switch (insert_coord->between) {
57129+ case AFTER_UNIT:
57130+ case BEFORE_UNIT:
57131+ if (shift->real_stop.item_pos == insert_coord->item_pos)
57132+ insert_coord->unit_pos -= shift->part_units;
57133+ case AFTER_ITEM:
57134+ coord_add_item_pos(insert_coord, -removed);
57135+ break;
57136+ default:
57137+ impossible("nikita-2087", "not ready");
57138+ }
57139+ assert("nikita-2085", insert_coord->unit_pos + 1);
57140+}
57141+
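(After a left shift, a coordinate that stays in the source node slides left by the number of wholly removed items, and additionally loses units if it sits in the item that was split, which is the essence of the adjust_coord() arithmetic above. A heavily simplified stand-alone sketch with hypothetical positions:)

	#include <stdio.h>

	int main(void)
	{
		int item_pos = 2, unit_pos = 6; /* hypothetical insert coord */
		int removed = 2;     /* items shifted out in their entirety */
		int part_units = 4;  /* units shifted out of the split item */
		int split_item = 2;  /* position of the split item before shift */

		/* items wholly moved to the left neighbor vanish */
		item_pos -= removed;
		/* units are lost only if the coord is in the split item */
		if (item_pos == split_item - removed)
			unit_pos -= part_units;

		printf("coord now at item %d, unit %d\n", item_pos, unit_pos);
		return 0;
	}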
57142+static int call_shift_hooks(struct shift_params *shift)
57143+{
57144+ unsigned i, shifted;
57145+ coord_t coord;
57146+ item_plugin *iplug;
57147+
57148+ assert("vs-275", !node_is_empty(shift->target));
57149+
57150+ /* number of items shift touches */
57151+ shifted =
57152+ shift->entire + (shift->merging_units ? 1 : 0) +
57153+ (shift->part_units ? 1 : 0);
57154+
57155+ if (shift->pend == SHIFT_LEFT) {
57156+ /* moved items are at the end */
57157+ coord_init_last_unit(&coord, shift->target);
57158+ coord.unit_pos = 0;
57159+
57160+ assert("vs-279", shift->pend == 1);
57161+ for (i = 0; i < shifted; i++) {
57162+ unsigned from, count;
57163+
57164+ iplug = item_plugin_by_coord(&coord);
57165+ if (i == 0 && shift->part_units) {
57166+ assert("vs-277",
57167+ coord_num_units(&coord) ==
57168+ shift->part_units);
57169+ count = shift->part_units;
57170+ from = 0;
57171+ } else if (i == shifted - 1 && shift->merging_units) {
57172+ count = shift->merging_units;
57173+ from = coord_num_units(&coord) - count;
57174+ } else {
57175+ count = coord_num_units(&coord);
57176+ from = 0;
57177+ }
57178+
57179+ if (iplug->b.shift_hook) {
57180+ iplug->b.shift_hook(&coord, from, count,
57181+ shift->wish_stop.node);
57182+ }
57183+ coord_add_item_pos(&coord, -shift->pend);
57184+ }
57185+ } else {
57186+ /* moved items are at the beginning */
57187+ coord_init_first_unit(&coord, shift->target);
57188+
57189+ assert("vs-278", shift->pend == -1);
57190+ for (i = 0; i < shifted; i++) {
57191+ unsigned from, count;
57192+
57193+ iplug = item_plugin_by_coord(&coord);
57194+ if (i == 0 && shift->part_units) {
57195+ assert("vs-277",
57196+ coord_num_units(&coord) ==
57197+ shift->part_units);
57198+ count = coord_num_units(&coord);
57199+ from = 0;
57200+ } else if (i == shifted - 1 && shift->merging_units) {
57201+ count = shift->merging_units;
57202+ from = 0;
57203+ } else {
57204+ count = coord_num_units(&coord);
57205+ from = 0;
57206+ }
57207+
57208+ if (iplug->b.shift_hook) {
57209+ iplug->b.shift_hook(&coord, from, count,
57210+ shift->wish_stop.node);
57211+ }
57212+ coord_add_item_pos(&coord, -shift->pend);
57213+ }
57214+ }
57215+
57216+ return 0;
57217+}
57218+
57219+/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
57220+static int
57221+unit_moved_left(const struct shift_params *shift, const coord_t * old)
57222+{
57223+ assert("vs-944", shift->real_stop.node == old->node);
57224+
57225+ if (shift->real_stop.item_pos < old->item_pos)
57226+ return 0;
57227+ if (shift->real_stop.item_pos == old->item_pos) {
57228+ if (shift->real_stop.unit_pos < old->unit_pos)
57229+ return 0;
57230+ }
57231+ return 1;
57232+}
57233+
57234+/* shift to right is completed. Return 1 if unit @old was moved to right
57235+ neighbor */
57236+static int
57237+unit_moved_right(const struct shift_params *shift, const coord_t * old)
57238+{
57239+ assert("vs-944", shift->real_stop.node == old->node);
57240+
57241+ if (shift->real_stop.item_pos > old->item_pos)
57242+ return 0;
57243+ if (shift->real_stop.item_pos == old->item_pos) {
57244+ if (shift->real_stop.unit_pos > old->unit_pos)
57245+ return 0;
57246+ }
57247+ return 1;
57248+}
57249+
57250+/* coord @old was set in the node from which the shift was performed. What was
57251+   shifted is stored in @shift. Compute in @new the position @old maps to after the shift */
57252+static coord_t *adjust_coord2(const struct shift_params *shift,
57253+ const coord_t * old, coord_t * new)
57254+{
57255+ coord_clear_iplug(new);
57256+ new->between = old->between;
57257+
57258+ coord_clear_iplug(new);
57259+ if (old->node == shift->target) {
57260+ if (shift->pend == SHIFT_LEFT) {
57261+ /* coord which is set inside of left neighbor does not
57262+ change during shift to left */
57263+ coord_dup(new, old);
57264+ return new;
57265+ }
57266+ new->node = old->node;
57267+ coord_set_item_pos(new,
57268+ old->item_pos + shift->entire +
57269+ (shift->part_units ? 1 : 0));
57270+ new->unit_pos = old->unit_pos;
57271+ if (old->item_pos == 0 && shift->merging_units)
57272+ new->unit_pos += shift->merging_units;
57273+ return new;
57274+ }
57275+
57276+ assert("vs-977", old->node == shift->wish_stop.node);
57277+ if (shift->pend == SHIFT_LEFT) {
57278+ if (unit_moved_left(shift, old)) {
57279+ /* unit @old moved to left neighbor. Calculate its
57280+ coordinate there */
57281+ new->node = shift->target;
57282+ coord_set_item_pos(new,
57283+ node_num_items(shift->target) -
57284+ shift->entire -
57285+ (shift->part_units ? 1 : 0) +
57286+ old->item_pos);
57287+
57288+ new->unit_pos = old->unit_pos;
57289+ if (shift->merging_units) {
57290+ coord_dec_item_pos(new);
57291+ if (old->item_pos == 0) {
57292+ /* unit_pos only changes if item got
57293+ merged */
57294+ new->unit_pos =
57295+ coord_num_units(new) -
57296+ (shift->merging_units -
57297+ old->unit_pos);
57298+ }
57299+ }
57300+ } else {
57301+ /* unit @old did not move to left neighbor.
57302+
57303+ Use _nocheck, because @old is outside of its node.
57304+ */
57305+ coord_dup_nocheck(new, old);
57306+ coord_add_item_pos(new,
57307+ -shift->u.future_first.item_pos);
57308+ if (new->item_pos == 0)
57309+ new->unit_pos -= shift->u.future_first.unit_pos;
57310+ }
57311+ } else {
57312+ if (unit_moved_right(shift, old)) {
57313+ /* unit @old moved to right neighbor */
57314+ new->node = shift->target;
57315+ coord_set_item_pos(new,
57316+ old->item_pos -
57317+ shift->real_stop.item_pos);
57318+ if (new->item_pos == 0) {
57319+ /* unit @old might change unit pos */
57320+ coord_set_item_pos(new,
57321+ old->unit_pos -
57322+ shift->real_stop.unit_pos);
57323+ }
57324+ } else {
57325+ /* unit @old did not move to right neighbor, therefore
57326+ it did not change */
57327+ coord_dup(new, old);
57328+ }
57329+ }
57330+ coord_set_iplug(new, item_plugin_by_coord(new));
57331+ return new;
57332+}
57333+
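For intuition, a toy model of the index arithmetic used above for a completed left shift (plain ints, not reiser4 types; it ignores the merged-item adjustment handled separately):

	/* where does source item `old_pos` land in the left neighbor, given
	 * `entire` whole items and `part` (0 or 1) partial items were moved?
	 * The moved run occupies the last (entire + part) slots of the grown
	 * target, so its first element sits at num_items - entire - part. */
	static int left_pos_after_shift(int target_items_after_shift,
					int entire, int part, int old_pos)
	{
		return target_items_after_shift - entire - part + old_pos;
	}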
57334+/* this is called when shift is completed (something of source node is copied
57335+ to target and deleted in source) to update all taps set in current
57336+ context */
57337+static void update_taps(const struct shift_params *shift)
57338+{
57339+ tap_t *tap;
57340+ coord_t new;
57341+
57342+ for_all_taps(tap) {
57343+ /* update only taps set to nodes participating in shift */
57344+ if (tap->coord->node == shift->wish_stop.node
57345+ || tap->coord->node == shift->target)
57346+ tap_to_coord(tap,
57347+ adjust_coord2(shift, tap->coord, &new));
57348+ }
57349+}
57350+
57351+#if REISER4_DEBUG
57352+
57353+struct shift_check {
57354+ reiser4_key key;
57355+ __u16 plugin_id;
57356+ union {
57357+ __u64 bytes;
57358+ __u64 entries;
57359+ void *unused;
57360+ } u;
57361+};
57362+
57363+void *shift_check_prepare(const znode * left, const znode * right)
57364+{
57365+ pos_in_node_t i, nr_items;
57366+ int mergeable;
57367+ struct shift_check *data;
57368+ item_header40 *ih;
57369+
57370+ if (node_is_empty(left) || node_is_empty(right))
57371+ mergeable = 0;
57372+ else {
57373+ coord_t l, r;
57374+
57375+ coord_init_last_unit(&l, left);
57376+ coord_init_first_unit(&r, right);
57377+ mergeable = are_items_mergeable(&l, &r);
57378+ }
57379+ nr_items =
57380+ node40_num_of_items_internal(left) +
57381+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57382+ data =
57383+ kmalloc(sizeof(struct shift_check) * nr_items,
57384+ reiser4_ctx_gfp_mask_get());
57385+ if (data != NULL) {
57386+ coord_t coord;
57387+ pos_in_node_t item_pos;
57388+
57389+ coord_init_first_unit(&coord, left);
57390+ i = 0;
57391+
57392+ for (item_pos = 0;
57393+ item_pos < node40_num_of_items_internal(left);
57394+ item_pos++) {
57395+
57396+ coord_set_item_pos(&coord, item_pos);
57397+ ih = node40_ih_at_coord(&coord);
57398+
57399+ data[i].key = ih->key;
57400+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57401+ switch (data[i].plugin_id) {
57402+ case CTAIL_ID:
57403+ case FORMATTING_ID:
57404+ data[i].u.bytes = coord_num_units(&coord);
57405+ break;
57406+ case EXTENT_POINTER_ID:
57407+ data[i].u.bytes =
57408+ reiser4_extent_size(&coord,
57409+ coord_num_units(&coord));
57410+ break;
57411+ case COMPOUND_DIR_ID:
57412+ data[i].u.entries = coord_num_units(&coord);
57413+ break;
57414+ default:
57415+ data[i].u.unused = NULL;
57416+ break;
57417+ }
57418+ i++;
57419+ }
57420+
57421+ coord_init_first_unit(&coord, right);
57422+
57423+ if (mergeable) {
57424+ assert("vs-1609", i != 0);
57425+
57426+ ih = node40_ih_at_coord(&coord);
57427+
57428+ assert("vs-1589",
57429+ data[i - 1].plugin_id ==
57430+ le16_to_cpu(get_unaligned(&ih->plugin_id)));
57431+ switch (data[i - 1].plugin_id) {
57432+ case CTAIL_ID:
57433+ case FORMATTING_ID:
57434+ data[i - 1].u.bytes += coord_num_units(&coord);
57435+ break;
57436+ case EXTENT_POINTER_ID:
57437+ data[i - 1].u.bytes +=
57438+				    reiser4_extent_size(&coord,
57439+ coord_num_units(&coord));
57440+ break;
57441+ case COMPOUND_DIR_ID:
57442+ data[i - 1].u.entries +=
57443+ coord_num_units(&coord);
57444+ break;
57445+ default:
57446+ impossible("vs-1605", "wrong mergeable item");
57447+ break;
57448+ }
57449+ item_pos = 1;
57450+ } else
57451+ item_pos = 0;
57452+ for (; item_pos < node40_num_of_items_internal(right);
57453+ item_pos++) {
57454+
57455+ assert("vs-1604", i < nr_items);
57456+ coord_set_item_pos(&coord, item_pos);
57457+ ih = node40_ih_at_coord(&coord);
57458+
57459+ data[i].key = ih->key;
57460+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57461+ switch (data[i].plugin_id) {
57462+ case CTAIL_ID:
57463+ case FORMATTING_ID:
57464+ data[i].u.bytes = coord_num_units(&coord);
57465+ break;
57466+ case EXTENT_POINTER_ID:
57467+ data[i].u.bytes =
57468+				    reiser4_extent_size(&coord,
57469+ coord_num_units(&coord));
57470+ break;
57471+ case COMPOUND_DIR_ID:
57472+ data[i].u.entries = coord_num_units(&coord);
57473+ break;
57474+ default:
57475+ data[i].u.unused = NULL;
57476+ break;
57477+ }
57478+ i++;
57479+ }
57480+ assert("vs-1606", i == nr_items);
57481+ }
57482+ return data;
57483+}
57484+
57485+void shift_check(void *vp, const znode * left, const znode * right)
57486+{
57487+ pos_in_node_t i, nr_items;
57488+ coord_t coord;
57489+ __u64 last_bytes;
57490+ int mergeable;
57491+ item_header40 *ih;
57492+ pos_in_node_t item_pos;
57493+ struct shift_check *data;
57494+
57495+ data = (struct shift_check *)vp;
57496+
57497+ if (data == NULL)
57498+ return;
57499+
57500+ if (node_is_empty(left) || node_is_empty(right))
57501+ mergeable = 0;
57502+ else {
57503+ coord_t l, r;
57504+
57505+ coord_init_last_unit(&l, left);
57506+ coord_init_first_unit(&r, right);
57507+ mergeable = are_items_mergeable(&l, &r);
57508+ }
57509+
57510+ nr_items =
57511+ node40_num_of_items_internal(left) +
57512+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57513+
57514+ i = 0;
57515+ last_bytes = 0;
57516+
57517+ coord_init_first_unit(&coord, left);
57518+
57519+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
57520+ item_pos++) {
57521+
57522+ coord_set_item_pos(&coord, item_pos);
57523+ ih = node40_ih_at_coord(&coord);
57524+
57525+ assert("vs-1611", i == item_pos);
57526+ assert("vs-1590", keyeq(&ih->key, &data[i].key));
57527+ assert("vs-1591",
57528+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57529+ if ((i < (node40_num_of_items_internal(left) - 1))
57530+ || !mergeable) {
57531+ switch (data[i].plugin_id) {
57532+ case CTAIL_ID:
57533+ case FORMATTING_ID:
57534+ assert("vs-1592",
57535+ data[i].u.bytes ==
57536+ coord_num_units(&coord));
57537+ break;
57538+ case EXTENT_POINTER_ID:
57539+ assert("vs-1593",
57540+ data[i].u.bytes ==
57541+ reiser4_extent_size(&coord,
57542+ coord_num_units
57543+ (&coord)));
57544+ break;
57545+ case COMPOUND_DIR_ID:
57546+ assert("vs-1594",
57547+ data[i].u.entries ==
57548+ coord_num_units(&coord));
57549+ break;
57550+ default:
57551+ break;
57552+ }
57553+ }
57554+ if (item_pos == (node40_num_of_items_internal(left) - 1)
57555+ && mergeable) {
57556+ switch (data[i].plugin_id) {
57557+ case CTAIL_ID:
57558+ case FORMATTING_ID:
57559+ last_bytes = coord_num_units(&coord);
57560+ break;
57561+ case EXTENT_POINTER_ID:
57562+ last_bytes =
57563+				    reiser4_extent_size(&coord,
57564+ coord_num_units(&coord));
57565+ break;
57566+ case COMPOUND_DIR_ID:
57567+ last_bytes = coord_num_units(&coord);
57568+ break;
57569+ default:
57570+ impossible("vs-1595", "wrong mergeable item");
57571+ break;
57572+ }
57573+ }
57574+ i++;
57575+ }
57576+
57577+ coord_init_first_unit(&coord, right);
57578+ if (mergeable) {
57579+ ih = node40_ih_at_coord(&coord);
57580+
57581+ assert("vs-1589",
57582+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
57583+ assert("vs-1608", last_bytes != 0);
57584+ switch (data[i - 1].plugin_id) {
57585+ case CTAIL_ID:
57586+ case FORMATTING_ID:
57587+ assert("vs-1596",
57588+ data[i - 1].u.bytes ==
57589+ last_bytes + coord_num_units(&coord));
57590+ break;
57591+
57592+ case EXTENT_POINTER_ID:
57593+ assert("vs-1597",
57594+ data[i - 1].u.bytes ==
57595+ last_bytes + reiser4_extent_size(&coord,
57596+ coord_num_units
57597+ (&coord)));
57598+ break;
57599+
57600+ case COMPOUND_DIR_ID:
57601+ assert("vs-1598",
57602+ data[i - 1].u.bytes ==
57603+ last_bytes + coord_num_units(&coord));
57604+ break;
57605+ default:
57606+ impossible("vs-1599", "wrong mergeable item");
57607+ break;
57608+ }
57609+ item_pos = 1;
57610+ } else
57611+ item_pos = 0;
57612+
57613+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
57614+
57615+ coord_set_item_pos(&coord, item_pos);
57616+ ih = node40_ih_at_coord(&coord);
57617+
57618+ assert("vs-1612", keyeq(&ih->key, &data[i].key));
57619+ assert("vs-1613",
57620+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57621+ switch (data[i].plugin_id) {
57622+ case CTAIL_ID:
57623+ case FORMATTING_ID:
57624+ assert("vs-1600",
57625+ data[i].u.bytes == coord_num_units(&coord));
57626+ break;
57627+ case EXTENT_POINTER_ID:
57628+ assert("vs-1601",
57629+ data[i].u.bytes ==
57630+ reiser4_extent_size(&coord,
57631+ coord_num_units
57632+ (&coord)));
57633+ break;
57634+ case COMPOUND_DIR_ID:
57635+ assert("vs-1602",
57636+ data[i].u.entries == coord_num_units(&coord));
57637+ break;
57638+ default:
57639+ break;
57640+ }
57641+ i++;
57642+ }
57643+
57644+ assert("vs-1603", i == nr_items);
57645+ kfree(data);
57646+}
57647+
57648+#endif
57649+
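A minimal sketch of how the two REISER4_DEBUG hooks above are meant to bracket a shift (the wrapper itself is hypothetical; shift_node40() is defined below):

	#if REISER4_DEBUG
	/* snapshot both nodes, do the shift, then verify that no item bytes,
	 * entries or keys were lost or duplicated across the node boundary */
	static int shift_left_checked(coord_t *from, znode *left, znode *right,
				      carry_plugin_info *info)
	{
		void *vp = shift_check_prepare(left, right);
		int ret = shift_node40(from, left, SHIFT_LEFT, 0, 0, info);

		shift_check(vp, left, right);	/* also kfree()s the snapshot */
		return ret;
	}
	#endif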
57650+/* plugin->u.node.shift
57651+ look for description of this method in plugin/node/node.h */
57652+int shift_node40(coord_t * from, znode * to, shift_direction pend,
57653+		 int delete_child /* if @from->node becomes empty it will be deleted from the tree if this is set to 1 */,
57654+		 int including_stop_coord, carry_plugin_info * info)
57655+{
57656+ struct shift_params shift;
57657+ int result;
57658+ znode *left, *right;
57659+ znode *source;
57660+ int target_empty;
57661+
57662+ assert("nikita-2161", coord_check(from));
57663+
57664+ memset(&shift, 0, sizeof(shift));
57665+ shift.pend = pend;
57666+ shift.wish_stop = *from;
57667+ shift.target = to;
57668+
57669+ assert("nikita-1473", znode_is_write_locked(from->node));
57670+ assert("nikita-1474", znode_is_write_locked(to));
57671+
57672+ source = from->node;
57673+
57674+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want
57675+ shifted */
57676+ if (pend == SHIFT_LEFT) {
57677+ result = coord_set_to_left(&shift.wish_stop);
57678+ left = to;
57679+ right = from->node;
57680+ } else {
57681+ result = coord_set_to_right(&shift.wish_stop);
57682+ left = from->node;
57683+ right = to;
57684+ }
57685+
57686+ if (result) {
57687+ /* move insertion coord even if there is nothing to move */
57688+ if (including_stop_coord) {
57689+ /* move insertion coord (@from) */
57690+ if (pend == SHIFT_LEFT) {
57691+ /* after last item in target node */
57692+ coord_init_after_last_item(from, to);
57693+ } else {
57694+ /* before first item in target node */
57695+ coord_init_before_first_item(from, to);
57696+ }
57697+ }
57698+
57699+ if (delete_child && node_is_empty(shift.wish_stop.node))
57700+ result =
57701+ prepare_removal_node40(shift.wish_stop.node, info);
57702+ else
57703+ result = 0;
57704+ /* there is nothing to shift */
57705+ assert("nikita-2078", coord_check(from));
57706+ return result;
57707+ }
57708+
57709+ target_empty = node_is_empty(to);
57710+
57711+	/* when the first node plugin with item body compression is
57712+	   implemented, this must be changed to call a node-specific plugin */
57713+
57714+	/* shift.real_stop is updated to the last unit which will really be
57715+	   shifted */
57716+ estimate_shift(&shift, get_current_context());
57717+ if (!shift.shift_bytes) {
57718+ /* we could not shift anything */
57719+ assert("nikita-2079", coord_check(from));
57720+ return 0;
57721+ }
57722+
57723+ copy(&shift);
57724+
57725+ /* result value of this is important. It is used by adjust_coord below */
57726+ result = delete_copied(&shift);
57727+
57728+ assert("vs-1610", result >= 0);
57729+ assert("vs-1471",
57730+ ((reiser4_context *) current->journal_info)->magic ==
57731+ context_magic);
57732+
57733+	/* an item which has been moved from one node to another might want to
57734+	   do something on that event. This can be done by the item's shift_hook
57735+	   method, which is now called for every moved item */
57736+ call_shift_hooks(&shift);
57737+
57738+ assert("vs-1472",
57739+ ((reiser4_context *) current->journal_info)->magic ==
57740+ context_magic);
57741+
57742+ update_taps(&shift);
57743+
57744+ assert("vs-1473",
57745+ ((reiser4_context *) current->journal_info)->magic ==
57746+ context_magic);
57747+
57748+ /* adjust @from pointer in accordance with @including_stop_coord flag
57749+ and amount of data which was really shifted */
57750+ adjust_coord(from, &shift, result, including_stop_coord);
57751+
57752+ if (target_empty)
57753+ /*
57754+ * items were shifted into empty node. Update delimiting key.
57755+ */
57756+ result = prepare_for_update(NULL, left, info);
57757+
57758+ /* add update operation to @info, which is the list of operations to
57759+ be performed on a higher level */
57760+ result = prepare_for_update(left, right, info);
57761+ if (!result && node_is_empty(source) && delete_child) {
57762+		/* all contents of @from->node are moved to @to and @from->node
57763+		   has to be removed from the tree, so, on a higher level, we
57764+		   will be removing the pointer to node @from->node */
57765+ result = prepare_removal_node40(source, info);
57766+ }
57767+ assert("nikita-2080", coord_check(from));
57768+ return result ? result : (int)shift.shift_bytes;
57769+}
57770+
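Hedged usage sketch (the wrapper name is invented): the return value above is negative on error and otherwise the number of bytes that were shifted:

	static int try_shift_left(coord_t *from, znode *target,
				  carry_plugin_info *info)
	{
		int nr_shifted = shift_node40(from, target, SHIFT_LEFT,
					      1 /* delete @from->node if emptied */,
					      0 /* leave the insert coord alone */,
					      info);
		if (nr_shifted < 0)
			return nr_shifted;	/* error from delete/update ops */
		/* nr_shifted == 0 means nothing fit into @target */
		return nr_shifted;
	}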
57771+/* plugin->u.node.fast_insert()
57772+ look for description of this method in plugin/node/node.h */
57773+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57774+{
57775+ return 1;
57776+}
57777+
57778+/* plugin->u.node.fast_paste()
57779+ look for description of this method in plugin/node/node.h */
57780+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57781+{
57782+ return 1;
57783+}
57784+
57785+/* plugin->u.node.fast_cut()
57786+ look for description of this method in plugin/node/node.h */
57787+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57788+{
57789+ return 1;
57790+}
57791+
57792+/* plugin->u.node.modify - not defined */
57793+
57794+/* plugin->u.node.max_item_size */
57795+int max_item_size_node40(void)
57796+{
57797+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
57798+ sizeof(item_header40);
57799+}
57800+
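With the packed on-disk structs from node40.h below (28-byte node header; 30-byte item header: a 24-byte key plus three 16-bit fields), the bound is concrete; e.g. for a 4096-byte block:

	/* max_item_size = 4096 - 28 - 30 = 4038 bytes */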
57801+/* plugin->u.node.set_item_plugin */
57802+int set_item_plugin_node40(coord_t *coord, item_id id)
57803+{
57804+ item_header40 *ih;
57805+
57806+ ih = node40_ih_at_coord(coord);
57807+ put_unaligned(cpu_to_le16(id), &ih->plugin_id);
57808+ coord->iplugid = id;
57809+ return 0;
57810+}
57811+
57812+/*
57813+ Local variables:
57814+ c-indentation-style: "K&R"
57815+ mode-name: "LC"
57816+ c-basic-offset: 8
57817+ tab-width: 8
57818+ fill-column: 120
57819+ scroll-step: 1
57820+ End:
57821+*/
57822diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node40.h linux-2.6.22/fs/reiser4/plugin/node/node40.h
57823--- linux-2.6.22.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 03:00:00.000000000 +0300
57824+++ linux-2.6.22/fs/reiser4/plugin/node/node40.h 2007-07-29 00:25:34.988725466 +0400
57825@@ -0,0 +1,125 @@
57826+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57827+
57828+#if !defined( __REISER4_NODE40_H__ )
57829+#define __REISER4_NODE40_H__
57830+
57831+#include "../../forward.h"
57832+#include "../../dformat.h"
57833+#include "node.h"
57834+
57835+#include <linux/types.h>
57836+
57837+/* format of node header for 40 node layouts. Keep bloat out of this struct. */
57838+typedef struct node40_header {
57839+ /* identifier of node plugin. Must be located at the very beginning
57840+ of a node. */
57841+ common_node_header common_header; /* this is 16 bits */
57842+ /* number of items. Should be first element in the node header,
57843+ because we haven't yet finally decided whether it shouldn't go into
57844+ common_header.
57845+ */
57846+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
57847+ * node format at compile time, and it is this one, accesses to these fields do not
57848+ * go through a function dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
57849+ d16 nr_items;
57850+ /* free space in node measured in bytes */
57851+ d16 free_space;
57852+ /* offset to start of free space in node */
57853+ d16 free_space_start;
57854+ /* for reiser4_fsck. When information about what is a free
57855+ block is corrupted, and we try to recover everything even
57856+ if marked as freed, then old versions of data may
57857+ duplicate newer versions, and this field allows us to
57858+ restore the newer version. Also useful for when users
57859+ who don't have the new trashcan installed on their linux distro
57860+ delete the wrong files and send us desperate emails
57861+ offering $25 for them back. */
57862+
57863+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
57864+ d32 magic;
57865+ /* flushstamp is made of mk_id and write_counter. mk_id is an
57866+ id generated randomly at mkreiserfs time. So we can just
57867+ skip all nodes with different mk_id. write_counter is d64
57868+ incrementing counter of writes on disk. It is used for
57869+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
57870+
57871+ d32 mkfs_id;
57872+ d64 flush_id;
57873+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
57874+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
57875+ d16 flags;
57876+
57877+ /* 1 is leaf level, 2 is twig level, root is the numerically
57878+ largest level */
57879+ d8 level;
57880+
57881+ d8 pad;
57882+} PACKED node40_header;
57883+
57884+/* item headers are not standard across all node layouts, pass
57885+ pos_in_node to functions instead */
57886+typedef struct item_header40 {
57887+ /* key of item */
57888+ /* 0 */ reiser4_key key;
57889+ /* offset from start of a node measured in 8-byte chunks */
57890+ /* 24 */ d16 offset;
57891+ /* 26 */ d16 flags;
57892+ /* 28 */ d16 plugin_id;
57893+} PACKED item_header40;
57894+
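Because item_header40 is packed and stored little-endian on disk, its fields are read through unaligned accessors; a small illustrative helper (the name is invented, but shift_check() earlier in this patch uses exactly this idiom):

	static inline __u16 ih_plugin_id(const item_header40 *ih)
	{
		/* the header may sit at an odd offset inside the node buffer */
		return le16_to_cpu(get_unaligned(&ih->plugin_id));
	}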
57895+size_t item_overhead_node40(const znode * node, flow_t * aflow);
57896+size_t free_space_node40(znode * node);
57897+node_search_result lookup_node40(znode * node, const reiser4_key * key,
57898+ lookup_bias bias, coord_t * coord);
57899+int num_of_items_node40(const znode * node);
57900+char *item_by_coord_node40(const coord_t * coord);
57901+int length_by_coord_node40(const coord_t * coord);
57902+item_plugin *plugin_by_coord_node40(const coord_t * coord);
57903+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
57904+size_t estimate_node40(znode * node);
57905+int check_node40(const znode * node, __u32 flags, const char **error);
57906+int parse_node40(znode * node);
57907+int init_node40(znode * node);
57908+#ifdef GUESS_EXISTS
57909+int guess_node40(const znode * node);
57910+#endif
57911+void change_item_size_node40(coord_t * coord, int by);
57912+int create_item_node40(coord_t * target, const reiser4_key * key,
57913+ reiser4_item_data * data, carry_plugin_info * info);
57914+void update_item_key_node40(coord_t * target, const reiser4_key * key,
57915+ carry_plugin_info * info);
57916+int kill_node40(struct carry_kill_data *, carry_plugin_info *);
57917+int cut_node40(struct carry_cut_data *, carry_plugin_info *);
57918+int shift_node40(coord_t * from, znode * to, shift_direction pend,
57919+ /* if @from->node becomes
57920+ empty - it will be deleted from
57921+ the tree if this is set to 1
57922+ */
57923+ int delete_child, int including_stop_coord,
57924+ carry_plugin_info * info);
57925+
57926+int fast_insert_node40(const coord_t * coord);
57927+int fast_paste_node40(const coord_t * coord);
57928+int fast_cut_node40(const coord_t * coord);
57929+int max_item_size_node40(void);
57930+int prepare_removal_node40(znode * empty, carry_plugin_info * info);
57931+int set_item_plugin_node40(coord_t * coord, item_id id);
57932+int shrink_item_node40(coord_t * coord, int delta);
57933+
57934+#if REISER4_DEBUG
57935+void *shift_check_prepare(const znode *left, const znode *right);
57936+void shift_check(void *vp, const znode *left, const znode *right);
57937+#endif
57938+
57939+/* __REISER4_NODE40_H__ */
57940+#endif
57941+/*
57942+ Local variables:
57943+ c-indentation-style: "K&R"
57944+ mode-name: "LC"
57945+ c-basic-offset: 8
57946+ tab-width: 8
57947+ fill-column: 120
57948+ scroll-step: 1
57949+ End:
57950+*/
57951diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node.c linux-2.6.22/fs/reiser4/plugin/node/node.c
57952--- linux-2.6.22.orig/fs/reiser4/plugin/node/node.c 1970-01-01 03:00:00.000000000 +0300
57953+++ linux-2.6.22/fs/reiser4/plugin/node/node.c 2007-07-29 00:25:34.988725466 +0400
57954@@ -0,0 +1,131 @@
57955+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57956+
57957+/* Node plugin interface.
57958+
57959+ Description: The tree provides the abstraction of flows, which it
57960+ internally fragments into items which it stores in nodes.
57961+
57962+ A key_atom is a piece of data bound to a single key.
57963+
57964+ For reasonable space efficiency to be achieved it is often
57965+ necessary to store key_atoms in the nodes in the form of items, where
57966+ an item is a sequence of key_atoms of the same or similar type. It is
57967+ more space-efficient, because the item can implement (very)
57968+ efficient compression of key_atom's bodies using internal knowledge
57969+ about their semantics, and it can often avoid having a key for each
57970+ key_atom. Each type of item has specific operations implemented by its
57971+ item handler (see balance.c).
57972+
57973+ Rationale: the rest of the code (specifically balancing routines)
57974+ accesses leaf level nodes through this interface. This way we can
57975+ implement various block layouts and even combine various layouts
57976+ within the same tree. Balancing/allocating algorithms should not
57977+ care about peculiarities of splitting/merging specific item types,
57978+ but rather should leave that to the item's item handler.
57979+
57980+ Items, including those that provide the abstraction of flows, have
57981+ the property that if you move them in part or in whole to another
57982+ node, the balancing code invokes their is_left_mergeable()
57983+ item_operation to determine if they are mergeable with their new
57984+ neighbor in the node you have moved them to. For some items the
57985+ is_left_mergeable() function always returns zero.
57986+
57987+ When moving the bodies of items from one node to another:
57988+
57989+ if a partial item is shifted to another node the balancing code invokes
57990+ an item handler method to handle the item splitting.
57991+
57992+ if the balancing code needs to merge with an item in the node it
57993+ is shifting to, it will invoke an item handler method to handle
57994+ the item merging.
57995+
57996+ if it needs to move whole item bodies unchanged, the balancing code uses
57997+ xmemcpy(), adjusting the item headers after the move via the node handler.
57998+*/
57999+
58000+#include "../../forward.h"
58001+#include "../../debug.h"
58002+#include "../../key.h"
58003+#include "../../coord.h"
58004+#include "../plugin_header.h"
58005+#include "../item/item.h"
58006+#include "node.h"
58007+#include "../plugin.h"
58008+#include "../../znode.h"
58009+#include "../../tree.h"
58010+#include "../../super.h"
58011+#include "../../reiser4.h"
58012+
58013+/**
58014+ * leftmost_key_in_node - get the smallest key in node
58015+ * @node:
58016+ * @key: store result here
58017+ *
58018+ * Stores the leftmost key of @node in @key.
58019+ */
58020+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
58021+{
58022+ assert("nikita-1634", node != NULL);
58023+ assert("nikita-1635", key != NULL);
58024+
58025+ if (!node_is_empty(node)) {
58026+ coord_t first_item;
58027+
58028+ coord_init_first_unit(&first_item, (znode *) node);
58029+ item_key_by_coord(&first_item, key);
58030+ } else
58031+ *key = *reiser4_max_key();
58032+ return key;
58033+}
58034+
58035+node_plugin node_plugins[LAST_NODE_ID] = {
58036+ [NODE40_ID] = {
58037+ .h = {
58038+ .type_id = REISER4_NODE_PLUGIN_TYPE,
58039+ .id = NODE40_ID,
58040+ .pops = NULL,
58041+ .label = "unified",
58042+ .desc = "unified node layout",
58043+ .linkage = {NULL, NULL}
58044+ },
58045+ .item_overhead = item_overhead_node40,
58046+ .free_space = free_space_node40,
58047+ .lookup = lookup_node40,
58048+ .num_of_items = num_of_items_node40,
58049+ .item_by_coord = item_by_coord_node40,
58050+ .length_by_coord = length_by_coord_node40,
58051+ .plugin_by_coord = plugin_by_coord_node40,
58052+ .key_at = key_at_node40,
58053+ .estimate = estimate_node40,
58054+ .check = check_node40,
58055+ .parse = parse_node40,
58056+ .init = init_node40,
58057+#ifdef GUESS_EXISTS
58058+ .guess = guess_node40,
58059+#endif
58060+ .change_item_size = change_item_size_node40,
58061+ .create_item = create_item_node40,
58062+ .update_item_key = update_item_key_node40,
58063+ .cut_and_kill = kill_node40,
58064+ .cut = cut_node40,
58065+ .shift = shift_node40,
58066+ .shrink_item = shrink_item_node40,
58067+ .fast_insert = fast_insert_node40,
58068+ .fast_paste = fast_paste_node40,
58069+ .fast_cut = fast_cut_node40,
58070+ .max_item_size = max_item_size_node40,
58071+ .prepare_removal = prepare_removal_node40,
58072+ .set_item_plugin = set_item_plugin_node40
58073+ }
58074+};
58075+
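Illustrative only: callers reach node40 through this operations table rather than calling the node40_* functions directly (real callers obtain the plugin from the znode; indexing by NODE40_ID here just shows the dispatch shape):

	static int nr_items_via_plugin(const znode *node)
	{
		return node_plugins[NODE40_ID].num_of_items(node);
	}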
58076+/*
58077+ Local variables:
58078+ c-indentation-style: "K&R"
58079+ mode-name: "LC"
58080+ c-basic-offset: 8
58081+ tab-width: 8
58082+ fill-column: 120
58083+ scroll-step: 1
58084+ End:
58085+*/
58086diff -urN linux-2.6.22.orig/fs/reiser4/plugin/node/node.h linux-2.6.22/fs/reiser4/plugin/node/node.h
58087--- linux-2.6.22.orig/fs/reiser4/plugin/node/node.h 1970-01-01 03:00:00.000000000 +0300
58088+++ linux-2.6.22/fs/reiser4/plugin/node/node.h 2007-07-29 00:25:34.988725466 +0400
58089@@ -0,0 +1,272 @@
58090+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58091+
58092+/* We need a definition of the default node layout here. */
58093+
58094+/* Generally speaking, it is best to have free space in the middle of the
58095+ node so that two sets of things can grow towards it, and to have the
58096+ item bodies on the left so that the last one of them grows into free
58097+ space. We optimize for the case where we append new items to the end
58098+ of the node, or grow the last item, because it hurts nothing to so
58099+ optimize and it is a common special case to do massive insertions in
58100+ increasing key order (and one of the cases more likely to have a real
58101+ user notice the delay time).
58102+
58103+ formatted leaf default layout: (leaf1)
58104+
58105+ |node header:item bodies:free space:key + pluginid + item offset|
58106+
58107+ We grow towards the middle, optimizing layout for the case where we
58108+ append new items to the end of the node. The node header is fixed
58109+ length. Keys, and item offsets plus pluginids for the items
58110+ corresponding to them are in increasing key order, and are fixed
58111+ length. Item offsets are relative to start of node (16 bits creating
58112+ a node size limit of 64k, 12 bits might be a better choice....). Item
58113+ bodies are in decreasing key order. Item bodies have a variable size.
58114+ There is a one to one to one mapping of keys to item offsets to item
58115+ bodies. Item offsets consist of pointers to the zeroth byte of the
58116+ item body. Item length equals the start of the next item minus the
58117+ start of this item, except the zeroth item whose length equals the end
58118+ of the node minus the start of that item (plus a byte). In other
58119+ words, the item length is not recorded anywhere, and it does not need
58120+ to be since it is computable.
58121+
58122+ Leaf variable length items and keys layout : (lvar)
58123+
58124+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
58125+
58126+ We grow towards the middle, optimizing layout for the case where we
58127+ append new items to the end of the node. The node header is fixed
58128+ length. Keys and item offsets for the items corresponding to them are
58129+ in increasing key order, and keys are variable length. Item offsets
58130+ are relative to start of node (16 bits). Item bodies are in
58131+ decreasing key order. Item bodies have a variable size. There is a
58132+ one to one to one mapping of keys to item offsets to item bodies.
58133+ Item offsets consist of pointers to the zeroth byte of the item body.
58134+ Item length equals the start of the next item's key minus the start of
58135+ this item, except the zeroth item whose length equals the end of the
58136+ node minus the start of that item (plus a byte).
58137+
58138+ leaf compressed keys layout: (lcomp)
58139+
58140+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
58141+
58142+ We grow towards the middle, optimizing layout for the case where we
58143+ append new items to the end of the node. The node header is fixed
58144+ length. Keys and item offsets for the items corresponding to them are
58145+ in increasing key order, and keys are variable length. The "key
58146+ inherit" field indicates how much of the key prefix is identical to
58147+ the previous key (stem compression as described in "Managing
58148+ Gigabytes" is used). key_inherit is a one byte integer. The
58149+ intra-node searches performed through this layout are linear searches,
58150+ and this is theorized to not hurt performance much due to the high
58151+ cost of processor stalls on modern CPUs, and the small number of keys
58152+ in a single node. Item offsets are relative to start of node (16
58153+ bits). Item bodies are in decreasing key order. Item bodies have a
58154+ variable size. There is a one to one to one mapping of keys to item
58155+ offsets to item bodies. Item offsets consist of pointers to the
58156+ zeroth byte of the item body. Item length equals the start of the
58157+ next item minus the start of this item, except the zeroth item whose
58158+ length equals the end of the node minus the start of that item (plus a
58159+ byte). In other words, item length and key length are not recorded
58160+ anywhere, and it does not need to be since it is computable.
58161+
58162+ internal node default layout: (idef1)
58163+
58164+ just like ldef1 except that item bodies are either blocknrs of
58165+ children or extents, and moving them may require updating parent
58166+ pointers in the nodes that they point to.
58167+*/
58168+
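A minimal sketch of the "computable item length" rule described above, under the simplifying assumption that body start offsets increase with item index and the last body ends at the end of the node:

	static int item_len(const int *body_start, int nr_items,
			    int node_end, int i)
	{
		/* no length is stored anywhere: it is the gap to the next body */
		int next = (i == nr_items - 1) ? node_end : body_start[i + 1];
		return next - body_start[i];
	}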
58169+/* There is an inherent 3-way tradeoff between optimizing and
58170+ exchanging disks between different architectures and code
58171+ complexity. This is optimal and simple and inexchangeable.
58172+ Someone else can do the code for exchanging disks and make it
58173+ complex. It would not be that hard. Using other than the PAGE_SIZE
58174+ might be suboptimal.
58175+*/
58176+
58177+#if !defined( __REISER4_NODE_H__ )
58178+#define __REISER4_NODE_H__
58179+
58180+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
58181+
58182+#include "../../dformat.h"
58183+#include "../plugin_header.h"
58184+
58185+#include <linux/types.h>
58186+
58187+typedef enum {
58188+ NS_FOUND = 0,
58189+ NS_NOT_FOUND = -ENOENT
58190+} node_search_result;
58191+
58192+/* Maximal possible space overhead for creation of new item in a node */
58193+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
58194+
58195+typedef enum {
58196+ REISER4_NODE_DKEYS = (1 << 0),
58197+ REISER4_NODE_TREE_STABLE = (1 << 1)
58198+} reiser4_node_check_flag;
58199+
58200+/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
58201+struct cut_list {
58202+ coord_t *from;
58203+ coord_t *to;
58204+ const reiser4_key *from_key;
58205+ const reiser4_key *to_key;
58206+ reiser4_key *smallest_removed;
58207+ carry_plugin_info *info;
58208+ __u32 flags;
58209+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
58210+ lock_handle *left;
58211+ lock_handle *right;
58212+};
58213+
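Illustrative only (all variable names invented): a caller fills the structure once instead of threading ten arguments through every helper:

	struct cut_list cl = {
		.from             = &from_coord,
		.to               = &to_coord,
		.from_key         = &min_key,
		.to_key           = &max_key,
		.smallest_removed = NULL,	/* caller not interested */
		.info             = info,
		.flags            = 0,
		.inode            = NULL,	/* no eflushed-jnode list */
		.left             = NULL,
		.right            = NULL,
	};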
58214+struct carry_cut_data;
58215+struct carry_kill_data;
58216+
58217+/* The responsibility of the node plugin is to store and give access
58218+ to the sequence of items within the node. */
58219+typedef struct node_plugin {
58220+ /* generic plugin fields */
58221+ plugin_header h;
58222+
58223+ /* calculates the amount of space that will be required to store an
58224+ item which is in addition to the space consumed by the item body.
58225+ (the space consumed by the item body can be gotten by calling
58226+ item->estimate) */
58227+ size_t(*item_overhead) (const znode * node, flow_t * f);
58228+
58229+ /* returns free space by looking into node (i.e., without using
58230+ znode->free_space). */
58231+ size_t(*free_space) (znode * node);
58232+ /* search within the node for the one item which might
58233+ contain the key, invoking item->search_within to search within
58234+ that item to see if it is in there */
58235+ node_search_result(*lookup) (znode * node, const reiser4_key * key,
58236+ lookup_bias bias, coord_t * coord);
58237+ /* number of items in node */
58238+ int (*num_of_items) (const znode * node);
58239+
58240+ /* store information about item in @coord in @data */
58241+ /* break into several node ops, don't add any more uses of this before doing so */
58242+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
58243+ char *(*item_by_coord) (const coord_t * coord);
58244+ int (*length_by_coord) (const coord_t * coord);
58245+ item_plugin *(*plugin_by_coord) (const coord_t * coord);
58246+
58247+ /* store item key in @key */
58248+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
58249+ /* conservatively estimate whether unit of what size can fit
58250+ into node. This estimation should be performed without
58251+ actually looking into the node's content (free space is saved in
58252+ znode). */
58253+ size_t(*estimate) (znode * node);
58254+
58255+ /* performs every consistency check the node plugin author could
58256+ imagine. Optional. */
58257+ int (*check) (const znode * node, __u32 flags, const char **error);
58258+
58259+ /* Called when node is read into memory and node plugin is
58260+ already detected. This should read some data into znode (like free
58261+ space counter) and, optionally, check data consistency.
58262+	 */
58263+ int (*parse) (znode * node);
58264+ /* This method is called on a new node to initialise plugin specific
58265+ data (header, etc.) */
58266+ int (*init) (znode * node);
58267+ /* Check whether @node content conforms to this plugin format.
58268+ Probably only useful after support for old V3.x formats is added.
58269+ Uncomment after 4.0 only.
58270+ */
58271+ /* int ( *guess )( const znode *node ); */
58272+#if REISER4_DEBUG
58273+ void (*print) (const char *prefix, const znode * node, __u32 flags);
58274+#endif
58275+	/* change size of @item by @by bytes. @item->node has enough free
58276+	   space. When @by > 0, free space is appended to the end of the item.
58277+	   When @by < 0, the item is truncated - it is assumed that the last
58278+	   @by bytes of the item are freed already */
58279+ void (*change_item_size) (coord_t * item, int by);
58280+
58281+ /* create new item @length bytes long in coord @target */
58282+ int (*create_item) (coord_t * target, const reiser4_key * key,
58283+ reiser4_item_data * data, carry_plugin_info * info);
58284+
58285+ /* update key of item. */
58286+ void (*update_item_key) (coord_t * target, const reiser4_key * key,
58287+ carry_plugin_info * info);
58288+
58289+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
58290+ int (*cut) (struct carry_cut_data *, carry_plugin_info *);
58291+
58292+ /*
58293+ * shrink item pointed to by @coord by @delta bytes.
58294+ */
58295+ int (*shrink_item) (coord_t * coord, int delta);
58296+
58297+ /* copy as much as possible but not more than up to @stop from
58298+ @stop->node to @target. If (pend == append) then data from beginning of
58299+ @stop->node are copied to the end of @target. If (pend == prepend) then
58300+ data from the end of @stop->node are copied to the beginning of
58301+	   @target. Copied data are removed from @stop->node. Information
58302+	   about what to do on the upper level is stored in @info */
58303+ int (*shift) (coord_t * stop, znode * target, shift_direction pend,
58304+ int delete_node, int including_insert_coord,
58305+ carry_plugin_info * info);
58306+	/* return true if this node allows skipping carry() in some situations
58307+	   (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
58308+	   emulation doesn't.
58309+
58310+	   This will speed up insertions that don't require updates to the
58311+	   parent, by bypassing initialisation of carry() structures. It's
58312+	   believed that the majority of insertions will fit there.
58313+
58314+ */
58315+ int (*fast_insert) (const coord_t * coord);
58316+ int (*fast_paste) (const coord_t * coord);
58317+ int (*fast_cut) (const coord_t * coord);
58318+	/* this limits the max size of an item which can be inserted into a node
58319+	   and the number of bytes an item in a node may be appended with */
58320+ int (*max_item_size) (void);
58321+ int (*prepare_removal) (znode * empty, carry_plugin_info * info);
58322+	/* change plugin id of items which are already in a node. Currently it
58323+	 * is used in tail conversion for regular files */
58324+ int (*set_item_plugin) (coord_t * coord, item_id);
58325+} node_plugin;
58326+
58327+typedef enum {
58328+ /* standard unified node layout used for both leaf and internal
58329+ nodes */
58330+ NODE40_ID,
58331+ LAST_NODE_ID
58332+} reiser4_node_id;
58333+
58334+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
58335+#if REISER4_DEBUG
58336+extern void print_node_content(const char *prefix, const znode * node,
58337+ __u32 flags);
58338+#endif
58339+
58340+extern void indent_znode(const znode * node);
58341+
58342+typedef struct common_node_header {
58343+ /*
58344+ * identifier of node plugin. Must be located at the very beginning of
58345+ * a node.
58346+ */
58347+ __le16 plugin_id;
58348+} common_node_header;
58349+
58350+/* __REISER4_NODE_H__ */
58351+#endif
58352+/*
58353+ * Local variables:
58354+ * c-indentation-style: "K&R"
58355+ * mode-name: "LC"
58356+ * c-basic-offset: 8
58357+ * tab-width: 8
58358+ * fill-column: 79
58359+ * scroll-step: 1
58360+ * End:
58361+ */
58362diff -urN linux-2.6.22.orig/fs/reiser4/plugin/object.c linux-2.6.22/fs/reiser4/plugin/object.c
58363--- linux-2.6.22.orig/fs/reiser4/plugin/object.c 1970-01-01 03:00:00.000000000 +0300
58364+++ linux-2.6.22/fs/reiser4/plugin/object.c 2007-07-29 00:25:34.992726502 +0400
58365@@ -0,0 +1,516 @@
58366+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58367+ * reiser4/README */
58368+
58369+/*
58370+ * Examples of object plugins: file, directory, symlink, special file.
58371+ *
58372+ * Plugins associated with inode:
58373+ *
58374+ * Plugin of inode is plugin referenced by plugin-id field of on-disk
58375+ * stat-data. How we store this plugin in the in-core inode is not
58376+ * important. Currently pointers are used; another variant is to store offsets
58377+ * and do an array lookup on each access.
58378+ *
58379+ * Now, each inode has one selected plugin: object plugin that
58380+ * determines what type of file this object is: directory, regular etc.
58381+ *
58382+ * This main plugin can use other plugins that are thus subordinated to
58383+ * it. Directory instance of object plugin uses hash; regular file
58384+ * instance uses tail policy plugin.
58385+ *
58386+ * Object plugin is either taken from id in stat-data or guessed from
58387+ * i_mode bits. Once it is established we ask it to install its
58388+ * subordinate plugins, by looking again in stat-data or inheriting them
58389+ * from parent.
58390+ *
58391+ * How new inode is initialized during ->read_inode():
58392+ * 1 read stat-data and initialize inode fields: i_size, i_mode,
58393+ * i_generation, capabilities etc.
58394+ * 2 read plugin id from stat data or try to guess plugin id
58395+ * from inode->i_mode bits if plugin id is missing.
58396+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
58397+ *
58398+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
58399+ * if stat data does contain i_size, etc., due to it being an unusual plugin?
58400+ *
58401+ * 4 Call ->activate() method of object's plugin. Plugin is either read
58402+ * from stat-data or guessed from mode bits
58403+ * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
58404+ * plugins from parent.
58405+ *
58406+ * Easy induction proves that on last step all plugins of inode would be
58407+ * initialized.
58408+ *
58409+ * When creating new object:
58410+ * 1 obtain object plugin id (see next period)
58411+ * NIKITA-FIXME-HANS: period?
58412+ * 2 ->install() this plugin
58413+ * 3 ->inherit() the rest from the parent
58414+ *
58415+ * We need some examples of creating an object with default and non-default
58416+ * plugin ids. Nikita, please create them.
58417+ */
58418+
58419+#include "../inode.h"
58420+
58421+static int _bugop(void)
58422+{
58423+ BUG_ON(1);
58424+ return 0;
58425+}
58426+
58427+#define bugop ((void *)_bugop)
58428+
58429+static int _dummyop(void)
58430+{
58431+ return 0;
58432+}
58433+
58434+#define dummyop ((void *)_dummyop)
58435+
58436+static int change_file(struct inode *inode,
58437+ reiser4_plugin * plugin,
58438+ pset_member memb)
58439+{
58440+ /* cannot change object plugin of already existing object */
58441+ if (memb == PSET_FILE)
58442+ return RETERR(-EINVAL);
58443+
58444+ /* Change PSET_CREATE */
58445+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
58446+}
58447+
58448+static reiser4_plugin_ops file_plugin_ops = {
58449+ .change = change_file
58450+};
58451+
58452+/*
58453+ * Definitions of object plugins.
58454+ */
58455+
58456+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
58457+ [UNIX_FILE_PLUGIN_ID] = {
58458+ .h = {
58459+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58460+ .id = UNIX_FILE_PLUGIN_ID,
58461+		.groups = (1 << REISER4_REGULAR_FILE),
58462+ .pops = &file_plugin_ops,
58463+ .label = "reg",
58464+ .desc = "regular file",
58465+ .linkage = {NULL, NULL},
58466+ },
58467+ .inode_ops = {
58468+			.permission = reiser4_permission_common,
58469+			.setattr = setattr_unix_file,
58470+			.getattr = reiser4_getattr_common
58471+ },
58472+ .file_ops = {
58473+ .llseek = generic_file_llseek,
58474+ .read = read_unix_file,
58475+ .write = write_unix_file,
58476+			.aio_read = generic_file_aio_read,
58477+ .ioctl = ioctl_unix_file,
58478+ .mmap = mmap_unix_file,
58479+ .open = open_unix_file,
58480+ .release = release_unix_file,
58481+ .fsync = sync_unix_file,
58482+ .sendfile = sendfile_unix_file
58483+ },
58484+ .as_ops = {
58485+ .writepage = reiser4_writepage,
58486+ .readpage = readpage_unix_file,
58487+ .sync_page = block_sync_page,
58488+ .writepages = writepages_unix_file,
58489+ .set_page_dirty = reiser4_set_page_dirty,
58490+			.readpages = readpages_unix_file,
58491+ .prepare_write = prepare_write_unix_file,
58492+ .commit_write = commit_write_unix_file,
58493+ .bmap = bmap_unix_file,
58494+ .invalidatepage = reiser4_invalidatepage,
58495+ .releasepage = reiser4_releasepage
58496+ },
58497+ .write_sd_by_inode = write_sd_by_inode_common,
58498+ .flow_by_inode = flow_by_inode_unix_file,
58499+ .key_by_inode = key_by_inode_and_offset_common,
58500+ .set_plug_in_inode = set_plug_in_inode_common,
58501+ .adjust_to_parent = adjust_to_parent_common,
58502+		.create_object = reiser4_create_object_common,
58503+		.delete_object = delete_object_unix_file,
58504+ .add_link = reiser4_add_link_common,
58505+ .rem_link = reiser4_rem_link_common,
58506+ .owns_item = owns_item_unix_file,
58507+ .can_add_link = can_add_link_common,
58508+ .detach = dummyop,
58509+ .bind = dummyop,
58510+ .safelink = safelink_common,
58511+ .estimate = {
58512+ .create = estimate_create_common,
58513+ .update = estimate_update_common,
58514+ .unlink = estimate_unlink_common
58515+ },
58516+ .init_inode_data = init_inode_data_unix_file,
58517+ .cut_tree_worker = cut_tree_worker_common,
58518+ .wire = {
58519+ .write = wire_write_common,
58520+ .read = wire_read_common,
58521+ .get = wire_get_common,
58522+ .size = wire_size_common,
58523+ .done = wire_done_common
58524+ }
58525+ },
58526+ [DIRECTORY_FILE_PLUGIN_ID] = {
58527+ .h = {
58528+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58529+ .id = DIRECTORY_FILE_PLUGIN_ID,
58530+		.groups = (1 << REISER4_DIRECTORY_FILE),
58531+ .pops = &file_plugin_ops,
58532+ .label = "dir",
58533+ .desc = "directory",
58534+ .linkage = {NULL, NULL}
58535+ },
58536+ .inode_ops = {.create = NULL},
58537+ .file_ops = {.owner = NULL},
58538+ .as_ops = {.writepage = NULL},
58539+
58540+ .write_sd_by_inode = write_sd_by_inode_common,
58541+ .flow_by_inode = bugop,
58542+ .key_by_inode = bugop,
58543+ .set_plug_in_inode = set_plug_in_inode_common,
58544+ .adjust_to_parent = adjust_to_parent_common_dir,
58545+ .create_object = reiser4_create_object_common,
58546+ .delete_object = reiser4_delete_dir_common,
58547+ .add_link = reiser4_add_link_common,
58548+ .rem_link = rem_link_common_dir,
58549+ .owns_item = owns_item_common_dir,
58550+ .can_add_link = can_add_link_common,
58551+ .can_rem_link = can_rem_link_common_dir,
58552+ .detach = reiser4_detach_common_dir,
58553+ .bind = reiser4_bind_common_dir,
58554+ .safelink = safelink_common,
58555+ .estimate = {
58556+ .create = estimate_create_common_dir,
58557+ .update = estimate_update_common,
58558+ .unlink = estimate_unlink_common_dir
58559+ },
58560+ .wire = {
58561+ .write = wire_write_common,
58562+ .read = wire_read_common,
58563+ .get = wire_get_common,
58564+ .size = wire_size_common,
58565+ .done = wire_done_common
58566+ },
58567+ .init_inode_data = init_inode_ordering,
58568+ .cut_tree_worker = cut_tree_worker_common,
58569+ },
58570+ [SYMLINK_FILE_PLUGIN_ID] = {
58571+ .h = {
58572+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58573+ .id = SYMLINK_FILE_PLUGIN_ID,
58574+		.groups = (1 << REISER4_SYMLINK_FILE),
58575+ .pops = &file_plugin_ops,
58576+ .label = "symlink",
58577+ .desc = "symbolic link",
58578+ .linkage = {NULL,NULL}
58579+ },
58580+ .inode_ops = {
58581+ .readlink = generic_readlink,
58582+ .follow_link = reiser4_follow_link_common,
58583+ .permission = reiser4_permission_common,
58584+ .setattr = reiser4_setattr_common,
58585+ .getattr = reiser4_getattr_common
58586+ },
58587+ /* inode->i_fop of symlink is initialized by NULL in setup_inode_ops */
58588+ .file_ops = {.owner = NULL},
58589+ .as_ops = {.writepage = NULL},
58590+
58591+ .write_sd_by_inode = write_sd_by_inode_common,
58592+ .set_plug_in_inode = set_plug_in_inode_common,
58593+ .adjust_to_parent = adjust_to_parent_common,
58594+ .create_object = reiser4_create_symlink,
58595+ .delete_object = reiser4_delete_object_common,
58596+ .add_link = reiser4_add_link_common,
58597+ .rem_link = reiser4_rem_link_common,
58598+ .can_add_link = can_add_link_common,
58599+ .detach = dummyop,
58600+ .bind = dummyop,
58601+ .safelink = safelink_common,
58602+ .estimate = {
58603+ .create = estimate_create_common,
58604+ .update = estimate_update_common,
58605+ .unlink = estimate_unlink_common
58606+ },
58607+ .init_inode_data = init_inode_ordering,
58608+ .cut_tree_worker = cut_tree_worker_common,
58609+ .destroy_inode = destroy_inode_symlink,
58610+ .wire = {
58611+ .write = wire_write_common,
58612+ .read = wire_read_common,
58613+ .get = wire_get_common,
58614+ .size = wire_size_common,
58615+ .done = wire_done_common
58616+ }
58617+ },
58618+ [SPECIAL_FILE_PLUGIN_ID] = {
58619+ .h = {
58620+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58621+ .id = SPECIAL_FILE_PLUGIN_ID,
58622+		.groups = (1 << REISER4_SPECIAL_FILE),
58623+ .pops = &file_plugin_ops,
58624+ .label = "special",
58625+ .desc =
58626+ "special: fifo, device or socket",
58627+ .linkage = {NULL, NULL}
58628+ },
58629+ .inode_ops = {
58630+ .permission = reiser4_permission_common,
58631+ .setattr = reiser4_setattr_common,
58632+ .getattr = reiser4_getattr_common
58633+ },
58634+ /* file_ops of special files (sockets, block, char, fifo) are
58635+ initialized by init_special_inode. */
58636+ .file_ops = {.owner = NULL},
58637+ .as_ops = {.writepage = NULL},
58638+
58639+ .write_sd_by_inode = write_sd_by_inode_common,
58640+ .set_plug_in_inode = set_plug_in_inode_common,
58641+ .adjust_to_parent = adjust_to_parent_common,
58642+ .create_object = reiser4_create_object_common,
58643+ .delete_object = reiser4_delete_object_common,
58644+ .add_link = reiser4_add_link_common,
58645+ .rem_link = reiser4_rem_link_common,
58646+ .owns_item = owns_item_common,
58647+ .can_add_link = can_add_link_common,
58648+ .detach = dummyop,
58649+ .bind = dummyop,
58650+ .safelink = safelink_common,
58651+ .estimate = {
58652+ .create = estimate_create_common,
58653+ .update = estimate_update_common,
58654+ .unlink = estimate_unlink_common
58655+ },
58656+ .init_inode_data = init_inode_ordering,
58657+ .cut_tree_worker = cut_tree_worker_common,
58658+ .wire = {
58659+ .write = wire_write_common,
58660+ .read = wire_read_common,
58661+ .get = wire_get_common,
58662+ .size = wire_size_common,
58663+ .done = wire_done_common
58664+ }
58665+ },
58666+	[CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
58667+ .h = {
58668+ .type_id = REISER4_FILE_PLUGIN_TYPE,
58669+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
58670+ .groups = (1 << REISER4_REGULAR_FILE),
58671+ .pops = &file_plugin_ops,
58672+ .label = "cryptcompress",
58673+ .desc = "cryptcompress file",
58674+ .linkage = {NULL, NULL}
58675+ },
58676+ .inode_ops = {
58677+ .permission = reiser4_permission_common,
58678+ .setattr = prot_setattr_cryptcompress,
58679+ .getattr = reiser4_getattr_common
58680+ },
58681+ .file_ops = {
58682+ .llseek = generic_file_llseek,
58683+ .read = prot_read_cryptcompress,
58684+ .write = prot_write_cryptcompress,
58685+ .aio_read = generic_file_aio_read,
58686+ .mmap = prot_mmap_cryptcompress,
58687+ .release = prot_release_cryptcompress,
58688+ .fsync = reiser4_sync_common,
58689+ .sendfile = prot_sendfile_cryptcompress
58690+ },
58691+ .as_ops = {
58692+ .writepage = reiser4_writepage,
58693+ .readpage = readpage_cryptcompress,
58694+ .sync_page = block_sync_page,
58695+ .writepages = writepages_cryptcompress,
58696+ .set_page_dirty = reiser4_set_page_dirty,
58697+			.readpages = readpages_cryptcompress,
58698+ .prepare_write = prepare_write_common,
58699+ .invalidatepage = reiser4_invalidatepage,
58700+ .releasepage = reiser4_releasepage
58701+ },
58702+ .write_sd_by_inode = write_sd_by_inode_common,
58703+ .flow_by_inode = flow_by_inode_cryptcompress,
58704+ .key_by_inode = key_by_inode_cryptcompress,
58705+ .set_plug_in_inode = set_plug_in_inode_common,
58706+ .adjust_to_parent = adjust_to_parent_cryptcompress,
58707+ .create_object = create_cryptcompress,
58708+ .open_object = open_object_cryptcompress,
58709+ .delete_object = delete_object_cryptcompress,
58710+ .add_link = reiser4_add_link_common,
58711+ .rem_link = reiser4_rem_link_common,
58712+ .owns_item = owns_item_common,
58713+ .can_add_link = can_add_link_common,
58714+ .detach = dummyop,
58715+ .bind = dummyop,
58716+ .safelink = safelink_common,
58717+ .estimate = {
58718+ .create = estimate_create_common,
58719+ .update = estimate_update_common,
58720+ .unlink = estimate_unlink_common
58721+ },
58722+ .init_inode_data = init_inode_data_cryptcompress,
58723+ .cut_tree_worker = cut_tree_worker_cryptcompress,
58724+ .destroy_inode = destroy_inode_cryptcompress,
58725+ .wire = {
58726+ .write = wire_write_common,
58727+ .read = wire_read_common,
58728+ .get = wire_get_common,
58729+ .size = wire_size_common,
58730+ .done = wire_done_common
58731+ }
58732+ }
58733+};
58734+
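As the opening comment of this file says, the object plugin may be guessed from i_mode bits when stat-data carries no id; a hedged sketch of that fallback (helper name invented, cases simplified):

	static file_plugin *guess_file_plugin(umode_t mode)
	{
		if (S_ISREG(mode))
			return &file_plugins[UNIX_FILE_PLUGIN_ID];
		if (S_ISDIR(mode))
			return &file_plugins[DIRECTORY_FILE_PLUGIN_ID];
		if (S_ISLNK(mode))
			return &file_plugins[SYMLINK_FILE_PLUGIN_ID];
		/* sockets, fifos, and device nodes */
		return &file_plugins[SPECIAL_FILE_PLUGIN_ID];
	}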
58735+static int change_dir(struct inode *inode,
58736+ reiser4_plugin * plugin,
58737+ pset_member memb)
58738+{
58739+ /* cannot change dir plugin of already existing object */
58740+ return RETERR(-EINVAL);
58741+}
58742+
58743+static reiser4_plugin_ops dir_plugin_ops = {
58744+ .change = change_dir
58745+};
58746+
58747+/*
58748+ * definition of directory plugins
58749+ */
58750+
58751+dir_plugin dir_plugins[LAST_DIR_ID] = {
58752+ /* standard hashed directory plugin */
58753+ [HASHED_DIR_PLUGIN_ID] = {
58754+ .h = {
58755+ .type_id = REISER4_DIR_PLUGIN_TYPE,
58756+ .id = HASHED_DIR_PLUGIN_ID,
58757+ .pops = &dir_plugin_ops,
58758+ .label = "dir",
58759+ .desc = "hashed directory",
58760+ .linkage = {NULL, NULL}
58761+ },
58762+ .inode_ops = {
58763+ .create = reiser4_create_common,
58764+ .lookup = reiser4_lookup_common,
58765+ .link = reiser4_link_common,
58766+ .unlink = reiser4_unlink_common,
58767+ .symlink = reiser4_symlink_common,
58768+ .mkdir = reiser4_mkdir_common,
58769+ .rmdir = reiser4_unlink_common,
58770+ .mknod = reiser4_mknod_common,
58771+ .rename = reiser4_rename_common,
58772+ .permission = reiser4_permission_common,
58773+ .setattr = reiser4_setattr_common,
58774+ .getattr = reiser4_getattr_common
58775+ },
58776+ .file_ops = {
58777+			.llseek = reiser4_llseek_dir_common,
58778+			.read = generic_read_dir,
58779+ .readdir = reiser4_readdir_common,
58780+ .release = reiser4_release_dir_common,
58781+ .fsync = reiser4_sync_common
58782+ },
58783+ .as_ops = {
58784+ .writepage = bugop,
58785+ .sync_page = bugop,
58786+ .writepages = dummyop,
58787+ .set_page_dirty = bugop,
58788+ .readpages = bugop,
58789+ .prepare_write = bugop,
58790+ .commit_write = bugop,
58791+ .bmap = bugop,
58792+ .invalidatepage = bugop,
58793+ .releasepage = bugop
58794+ },
58795+ .get_parent = get_parent_common,
58796+ .is_name_acceptable = is_name_acceptable_common,
58797+ .build_entry_key = build_entry_key_hashed,
58798+ .build_readdir_key = build_readdir_key_common,
58799+ .add_entry = reiser4_add_entry_common,
58800+ .rem_entry = reiser4_rem_entry_common,
58801+ .init = reiser4_dir_init_common,
58802+ .done = reiser4_dir_done_common,
58803+ .attach = reiser4_attach_common,
58804+ .detach = reiser4_detach_common,
58805+ .estimate = {
58806+ .add_entry = estimate_add_entry_common,
58807+ .rem_entry = estimate_rem_entry_common,
58808+ .unlink = dir_estimate_unlink_common
58809+ }
58810+ },
58811+ /* hashed directory for which seekdir/telldir are guaranteed to
58812+ * work. Brain-damage. */
58813+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
58814+ .h = {
58815+ .type_id = REISER4_DIR_PLUGIN_TYPE,
58816+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
58817+ .pops = &dir_plugin_ops,
58818+ .label = "dir32",
58819+ .desc = "directory hashed with 31 bit hash",
58820+ .linkage = {NULL, NULL}
58821+ },
58822+ .inode_ops = {
58823+ .create = reiser4_create_common,
58824+ .lookup = reiser4_lookup_common,
58825+ .link = reiser4_link_common,
58826+ .unlink = reiser4_unlink_common,
58827+ .symlink = reiser4_symlink_common,
58828+ .mkdir = reiser4_mkdir_common,
58829+ .rmdir = reiser4_unlink_common,
58830+ .mknod = reiser4_mknod_common,
58831+ .rename = reiser4_rename_common,
58832+ .permission = reiser4_permission_common,
58833+ .setattr = reiser4_setattr_common,
58834+ .getattr = reiser4_getattr_common
58835+ },
58836+ .file_ops = {
58837+			.llseek = reiser4_llseek_dir_common,
58838+			.read = generic_read_dir,
58839+ .readdir = reiser4_readdir_common,
58840+ .release = reiser4_release_dir_common,
58841+ .fsync = reiser4_sync_common
58842+ },
58843+ .as_ops = {
58844+ .writepage = bugop,
58845+ .sync_page = bugop,
58846+ .writepages = dummyop,
58847+ .set_page_dirty = bugop,
58848+ .readpages = bugop,
58849+ .prepare_write = bugop,
58850+ .commit_write = bugop,
58851+ .bmap = bugop,
58852+ .invalidatepage = bugop,
58853+ .releasepage = bugop
58854+ },
58855+ .get_parent = get_parent_common,
58856+ .is_name_acceptable = is_name_acceptable_common,
58857+ .build_entry_key = build_entry_key_seekable,
58858+ .build_readdir_key = build_readdir_key_common,
58859+ .add_entry = reiser4_add_entry_common,
58860+ .rem_entry = reiser4_rem_entry_common,
58861+ .init = reiser4_dir_init_common,
58862+ .done = reiser4_dir_done_common,
58863+ .attach = reiser4_attach_common,
58864+ .detach = reiser4_detach_common,
58865+ .estimate = {
58866+ .add_entry = estimate_add_entry_common,
58867+ .rem_entry = estimate_rem_entry_common,
58868+ .unlink = dir_estimate_unlink_common
58869+ }
58870+ }
58871+};
58872+
58873+/* Make Linus happy.
58874+ Local variables:
58875+ c-indentation-style: "K&R"
58876+ mode-name: "LC"
58877+ c-basic-offset: 8
58878+ tab-width: 8
58879+ fill-column: 120
58880+ End:
58881+*/
58882diff -urN linux-2.6.22.orig/fs/reiser4/plugin/object.h linux-2.6.22/fs/reiser4/plugin/object.h
58883--- linux-2.6.22.orig/fs/reiser4/plugin/object.h 1970-01-01 03:00:00.000000000 +0300
58884+++ linux-2.6.22/fs/reiser4/plugin/object.h 2007-07-29 00:25:34.992726502 +0400
58885@@ -0,0 +1,121 @@
58886+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
58887+ * reiser4/README */
58888+
58889+/* Declaration of object plugin functions. */
58890+
58891+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
58892+#define __FS_REISER4_PLUGIN_OBJECT_H__
58893+
58894+#include "../type_safe_hash.h"
58895+
58896+/* common implementations of inode operations */
58897+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
58898+ int mode, struct nameidata *);
58899+struct dentry * reiser4_lookup_common(struct inode *parent,
58900+ struct dentry *dentry,
58901+ struct nameidata *nameidata);
58902+int reiser4_link_common(struct dentry *existing, struct inode *parent,
58903+ struct dentry *newname);
58904+int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
58905+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
58906+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
 58907+ const char *linkname);
 58908+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
 58909+ int mode, dev_t rdev);
58910+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
58911+ struct inode *new_dir, struct dentry *new_name);
58912+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
58913+int reiser4_permission_common(struct inode *, int mask,
58914+ struct nameidata *nameidata);
58915+int reiser4_setattr_common(struct dentry *, struct iattr *);
58916+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
58917+ struct kstat *);
58918+
58919+/* common implementations of file operations */
58920+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
58921+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
58922+int reiser4_release_dir_common(struct inode *, struct file *);
58923+int reiser4_sync_common(struct file *, struct dentry *, int datasync);
58924+
58925+/* common implementations of address space operations */
58926+int prepare_write_common(struct file *, struct page *, unsigned from,
58927+ unsigned to);
58928+
58929+/* file plugin operations: common implementations */
58930+int write_sd_by_inode_common(struct inode *);
58931+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
58932+int set_plug_in_inode_common(struct inode *object, struct inode *parent,
58933+ reiser4_object_create_data *);
58934+int adjust_to_parent_common(struct inode *object, struct inode *parent,
58935+ struct inode *root);
58936+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
58937+ struct inode *root);
58938+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
58939+ struct inode *root);
58940+int reiser4_create_object_common(struct inode *object, struct inode *parent,
58941+ reiser4_object_create_data *);
58942+int reiser4_delete_object_common(struct inode *);
58943+int reiser4_delete_dir_common(struct inode *);
58944+int reiser4_add_link_common(struct inode *object, struct inode *parent);
58945+int reiser4_rem_link_common(struct inode *object, struct inode *parent);
58946+int rem_link_common_dir(struct inode *object, struct inode *parent);
58947+int owns_item_common(const struct inode *, const coord_t *);
58948+int owns_item_common_dir(const struct inode *, const coord_t *);
58949+int can_add_link_common(const struct inode *);
58950+int can_rem_link_common_dir(const struct inode *);
58951+int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
58952+int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
58953+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
58954+reiser4_block_nr estimate_create_common(const struct inode *);
58955+reiser4_block_nr estimate_create_common_dir(const struct inode *);
58956+reiser4_block_nr estimate_update_common(const struct inode *);
58957+reiser4_block_nr estimate_unlink_common(const struct inode *,
58958+ const struct inode *);
58959+reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
58960+ const struct inode *);
58961+char *wire_write_common(struct inode *, char *start);
58962+char *wire_read_common(char *addr, reiser4_object_on_wire *);
58963+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
58964+int wire_size_common(struct inode *);
58965+void wire_done_common(reiser4_object_on_wire *);
58966+
58967+/* dir plugin operations: common implementations */
58968+struct dentry *get_parent_common(struct inode *child);
58969+int is_name_acceptable_common(const struct inode *, const char *name, int len);
58970+void build_entry_key_common(const struct inode *,
58971+ const struct qstr *qname, reiser4_key *);
58972+int build_readdir_key_common(struct file *dir, reiser4_key *);
 58973+int reiser4_add_entry_common(struct inode *object, struct dentry *where,
 58974+ reiser4_object_create_data *, reiser4_dir_entry_desc *);
 58975+int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
 58976+ reiser4_dir_entry_desc *);
58977+int reiser4_dir_init_common(struct inode *object, struct inode *parent,
58978+ reiser4_object_create_data *);
58979+int reiser4_dir_done_common(struct inode *);
58980+int reiser4_attach_common(struct inode *child, struct inode *parent);
58981+int reiser4_detach_common(struct inode *object, struct inode *parent);
58982+reiser4_block_nr estimate_add_entry_common(const struct inode *);
58983+reiser4_block_nr estimate_rem_entry_common(const struct inode *);
58984+reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
58985+ const struct inode *);
58986+
 58987+/* these are essential parts of the common implementations; they exist
 58988+ to make customized implementations easier */
58989+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
58990+
58991+/* merely useful functions */
58992+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
58993+ const reiser4_key *, int silent);
58994+
58995+/* __FS_REISER4_PLUGIN_OBJECT_H__ */
58996+#endif
58997+
58998+/* Make Linus happy.
58999+ Local variables:
59000+ c-indentation-style: "K&R"
59001+ mode-name: "LC"
59002+ c-basic-offset: 8
59003+ tab-width: 8
59004+ fill-column: 120
59005+ End:
59006+*/
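The declarations above are the raw material from which concrete plugin instances are assembled. As a hypothetical sketch (not part of the patch), a customized directory plugin would reuse the common implementations and override only the policy that differs; the two builtin entries of dir_plugins[] earlier differ in exactly one slot, .build_entry_key:

    /* sketch only: reuse the common ops, swap one policy.
     * my_build_entry_key is an assumed custom function;
     * fields left out are zero-initialized. */
    static dir_plugin my_dir_plugin = {
            .get_parent         = get_parent_common,
            .is_name_acceptable = is_name_acceptable_common,
            .build_entry_key    = my_build_entry_key,  /* custom policy */
            .build_readdir_key  = build_readdir_key_common,
            .add_entry          = reiser4_add_entry_common,
            .rem_entry          = reiser4_rem_entry_common,
            .init               = reiser4_dir_init_common,
            .done               = reiser4_dir_done_common,
            .attach             = reiser4_attach_common,
            .detach             = reiser4_detach_common,
    };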
59007diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin.c linux-2.6.22/fs/reiser4/plugin/plugin.c
59008--- linux-2.6.22.orig/fs/reiser4/plugin/plugin.c 1970-01-01 03:00:00.000000000 +0300
59009+++ linux-2.6.22/fs/reiser4/plugin/plugin.c 2007-07-29 00:25:34.992726502 +0400
59010@@ -0,0 +1,559 @@
59011+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59012+ * reiser4/README */
59013+
59014+/* Basic plugin infrastructure, lookup etc. */
59015+
59016+/* PLUGINS:
59017+
59018+ Plugins are internal Reiser4 "modules" or "objects" used to increase
59019+ extensibility and allow external users to easily adapt reiser4 to
59020+ their needs.
59021+
59022+ Plugins are classified into several disjoint "types". Plugins
 59023+ belonging to a particular plugin type are termed "instances" of
59024+ this type. Existing types are listed by enum reiser4_plugin_type
59025+ (see plugin/plugin_header.h)
59026+
59027+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59028+
 59029+ The object (file) plugin determines how a given file-system object
 59030+ serves standard VFS requests for read, write, seek, mmap etc. Instances
 59031+ of file plugins are: regular file, directory, symlink. Another example
 59032+ of a file plugin is the audit plugin, which optionally records accesses
 59033+ to the underlying object and forwards requests to it.
59034+
59035+ Hash plugins compute hashes used by reiser4 to store and locate
59036+ files within directories. Instances of hash plugin type are: r5,
59037+ tea, rupasov.
59038+
59039+ Tail plugins (or, more precisely, tail policy plugins) determine
 59040+ when the last part of the file should be stored in a formatted item.
59041+
59042+ Scope and lookup:
59043+
 59044+ Each plugin type and plugin has a label such that the pair
 59045+ (type_label, plugin_label) is unique. This pair is a globally
 59046+ persistent and user-visible plugin identifier. Internally, the
 59047+ kernel maintains plugins and plugin types in arrays, using an index
 59048+ into those arrays as plugin and plugin type identifiers. The file
 59049+ system, in turn, maintains a persistent "dictionary" mapping each
 59050+ plugin label to the numerical identifier stored in file-system
 59051+ objects. That is, we store the offset into the plugin array for that
 59052+ plugin type as the plugin id in the stat data of the filesystem object.
59053+
59054+ Internal kernel plugin type identifier (index in plugins[] array) is
59055+ of type reiser4_plugin_type. Set of available plugin types is
59056+ currently static, but dynamic loading doesn't seem to pose
59057+ insurmountable problems.
59058+
59059+ Within each type plugins are addressed by the identifiers of type
59060+ reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
59061+ Such identifiers are only required to be unique within one type,
59062+ not globally.
59063+
59064+ Thus, plugin in memory is uniquely identified by the pair (type_id,
59065+ id).
59066+
59067+ Usage:
59068+
 59069+ There exists only one instance of each plugin, but this
 59070+ single instance can be associated with many entities (file-system
 59071+ objects, items, nodes, transactions, file descriptors etc.). An
 59072+ entity to which a plugin of a given type is attached is termed (due
 59073+ to the lack of imagination) a "subject" of this plugin type and, by
 59074+ abuse of terminology, a subject of the particular instance to which
 59075+ it is currently attached. For example, an inode is a subject of the
 59076+ object plugin type. An inode representing a directory is a subject
 59077+ of the directory plugin, of the hash plugin type, and of some
 59078+ particular instance of the hash plugin type. An inode representing a
 59079+ regular file is a subject of the "regular file" plugin, the tail-policy plugin type, etc.
59080+
 59081+ With each subject the plugin possibly stores some state. For example,
 59082+ the state of a directory plugin (an instance of the object plugin type)
 59083+ is a pointer to a hash plugin (if directories always use hashing, that is).
59084+
59085+ Interface:
59086+
59087+ In addition to a scalar identifier, each plugin type and plugin
59088+ proper has a "label": short string and a "description"---longer
59089+ descriptive string. Labels and descriptions of plugin types are
59090+ hard-coded into plugins[] array, declared and defined in
59091+ plugin.c. Label and description of plugin are stored in .label and
59092+ .desc fields of reiser4_plugin_header respectively. It's possible to
59093+ locate plugin by the pair of labels.
59094+
 59095+ Features (not implemented):
59096+
59097+ . user-level plugin manipulations:
59098+ + reiser4("filename/..file_plugin<='audit'");
59099+ + write(open("filename/..file_plugin"), "audit", 8);
59100+
59101+ . user level utilities lsplug and chplug to manipulate plugins.
 59102+ Utilities are not of primary priority. Possibly they will not be
 59103+ working in v4.0.
59104+
59105+ NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
59106+ option, do you agree? I don't think that specifying it at mount time,
59107+ and then changing it with each mount, is a good model for usage.
59108+
59109+ . mount option "plug" to set-up plugins of root-directory.
59110+ "plug=foo:bar" will set "bar" as default plugin of type "foo".
59111+
59112+ Limitations:
59113+
59114+ . each plugin type has to provide at least one builtin
 59115+ plugin. This is a technical limitation and it can be lifted in the
59116+ future.
59117+
59118+ TODO:
59119+
 59120+ New plugin types/plugins:
59121+ Things we should be able to separately choose to inherit:
59122+
59123+ security plugins
59124+
59125+ stat data
59126+
59127+ file bodies
59128+
59129+ file plugins
59130+
59131+ dir plugins
59132+
59133+ . perm:acl
59134+
 59135+ . audi---audit plugin intercepting and possibly logging all
 59136+ accesses to an object. Requires putting stub functions in
 59137+ file_operations instead of generic_file_*.
59138+
59139+NIKITA-FIXME-HANS: why make overflows a plugin?
59140+ . over---handle hash overflows
59141+
59142+ . sqnt---handle different access patterns and instruments read-ahead
59143+
59144+NIKITA-FIXME-HANS: describe the line below in more detail.
59145+
59146+ . hier---handle inheritance of plugins along file-system hierarchy
59147+
59148+ Different kinds of inheritance: on creation vs. on access.
59149+ Compatible/incompatible plugins.
59150+ Inheritance for multi-linked files.
59151+ Layered plugins.
59152+ Notion of plugin context is abandoned.
59153+
 59154+Each file is associated
 59155+ with one plugin, and dependent plugins (hash, etc.) are stored as
 59156+ main plugin state. Now, if we have plugins used for regular files
 59157+ but not for directories, how would such plugins be inherited?
59158+ . always store them with directories also
59159+
 59160+NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing
 59161+the line below, which is also useful.
59162+
59163+ . use inheritance hierarchy, independent of file-system namespace
59164+*/
59165+
59166+#include "../debug.h"
59167+#include "../dformat.h"
59168+#include "plugin_header.h"
59169+#include "item/static_stat.h"
59170+#include "node/node.h"
59171+#include "security/perm.h"
59172+#include "space/space_allocator.h"
59173+#include "disk_format/disk_format.h"
59174+#include "plugin.h"
59175+#include "../reiser4.h"
59176+#include "../jnode.h"
59177+#include "../inode.h"
59178+
59179+#include <linux/fs.h> /* for struct super_block */
59180+
59181+/*
59182+ * init_plugins - initialize plugin sub-system.
59183+ * Just call this once on reiser4 startup.
59184+ *
 59185+ * Initializes the plugin sub-system. It is part of reiser4 module
 59186+ * initialization. For each plugin of each type the init method is
 59187+ * called, and each plugin is put into its type's list of plugins.
59188+ */
59189+int init_plugins(void)
59190+{
59191+ reiser4_plugin_type type_id;
59192+
59193+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
 59194+ struct reiser4_plugin_type_data *ptype;
59195+ int i;
59196+
59197+ ptype = &plugins[type_id];
59198+ assert("nikita-3508", ptype->label != NULL);
59199+ assert("nikita-3509", ptype->type_id == type_id);
59200+
59201+ INIT_LIST_HEAD(&ptype->plugins_list);
59202+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59203+ for (i = 0; i < ptype->builtin_num; ++i) {
59204+ reiser4_plugin *plugin;
59205+
59206+ plugin = plugin_at(ptype, i);
59207+
59208+ if (plugin->h.label == NULL)
59209+ /* uninitialized slot encountered */
59210+ continue;
59211+ assert("nikita-3445", plugin->h.type_id == type_id);
59212+ plugin->h.id = i;
59213+ if (plugin->h.pops != NULL &&
59214+ plugin->h.pops->init != NULL) {
59215+ int result;
59216+
59217+ result = plugin->h.pops->init(plugin);
59218+ if (result != 0)
59219+ return result;
59220+ }
59221+ INIT_LIST_HEAD(&plugin->h.linkage);
59222+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
59223+ }
59224+ }
59225+ return 0;
59226+}
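/* Illustrative sketch (not in the original source): init_plugins() is
 * meant to run exactly once, early in module initialization, before
 * any plugin lookup is attempted. */
static int __init reiser4_plugins_boot_sketch(void)
{
        int result;

        result = init_plugins();
        if (result != 0)
                return result;  /* a builtin plugin failed its ->init() */
        /* plugin lists are now populated; lookups are safe */
        return 0;
}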
59227+
59228+/* true if plugin type id is valid */
 59229+int is_plugin_type_valid(reiser4_plugin_type type)
 59230+{
 59231+ /* "type" is unsigned, so no comparison with 0 is
 59232+ necessary */
 59233+ return (type < REISER4_PLUGIN_TYPES);
59234+}
59235+
59236+/* true if plugin id is valid */
 59237+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
 59238+{
59239+ assert("nikita-1653", is_plugin_type_valid(type));
59240+ return id < plugins[type].builtin_num;
59241+}
59242+
 59243+/* return plugin by its @type and @id.
59244+
 59245+ Both arguments are checked for validity: this is supposed to be called
59246+ from user-level.
59247+
59248+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
59249+user space, and passed to the filesystem by use of method files? Your
59250+comment really confused me on the first reading....
59251+
59252+*/
59253+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
59254+ * unchecked */,
 59255+ reiser4_plugin_id id /* plugin id,
 59256+ * unchecked */)
 59257+{
59258+ if (is_plugin_type_valid(type)) {
59259+ if (is_plugin_id_valid(type, id))
59260+ return plugin_at(&plugins[type], id);
59261+ else
59262+ /* id out of bounds */
59263+ warning("nikita-2913",
 59264+ "Invalid plugin id: [%i:%i]", type, id);
59265+ } else
59266+ /* type_id out of bounds */
 59267+ warning("nikita-2914", "Invalid type_id: %i", type);
59268+ return NULL;
59269+}
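/* Illustrative sketch (not in the original source): a typical caller
 * resolving an untrusted (type, id) pair, e.g. one read from disk.
 * plugin_by_unsafe_id() already logs a warning on failure, so the
 * caller only needs the NULL check. */
static const char *plugin_label_of(reiser4_plugin_type type,
                                   reiser4_plugin_id id)
{
        reiser4_plugin *plug = plugin_by_unsafe_id(type, id);

        return plug != NULL ? plug->h.label : "<invalid>";
}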
59270+
59271+/**
59272+ * save_plugin_id - store plugin id in disk format
59273+ * @plugin: plugin to convert
59274+ * @area: where to store result
59275+ *
59276+ * Puts id of @plugin in little endian format to address @area.
59277+ */
59278+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
59279+ d16 *area /* where to store result */ )
59280+{
59281+ assert("nikita-1261", plugin != NULL);
59282+ assert("nikita-1262", area != NULL);
59283+
59284+ put_unaligned(cpu_to_le16(plugin->h.id), area);
59285+ return 0;
59286+}
59287+
59288+/* list of all plugins of given type */
 59289+struct list_head *get_plugin_list(reiser4_plugin_type type)
 59290+{
59291+ assert("nikita-1056", is_plugin_type_valid(type));
59292+ return &plugins[type].plugins_list;
59293+}
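/* Illustrative sketch (not in the original source): walking every
 * registered plugin of one type through the list that init_plugins()
 * populated. h.linkage is the list node inside plugin_header. */
static void print_plugins_of_type(reiser4_plugin_type type)
{
        reiser4_plugin *plug;

        list_for_each_entry(plug, get_plugin_list(type), h.linkage)
                printk(KERN_INFO "reiser4 plugin: %s (%s)\n",
                       plug->h.label, plug->h.desc);
}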
59294+
 59295+static void update_pset_mask(reiser4_inode * info, pset_member memb)
59296+{
59297+ struct dentry *rootdir;
59298+ reiser4_inode *root;
59299+
59300+ assert("edward-1443", memb != PSET_FILE);
59301+
59302+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
59303+ if (rootdir != NULL) {
59304+ root = reiser4_inode_data(rootdir->d_inode);
59305+ /*
59306+ * if inode is different from the default one, or we are
59307+ * changing plugin of root directory, update plugin_mask
59308+ */
59309+ if (aset_get(info->pset, memb) !=
59310+ aset_get(root->pset, memb) ||
59311+ info == root)
59312+ info->plugin_mask |= (1 << memb);
59313+ else
59314+ info->plugin_mask &= ~(1 << memb);
59315+ }
59316+}
59317+
59318+/* Get specified plugin set member from parent,
59319+ or from fs-defaults (if no parent is given) and
59320+ install the result to pset of @self */
59321+int grab_plugin_pset(struct inode *self,
59322+ struct inode *ancestor,
59323+ pset_member memb)
 59324+{
 59325+ reiser4_plugin *plug;
59326+ reiser4_inode *info;
59327+ int result = 0;
59328+
 59329+ /* Do not grab if initialised already. */
 59330+ info = reiser4_inode_data(self);
59331+ if (aset_get(info->pset, memb) != NULL)
59332+ return 0;
59333+ if (ancestor) {
59334+ reiser4_inode *parent;
59335+
59336+ parent = reiser4_inode_data(ancestor);
59337+ plug = aset_get(parent->hset, memb) ? :
59338+ aset_get(parent->pset, memb);
59339+ }
59340+ else
59341+ plug = get_default_plugin(memb);
59342+
59343+ result = set_plugin(&info->pset, memb, plug);
59344+ if (result == 0) {
59345+ if (!ancestor || self->i_sb->s_root->d_inode != self)
59346+ update_pset_mask(info, memb);
59347+ }
59348+ return result;
59349+}
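/* Illustrative sketch (not in the original source): a hypothetical
 * caller inheriting pset members for a freshly created object. Each
 * call is a no-op if the member is already initialised, and falls
 * back to fs-defaults when no ancestor is supplied. */
static int inherit_basic_pset(struct inode *child, struct inode *parent)
{
        int result;

        result = grab_plugin_pset(child, parent, PSET_HASH);
        if (result == 0)
                result = grab_plugin_pset(child, parent, PSET_FORMATTING);
        return result;
}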
59350+
59351+/* Take missing pset members from root inode */
59352+int finish_pset(struct inode *inode)
59353+{
59354+ reiser4_plugin *plug;
59355+ reiser4_inode *root;
59356+ reiser4_inode *info;
59357+ pset_member memb;
59358+ int result = 0;
59359+
59360+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
59361+ info = reiser4_inode_data(inode);
59362+
59363+ assert("edward-1455", root != NULL);
59364+ assert("edward-1456", info != NULL);
59365+
59366+ /* file and directory plugins are already initialized. */
59367+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
59368+
59369+ /* Do not grab if initialised already. */
59370+ if (aset_get(info->pset, memb) != NULL)
59371+ continue;
59372+
59373+ plug = aset_get(root->pset, memb);
59374+ result = set_plugin(&info->pset, memb, plug);
59375+ if (result != 0)
59376+ break;
59377+ }
59378+ if (result != 0) {
59379+ warning("nikita-3447",
59380+ "Cannot set up plugins for %lli",
59381+ (unsigned long long)
59382+ get_inode_oid(inode));
59383+ }
59384+ return result;
59385+}
59386+
 59387+int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
59388+{
59389+ reiser4_inode *info;
59390+ int result = 0;
59391+
59392+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
59393+ /* Changing pset in the root object. */
59394+ return RETERR(-EINVAL);
59395+ }
59396+
59397+ info = reiser4_inode_data(self);
59398+ if (plug->h.pops != NULL && plug->h.pops->change != NULL)
 59399+ result = plug->h.pops->change(self, plug, memb);
 59400+ else
59401+ result = aset_set_unsafe(&info->pset, memb, plug);
59402+ if (result == 0) {
59403+ __u16 oldmask = info->plugin_mask;
59404+
59405+ update_pset_mask(info, memb);
59406+ if (oldmask != info->plugin_mask)
59407+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
59408+ }
59409+ return result;
59410+}
59411+
 59412+struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
59413+ /* C90 initializers */
59414+ [REISER4_FILE_PLUGIN_TYPE] = {
59415+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59416+ .label = "file",
59417+ .desc = "Object plugins",
59418+ .builtin_num = sizeof_array(file_plugins),
59419+ .builtin = file_plugins,
59420+ .plugins_list = {NULL, NULL},
59421+ .size = sizeof(file_plugin)
59422+ },
59423+ [REISER4_DIR_PLUGIN_TYPE] = {
59424+ .type_id = REISER4_DIR_PLUGIN_TYPE,
59425+ .label = "dir",
59426+ .desc = "Directory plugins",
59427+ .builtin_num = sizeof_array(dir_plugins),
59428+ .builtin = dir_plugins,
59429+ .plugins_list = {NULL, NULL},
59430+ .size = sizeof(dir_plugin)
59431+ },
59432+ [REISER4_HASH_PLUGIN_TYPE] = {
59433+ .type_id = REISER4_HASH_PLUGIN_TYPE,
59434+ .label = "hash",
59435+ .desc = "Directory hashes",
59436+ .builtin_num = sizeof_array(hash_plugins),
59437+ .builtin = hash_plugins,
59438+ .plugins_list = {NULL, NULL},
59439+ .size = sizeof(hash_plugin)
59440+ },
59441+ [REISER4_FIBRATION_PLUGIN_TYPE] = {
59442+ .type_id =
59443+ REISER4_FIBRATION_PLUGIN_TYPE,
59444+ .label = "fibration",
59445+ .desc = "Directory fibrations",
59446+ .builtin_num = sizeof_array(fibration_plugins),
59447+ .builtin = fibration_plugins,
59448+ .plugins_list = {NULL, NULL},
59449+ .size = sizeof(fibration_plugin)
59450+ },
59451+ [REISER4_CIPHER_PLUGIN_TYPE] = {
59452+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
59453+ .label = "cipher",
59454+ .desc = "Cipher plugins",
59455+ .builtin_num = sizeof_array(cipher_plugins),
59456+ .builtin = cipher_plugins,
59457+ .plugins_list = {NULL, NULL},
59458+ .size = sizeof(cipher_plugin)
59459+ },
59460+ [REISER4_DIGEST_PLUGIN_TYPE] = {
59461+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
59462+ .label = "digest",
59463+ .desc = "Digest plugins",
59464+ .builtin_num = sizeof_array(digest_plugins),
59465+ .builtin = digest_plugins,
59466+ .plugins_list = {NULL, NULL},
59467+ .size = sizeof(digest_plugin)
59468+ },
59469+ [REISER4_COMPRESSION_PLUGIN_TYPE] = {
59470+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
59471+ .label = "compression",
59472+ .desc = "Compression plugins",
59473+ .builtin_num = sizeof_array(compression_plugins),
59474+ .builtin = compression_plugins,
59475+ .plugins_list = {NULL, NULL},
59476+ .size = sizeof(compression_plugin)
59477+ },
59478+ [REISER4_FORMATTING_PLUGIN_TYPE] = {
59479+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
59480+ .label = "formatting",
59481+ .desc = "Tail inlining policies",
59482+ .builtin_num = sizeof_array(formatting_plugins),
59483+ .builtin = formatting_plugins,
59484+ .plugins_list = {NULL, NULL},
59485+ .size = sizeof(formatting_plugin)
59486+ },
59487+ [REISER4_PERM_PLUGIN_TYPE] = {
59488+ .type_id = REISER4_PERM_PLUGIN_TYPE,
59489+ .label = "perm",
59490+ .desc = "Permission checks",
59491+ .builtin_num = sizeof_array(perm_plugins),
59492+ .builtin = perm_plugins,
59493+ .plugins_list = {NULL, NULL},
59494+ .size = sizeof(perm_plugin)
59495+ },
59496+ [REISER4_ITEM_PLUGIN_TYPE] = {
59497+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
59498+ .label = "item",
59499+ .desc = "Item handlers",
59500+ .builtin_num = sizeof_array(item_plugins),
59501+ .builtin = item_plugins,
59502+ .plugins_list = {NULL, NULL},
59503+ .size = sizeof(item_plugin)
59504+ },
59505+ [REISER4_NODE_PLUGIN_TYPE] = {
59506+ .type_id = REISER4_NODE_PLUGIN_TYPE,
59507+ .label = "node",
59508+ .desc = "node layout handlers",
59509+ .builtin_num = sizeof_array(node_plugins),
59510+ .builtin = node_plugins,
59511+ .plugins_list = {NULL, NULL},
59512+ .size = sizeof(node_plugin)
59513+ },
59514+ [REISER4_SD_EXT_PLUGIN_TYPE] = {
59515+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
59516+ .label = "sd_ext",
59517+ .desc = "Parts of stat-data",
59518+ .builtin_num = sizeof_array(sd_ext_plugins),
59519+ .builtin = sd_ext_plugins,
59520+ .plugins_list = {NULL, NULL},
59521+ .size = sizeof(sd_ext_plugin)
59522+ },
59523+ [REISER4_FORMAT_PLUGIN_TYPE] = {
59524+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
59525+ .label = "disk_layout",
59526+ .desc = "defines filesystem on disk layout",
59527+ .builtin_num = sizeof_array(format_plugins),
59528+ .builtin = format_plugins,
59529+ .plugins_list = {NULL, NULL},
59530+ .size = sizeof(disk_format_plugin)
59531+ },
59532+ [REISER4_JNODE_PLUGIN_TYPE] = {
59533+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
59534+ .label = "jnode",
59535+ .desc = "defines kind of jnode",
59536+ .builtin_num = sizeof_array(jnode_plugins),
59537+ .builtin = jnode_plugins,
59538+ .plugins_list = {NULL, NULL},
59539+ .size = sizeof(jnode_plugin)
59540+ },
59541+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
59542+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59543+ .label = "compression_mode",
59544+ .desc = "Defines compression mode",
59545+ .builtin_num = sizeof_array(compression_mode_plugins),
59546+ .builtin = compression_mode_plugins,
59547+ .plugins_list = {NULL, NULL},
59548+ .size = sizeof(compression_mode_plugin)
59549+ },
59550+ [REISER4_CLUSTER_PLUGIN_TYPE] = {
59551+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
59552+ .label = "cluster",
59553+ .desc = "Defines cluster size",
59554+ .builtin_num = sizeof_array(cluster_plugins),
59555+ .builtin = cluster_plugins,
59556+ .plugins_list = {NULL, NULL},
59557+ .size = sizeof(cluster_plugin)
59558+ }
59559+};
59560+
59561+/*
59562+ * Local variables:
59563+ * c-indentation-style: "K&R"
59564+ * mode-name: "LC"
59565+ * c-basic-offset: 8
59566+ * tab-width: 8
59567+ * fill-column: 120
59568+ * End:
59569+ */
59570diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin.h linux-2.6.22/fs/reiser4/plugin/plugin.h
59571--- linux-2.6.22.orig/fs/reiser4/plugin/plugin.h 1970-01-01 03:00:00.000000000 +0300
59572+++ linux-2.6.22/fs/reiser4/plugin/plugin.h 2007-07-29 00:25:34.996727537 +0400
59573@@ -0,0 +1,899 @@
59574+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59575+
59576+/* Basic plugin data-types.
59577+ see fs/reiser4/plugin/plugin.c for details */
59578+
59579+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
59580+#define __FS_REISER4_PLUGIN_TYPES_H__
59581+
59582+#include "../forward.h"
59583+#include "../debug.h"
59584+#include "../dformat.h"
59585+#include "../key.h"
59586+#include "compress/compress.h"
59587+#include "crypto/cipher.h"
59588+#include "plugin_header.h"
59589+#include "item/static_stat.h"
59590+#include "item/internal.h"
59591+#include "item/sde.h"
59592+#include "item/cde.h"
59593+#include "item/item.h"
59594+#include "node/node.h"
59595+#include "node/node40.h"
59596+#include "security/perm.h"
59597+#include "fibration.h"
59598+
59599+#include "space/bitmap.h"
59600+#include "space/space_allocator.h"
59601+
59602+#include "disk_format/disk_format40.h"
59603+#include "disk_format/disk_format.h"
59604+
59605+#include <linux/fs.h> /* for struct super_block, address_space */
59606+#include <linux/mm.h> /* for struct page */
59607+#include <linux/buffer_head.h> /* for struct buffer_head */
59608+#include <linux/dcache.h> /* for struct dentry */
59609+#include <linux/types.h>
59610+#include <linux/crypto.h>
59611+
59612+typedef struct reiser4_object_on_wire reiser4_object_on_wire;
59613+
59614+/*
59615+ * File plugin. Defines the set of methods that file plugins implement, some
59616+ * of which are optional.
59617+ *
59618+ * A file plugin offers to the caller an interface for IO ( writing to and/or
59619+ * reading from) to what the caller sees as one sequence of bytes. An IO to it
59620+ * may affect more than one physical sequence of bytes, or no physical sequence
59621+ * of bytes, it may affect sequences of bytes offered by other file plugins to
59622+ * the semantic layer, and the file plugin may invoke other plugins and
59623+ * delegate work to them, but its interface is structured for offering the
59624+ * caller the ability to read and/or write what the caller sees as being a
59625+ * single sequence of bytes.
59626+ *
59627+ * The file plugin must present a sequence of bytes to the caller, but it does
59628+ * not necessarily have to store a sequence of bytes, it does not necessarily
59629+ * have to support efficient tree traversal to any offset in the sequence of
59630+ * bytes (tail and extent items, whose keys contain offsets, do however provide
59631+ * efficient non-sequential lookup of any offset in the sequence of bytes).
59632+ *
59633+ * Directory plugins provide methods for selecting file plugins by resolving a
59634+ * name for them.
59635+ *
59636+ * The functionality other filesystems call an attribute, and rigidly tie
59637+ * together, we decompose into orthogonal selectable features of files. Using
59638+ * the terminology we will define next, an attribute is a perhaps constrained,
59639+ * perhaps static length, file whose parent has a uni-count-intra-link to it,
59640+ * which might be grandparent-major-packed, and whose parent has a deletion
59641+ * method that deletes it.
59642+ *
59643+ * File plugins can implement constraints.
59644+ *
59645+ * Files can be of variable length (e.g. regular unix files), or of static
59646+ * length (e.g. static sized attributes).
59647+ *
 59648+ * An object may have many sequences of bytes, and many file plugins, but it
59649+ * has exactly one objectid. It is usually desirable that an object has a
59650+ * deletion method which deletes every item with that objectid. Items cannot
59651+ * in general be found by just their objectids. This means that an object must
59652+ * have either a method built into its deletion plugin method for knowing what
59653+ * items need to be deleted, or links stored with the object that provide the
59654+ * plugin with a method for finding those items. Deleting a file within an
59655+ * object may or may not have the effect of deleting the entire object,
59656+ * depending on the file plugin's deletion method.
59657+ *
59658+ * LINK TAXONOMY:
59659+ *
59660+ * Many objects have a reference count, and when the reference count reaches 0
59661+ * the object's deletion method is invoked. Some links embody a reference
59662+ * count increase ("countlinks"), and others do not ("nocountlinks").
59663+ *
59664+ * Some links are bi-directional links ("bilinks"), and some are
59665+ * uni-directional("unilinks").
59666+ *
59667+ * Some links are between parts of the same object ("intralinks"), and some are
59668+ * between different objects ("interlinks").
59669+ *
59670+ * PACKING TAXONOMY:
59671+ *
59672+ * Some items of an object are stored with a major packing locality based on
59673+ * their object's objectid (e.g. unix directory items in plan A), and these are
59674+ * called "self-major-packed".
59675+ *
59676+ * Some items of an object are stored with a major packing locality based on
59677+ * their semantic parent object's objectid (e.g. unix file bodies in plan A),
59678+ * and these are called "parent-major-packed".
59679+ *
59680+ * Some items of an object are stored with a major packing locality based on
59681+ * their semantic grandparent, and these are called "grandparent-major-packed".
59682+ * Now carefully notice that we run into trouble with key length if we have to
59683+ * store a 8 byte major+minor grandparent based packing locality, an 8 byte
59684+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
59685+ * a 24 byte key. One of these fields must be sacrificed if an item is to be
59686+ * grandparent-major-packed, and which to sacrifice is left to the item author
59687+ * choosing to make the item grandparent-major-packed. You cannot make tail
59688+ * items and extent items grandparent-major-packed, though you could make them
59689+ * self-major-packed (usually they are parent-major-packed).
59690+ *
59691+ * In the case of ACLs (which are composed of fixed length ACEs which consist
59692+ * of {subject-type, subject, and permission bitmask} triples), it makes sense
59693+ * to not have an offset field in the ACE item key, and to allow duplicate keys
59694+ * for ACEs. Thus, the set of ACES for a given file is found by looking for a
59695+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in
59696+ * a directory together), the minor packing locality of ACE, the objectid of
59697+ * the file, and 0.
59698+ *
59699+ * IO involves moving data from one location to another, which means that two
59700+ * locations must be specified, source and destination.
59701+ *
59702+ * This source and destination can be in the filesystem, or they can be a
59703+ * pointer in the user process address space plus a byte count.
59704+ *
59705+ * If both source and destination are in the filesystem, then at least one of
59706+ * them must be representable as a pure stream of bytes (which we call a flow,
59707+ * and define as a struct containing a key, a data pointer, and a length).
59708+ * This may mean converting one of them into a flow. We provide a generic
59709+ * cast_into_flow() method, which will work for any plugin supporting
59710+ * read_flow(), though it is inefficiently implemented in that it temporarily
59711+ * stores the flow in a buffer (Question: what to do with huge flows that
59712+ * cannot fit into memory? Answer: we must not convert them all at once. )
59713+ *
59714+ * Performing a write requires resolving the write request into a flow defining
59715+ * the source, and a method that performs the write, and a key that defines
59716+ * where in the tree the write is to go.
59717+ *
59718+ * Performing a read requires resolving the read request into a flow defining
59719+ * the target, and a method that performs the read, and a key that defines
59720+ * where in the tree the read is to come from.
59721+ *
59722+ * There will exist file plugins which have no pluginid stored on the disk for
59723+ * them, and which are only invoked by other plugins.
59724+ */
59725+
59726+/* This should be incremented with each new contributed
59727+ pair (plugin type, plugin id).
59728+ NOTE: Make sure there is a release of reiser4progs
59729+ with the corresponding version number */
59730+#define PLUGIN_LIBRARY_VERSION 0
59731+
59732+ /* enumeration of fields within plugin_set */
59733+typedef enum {
59734+ PSET_FILE,
59735+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
59736+ * inode.c:read_inode() depends on this. */
59737+ PSET_PERM,
59738+ PSET_FORMATTING,
59739+ PSET_HASH,
59740+ PSET_FIBRATION,
59741+ PSET_SD,
59742+ PSET_DIR_ITEM,
59743+ PSET_CIPHER,
59744+ PSET_DIGEST,
59745+ PSET_COMPRESSION,
59746+ PSET_COMPRESSION_MODE,
59747+ PSET_CLUSTER,
59748+ PSET_CREATE,
59749+ PSET_LAST
59750+} pset_member;
59751+
59752+/* builtin file-plugins */
59753+typedef enum {
59754+ /* regular file */
59755+ UNIX_FILE_PLUGIN_ID,
59756+ /* directory */
59757+ DIRECTORY_FILE_PLUGIN_ID,
59758+ /* symlink */
59759+ SYMLINK_FILE_PLUGIN_ID,
59760+ /* for objects completely handled by the VFS: fifos, devices,
59761+ sockets */
59762+ SPECIAL_FILE_PLUGIN_ID,
59763+ /* regular cryptcompress file */
 59764+ CRYPTCOMPRESS_FILE_PLUGIN_ID,
59765+ /* number of file plugins. Used as size of arrays to hold
59766+ file plugins. */
59767+ LAST_FILE_PLUGIN_ID
59768+} reiser4_file_id;
59769+
59770+typedef struct file_plugin {
59771+
59772+ /* generic fields */
59773+ plugin_header h;
59774+
59775+ struct inode_operations inode_ops;
59776+ struct file_operations file_ops;
59777+ struct address_space_operations as_ops;
59778+
59779+ /* save inode cached stat-data onto disk. It was called
59780+ reiserfs_update_sd() in 3.x */
59781+ int (*write_sd_by_inode) (struct inode *);
59782+
59783+ /*
59784+ * private methods: These are optional. If used they will allow you to
59785+ * minimize the amount of code needed to implement a deviation from
59786+ * some other method that also uses them.
59787+ */
59788+
59789+ /*
59790+ * Construct flow into @flow according to user-supplied data.
59791+ *
59792+ * This is used by read/write methods to construct a flow to
59793+ * write/read. ->flow_by_inode() is plugin method, rather than single
59794+ * global implementation, because key in a flow used by plugin may
59795+ * depend on data in a @buf.
59796+ *
59797+ * NIKITA-FIXME-HANS: please create statistics on what functions are
59798+ * dereferenced how often for the mongo benchmark. You can supervise
59799+ * Elena doing this for you if that helps. Email me the list of the
59800+ * top 10, with their counts, and an estimate of the total number of
59801+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent
59802+ * processing (non-idle processing). If the total percent is, say,
59803+ * less than 1%, it will make our coding discussions much easier, and
59804+ * keep me from questioning whether functions like the below are too
59805+ * frequently called to be dereferenced. If the total percent is more
59806+ * than 1%, perhaps private methods should be listed in a "required"
59807+ * comment at the top of each plugin (with stern language about how if
59808+ * the comment is missing it will not be accepted by the maintainer),
59809+ * and implemented using macros not dereferenced functions. How about
59810+ * replacing this whole private methods part of the struct with a
59811+ * thorough documentation of what the standard helper functions are for
59812+ * use in constructing plugins? I think users have been asking for
59813+ * that, though not in so many words.
59814+ */
59815+ int (*flow_by_inode) (struct inode *, const char __user *buf,
59816+ int user, loff_t size,
59817+ loff_t off, rw_op op, flow_t *);
59818+
59819+ /*
59820+ * Return the key used to retrieve an offset of a file. It is used by
59821+ * default implementation of ->flow_by_inode() method
59822+ * (common_build_flow()) and, among other things, to get to the extent
59823+ * from jnode of unformatted node.
59824+ */
59825+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
59826+
59827+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
59828+ /*
59829+ * set the plugin for a file. Called during file creation in creat()
59830+ * but not reiser4() unless an inode already exists for the file.
59831+ */
59832+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
59833+ reiser4_object_create_data *);
59834+
59835+ /* NIKITA-FIXME-HANS: comment and name seem to say different things,
59836+ * are you setting up the object itself also or just adjusting the
59837+ * parent?.... */
59838+ /* set up plugins for new @object created in @parent. @root is root
59839+ directory. */
59840+ int (*adjust_to_parent) (struct inode *object, struct inode *parent,
59841+ struct inode *root);
59842+ /*
59843+ * this does whatever is necessary to do when object is created. For
59844+ * instance, for unix files stat data is inserted. It is supposed to be
59845+ * called by create of struct inode_operations.
59846+ */
59847+ int (*create_object) (struct inode *object, struct inode *parent,
59848+ reiser4_object_create_data *);
59849+
59850+ /* this does whatever is necessary to do when object is opened */
59851+ int (*open_object) (struct inode * inode, struct file * file);
59852+ /*
59853+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on
59854+ * success. Deletion of an object usually includes removal of items
59855+ * building file body (for directories this is removal of "." and "..")
59856+ * and removal of stat-data item.
59857+ */
59858+ int (*delete_object) (struct inode *);
59859+
59860+ /* add link from @parent to @object */
59861+ int (*add_link) (struct inode *object, struct inode *parent);
59862+
59863+ /* remove link from @parent to @object */
59864+ int (*rem_link) (struct inode *object, struct inode *parent);
59865+
59866+ /*
59867+ * return true if item addressed by @coord belongs to @inode. This is
59868+ * used by read/write to properly slice flow into items in presence of
59869+ * multiple key assignment policies, because items of a file are not
59870+ * necessarily contiguous in a key space, for example, in a plan-b.
59871+ */
59872+ int (*owns_item) (const struct inode *, const coord_t *);
59873+
 59874+ /* checks whether yet another hard link to this object can be
 59875+ added */
59876+ int (*can_add_link) (const struct inode *);
59877+
59878+ /* checks whether hard links to this object can be removed */
59879+ int (*can_rem_link) (const struct inode *);
59880+
59881+ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
59882+ detach of directory plugin to remove ".." */
59883+ int (*detach) (struct inode * child, struct inode * parent);
59884+
59885+ /* called when @child was just looked up in the @parent. It is not
59886+ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
59887+ directory plugin */
59888+ int (*bind) (struct inode * child, struct inode * parent);
59889+
59890+ /* process safe-link during mount */
59891+ int (*safelink) (struct inode * object, reiser4_safe_link_t link,
59892+ __u64 value);
59893+
59894+ /* The couple of estimate methods for all file operations */
59895+ struct {
59896+ reiser4_block_nr(*create) (const struct inode *);
59897+ reiser4_block_nr(*update) (const struct inode *);
59898+ reiser4_block_nr(*unlink) (const struct inode *,
59899+ const struct inode *);
59900+ } estimate;
59901+
59902+ /*
59903+ * reiser4 specific part of inode has a union of structures which are
59904+ * specific to a plugin. This method is called when inode is read
59905+ * (read_inode) and when file is created (common_create_child) so that
59906+ * file plugin could initialize its inode data
59907+ */
59908+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
59909+ int);
59910+
59911+ /*
59912+ * This method performs progressive deletion of items and whole nodes
59913+ * from right to left.
59914+ *
59915+ * @tap: the point deletion process begins from,
59916+ * @from_key: the beginning of the deleted key range,
59917+ * @to_key: the end of the deleted key range,
59918+ * @smallest_removed: the smallest removed key,
59919+ *
 59920+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a
 59921+ * long cut_tree operation was interrupted to allow an atom commit.
59922+ */
59923+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
59924+ const reiser4_key * to_key,
59925+ reiser4_key * smallest_removed, struct inode *,
59926+ int, int *);
59927+
59928+ /* called from ->destroy_inode() */
59929+ void (*destroy_inode) (struct inode *);
59930+
59931+ /*
 59932+ * methods to serialize object identity. This is used, for example, by
59933+ * reiser4_{en,de}code_fh().
59934+ */
59935+ struct {
59936+ /* store object's identity at @area */
59937+ char *(*write) (struct inode * inode, char *area);
59938+ /* parse object from wire to the @obj */
59939+ char *(*read) (char *area, reiser4_object_on_wire * obj);
59940+ /* given object identity in @obj, find or create its dentry */
59941+ struct dentry *(*get) (struct super_block * s,
59942+ reiser4_object_on_wire * obj);
59943+ /* how many bytes ->wire.write() consumes */
59944+ int (*size) (struct inode * inode);
 59945+ /* finish with object identity */
59946+ void (*done) (reiser4_object_on_wire * obj);
59947+ } wire;
59948+} file_plugin;
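/* Illustrative sketch (not in the original source): the estimate
 * methods size transaction reservations before the corresponding
 * operation runs. inode_file_plugin() is the accessor assumed to
 * return the file plugin attached to an inode. */
static reiser4_block_nr unlink_reservation(struct inode *object,
                                           struct inode *parent)
{
        file_plugin *fplug = inode_file_plugin(object); /* accessor assumed */

        return fplug->estimate.unlink(object, parent);
}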
59949+
59950+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
59951+
59952+struct reiser4_object_on_wire {
59953+ file_plugin *plugin;
59954+ union {
59955+ struct {
59956+ obj_key_id key_id;
59957+ } std;
59958+ void *generic;
59959+ } u;
59960+};
59961+
59962+/* builtin dir-plugins */
59963+typedef enum {
59964+ HASHED_DIR_PLUGIN_ID,
59965+ SEEKABLE_HASHED_DIR_PLUGIN_ID,
59966+ LAST_DIR_ID
59967+} reiser4_dir_id;
59968+
59969+typedef struct dir_plugin {
59970+ /* generic fields */
59971+ plugin_header h;
59972+
59973+ struct inode_operations inode_ops;
59974+ struct file_operations file_ops;
59975+ struct address_space_operations as_ops;
59976+
59977+ /*
59978+ * private methods: These are optional. If used they will allow you to
59979+ * minimize the amount of code needed to implement a deviation from
59980+ * some other method that uses them. You could logically argue that
59981+ * they should be a separate type of plugin.
59982+ */
59983+
59984+ struct dentry *(*get_parent) (struct inode * childdir);
59985+
59986+ /*
59987+ * check whether "name" is acceptable name to be inserted into this
59988+ * object. Optionally implemented by directory-like objects. Can check
59989+ * for maximal length, reserved symbols etc
59990+ */
59991+ int (*is_name_acceptable) (const struct inode * inode, const char *name,
59992+ int len);
59993+
59994+ void (*build_entry_key) (const struct inode * dir /* directory where
59995+ * entry is (or will
59996+ * be) in.*/ ,
59997+ const struct qstr * name /* name of file
59998+ * referenced by this
59999+ * entry */ ,
60000+ reiser4_key * result /* resulting key of
60001+ * directory entry */ );
60002+ int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60003+ int (*add_entry) (struct inode * object, struct dentry * where,
60004+ reiser4_object_create_data * data,
60005+ reiser4_dir_entry_desc * entry);
60006+ int (*rem_entry) (struct inode * object, struct dentry * where,
60007+ reiser4_dir_entry_desc * entry);
60008+
60009+ /*
60010+ * initialize directory structure for newly created object. For normal
60011+ * unix directories, insert dot and dotdot.
60012+ */
60013+ int (*init) (struct inode * object, struct inode * parent,
60014+ reiser4_object_create_data * data);
60015+
60016+ /* destroy directory */
60017+ int (*done) (struct inode * child);
60018+
60019+ /* called when @subdir was just looked up in the @dir */
60020+ int (*attach) (struct inode * subdir, struct inode * dir);
60021+ int (*detach) (struct inode * subdir, struct inode * dir);
60022+
60023+ struct {
60024+ reiser4_block_nr(*add_entry) (const struct inode *);
60025+ reiser4_block_nr(*rem_entry) (const struct inode *);
60026+ reiser4_block_nr(*unlink) (const struct inode *,
60027+ const struct inode *);
60028+ } estimate;
60029+} dir_plugin;
60030+
60031+extern dir_plugin dir_plugins[LAST_DIR_ID];
60032+
60033+typedef struct formatting_plugin {
60034+ /* generic fields */
60035+ plugin_header h;
60036+ /* returns non-zero iff file's tail has to be stored
60037+ in a direct item. */
60038+ int (*have_tail) (const struct inode * inode, loff_t size);
60039+} formatting_plugin;
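/* Illustrative sketch (not in the original source): the shape of a
 * ->have_tail policy. A trivial threshold policy could store files up
 * to one page as tails (4096 is an arbitrary illustrative cut-off). */
static int have_tail_small_files(const struct inode *inode, loff_t size)
{
        return size <= 4096;
}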
60040+
60041+typedef struct hash_plugin {
60042+ /* generic fields */
60043+ plugin_header h;
60044+ /* computes hash of the given name */
60045+ __u64(*hash) (const unsigned char *name, int len);
60046+} hash_plugin;
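/* Illustrative sketch (not in the original source): the shape of a
 * ->hash method, here the well-known "r5" function inherited from
 * reiserfs v3; the builtin instances are listed in reiser4_hash_id
 * below. */
static __u64 r5_hash_sketch(const unsigned char *name, int len)
{
        __u64 a = 0;
        int i;

        for (i = 0; i < len; ++i) {
                a += name[i] << 4;
                a += name[i] >> 4;
                a *= 11;
        }
        return a;
}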
60047+
60048+typedef struct cipher_plugin {
60049+ /* generic fields */
60050+ plugin_header h;
60051+ struct crypto_blkcipher * (*alloc) (void);
60052+ void (*free) (struct crypto_blkcipher * tfm);
60053+ /* Offset translator. For each offset this returns (k * offset), where
60054+ k (k >= 1) is an expansion factor of the cipher algorithm.
60055+ For all symmetric algorithms k == 1. For asymmetric algorithms (which
 60056+ inflate data) offset translation guarantees that all of a disk cluster's
 60057+ units will have keys smaller than the next cluster's.
60058+ */
60059+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
 60060+ /* Cipher algorithms can accept data only in chunks of cipher block
60061+ size. This method is to align any flow up to cipher block size when
60062+ we pass it to cipher algorithm. To align means to append padding of
60063+ special format specific to the cipher algorithm */
60064+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60065+ /* low-level key manager (check, install, etc..) */
60066+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60067+ unsigned int keylen);
60068+ /* main text processing procedures */
60069+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60070+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60071+} cipher_plugin;
60072+
60073+typedef struct digest_plugin {
60074+ /* generic fields */
60075+ plugin_header h;
60076+ /* fingerprint size in bytes */
60077+ int fipsize;
60078+ struct crypto_hash * (*alloc) (void);
60079+ void (*free) (struct crypto_hash * tfm);
60080+} digest_plugin;
60081+
60082+typedef struct compression_plugin {
60083+ /* generic fields */
60084+ plugin_header h;
60085+ int (*init) (void);
 60086+ /* the maximum number of bytes by which the size of the "compressed"
 60087+ * data can exceed that of the uncompressed data. */
60088+ int (*overrun) (unsigned src_len);
60089+ coa_t(*alloc) (tfm_action act);
60090+ void (*free) (coa_t coa, tfm_action act);
60091+ /* minimal size of the flow we still try to compress */
60092+ int (*min_size_deflate) (void);
60093+ __u32(*checksum) (char *data, __u32 length);
60094+ /* main transform procedures */
60095+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60096+ __u8 * dst_first, unsigned *dst_len);
60097+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60098+ __u8 * dst_first, unsigned *dst_len);
60099+} compression_plugin;
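/* Illustrative sketch (not in the original source): ->overrun() bounds
 * the worst-case expansion of incompressible input, so a caller sizes
 * the destination buffer for ->compress() like this. */
static unsigned deflate_dst_size(compression_plugin *cplug, unsigned src_len)
{
        return src_len + cplug->overrun(src_len);
}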
60100+
60101+typedef struct compression_mode_plugin {
60102+ /* generic fields */
60103+ plugin_header h;
60104+ /* this is called when estimating compressibility
60105+ of a logical cluster by its content */
60106+ int (*should_deflate) (struct inode * inode, cloff_t index);
60107+ /* this is called when results of compression should be saved */
60108+ int (*accept_hook) (struct inode * inode, cloff_t index);
60109+ /* this is called when results of compression should be discarded */
60110+ int (*discard_hook) (struct inode * inode, cloff_t index);
60111+} compression_mode_plugin;
60112+
60113+typedef struct cluster_plugin {
60114+ /* generic fields */
60115+ plugin_header h;
60116+ int shift;
60117+} cluster_plugin;
60118+
60119+typedef struct sd_ext_plugin {
60120+ /* generic fields */
60121+ plugin_header h;
60122+ int (*present) (struct inode * inode, char **area, int *len);
60123+ int (*absent) (struct inode * inode);
60124+ int (*save_len) (struct inode * inode);
60125+ int (*save) (struct inode * inode, char **area);
60126+ /* alignment requirement for this stat-data part */
60127+ int alignment;
60128+} sd_ext_plugin;
60129+
60130+/* this plugin contains methods to allocate objectid for newly created files,
60131+ to deallocate objectid when file gets removed, to report number of used and
60132+ free objectids */
60133+typedef struct oid_allocator_plugin {
60134+ /* generic fields */
60135+ plugin_header h;
60136+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60137+ __u64 oids);
60138+ /* used to report statfs->f_files */
60139+ __u64(*oids_used) (reiser4_oid_allocator * map);
60140+ /* get next oid to use */
60141+ __u64(*next_oid) (reiser4_oid_allocator * map);
60142+ /* used to report statfs->f_ffree */
60143+ __u64(*oids_free) (reiser4_oid_allocator * map);
60144+ /* allocate new objectid */
60145+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60146+ /* release objectid */
60147+ int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60148+ /* how many pages to reserve in transaction for allocation of new
60149+ objectid */
60150+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60151+ /* how many pages to reserve in transaction for freeing of an
60152+ objectid */
60153+ int (*oid_reserve_release) (reiser4_oid_allocator * map);
60154+ void (*print_info) (const char *, reiser4_oid_allocator *);
60155+} oid_allocator_plugin;
60156+
60157+/* disk layout plugin: this specifies super block, journal, bitmap (if there
60158+ are any) locations, etc */
60159+typedef struct disk_format_plugin {
60160+ /* generic fields */
60161+ plugin_header h;
60162+ /* replay journal, initialize super_info_data, etc */
60163+ int (*init_format) (struct super_block *, void *data);
60164+
60165+ /* key of root directory stat data */
60166+ const reiser4_key *(*root_dir_key) (const struct super_block *);
60167+
60168+ int (*release) (struct super_block *);
60169+ jnode *(*log_super) (struct super_block *);
60170+ int (*check_open) (const struct inode * object);
 60171+ int (*version_update) (struct super_block *);
60172+} disk_format_plugin;
60173+
60174+struct jnode_plugin {
60175+ /* generic fields */
60176+ plugin_header h;
60177+ int (*init) (jnode * node);
60178+ int (*parse) (jnode * node);
60179+ struct address_space *(*mapping) (const jnode * node);
60180+ unsigned long (*index) (const jnode * node);
60181+ jnode *(*clone) (jnode * node);
60182+};
60183+
60184+/* plugin instance. */
60185+/* */
 60186+/* This is a "wrapper" union for all types of plugins. Most of the code */
 60187+/* uses plugins of a particular type (file_plugin, dir_plugin, etc.) */
 60188+/* rather than operating on pointers to reiser4_plugin. This union is */
 60189+/* only used in some generic code in plugin/plugin.c that operates on */
 60190+/* all plugins. Technically speaking, the purpose of this union is to */
 60191+/* add type safety to said generic code: each plugin type (file_plugin, */
 60192+/* for example) contains plugin_header as its first member. This first */
 60193+/* member is located at the same place in memory as the .h member of */
 60194+/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and */
 60195+/* looks at .h, which is the header of the plugin type located in the */
 60196+/* union. This avoids type-casts. */
60197+union reiser4_plugin {
60198+ /* generic fields */
60199+ plugin_header h;
60200+ /* file plugin */
60201+ file_plugin file;
60202+ /* directory plugin */
60203+ dir_plugin dir;
60204+ /* hash plugin, used by directory plugin */
60205+ hash_plugin hash;
60206+ /* fibration plugin used by directory plugin */
60207+ fibration_plugin fibration;
60208+ /* cipher transform plugin, used by file plugin */
60209+ cipher_plugin cipher;
60210+ /* digest transform plugin, used by file plugin */
60211+ digest_plugin digest;
60212+ /* compression transform plugin, used by file plugin */
60213+ compression_plugin compression;
60214+ /* tail plugin, used by file plugin */
60215+ formatting_plugin formatting;
60216+ /* permission plugin */
60217+ perm_plugin perm;
60218+ /* node plugin */
60219+ node_plugin node;
60220+ /* item plugin */
60221+ item_plugin item;
60222+ /* stat-data extension plugin */
60223+ sd_ext_plugin sd_ext;
60224+ /* disk layout plugin */
60225+ disk_format_plugin format;
60226+ /* object id allocator plugin */
60227+ oid_allocator_plugin oid_allocator;
60228+ /* plugin for different jnode types */
60229+ jnode_plugin jnode;
60230+ /* compression mode plugin, used by object plugin */
60231+ compression_mode_plugin compression_mode;
60232+ /* cluster plugin, used by object plugin */
60233+ cluster_plugin clust;
60234+ /* place-holder for new plugin types that can be registered
60235+ dynamically, and used by other dynamically loaded plugins. */
60236+ void *generic;
60237+};
60238+
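For illustration, the first-member aliasing that the comment above relies on
can be exercised with a minimal userspace sketch; the types below are
simplified stand-ins, not the patch's own definitions:

#include <stdio.h>

typedef struct plugin_header { int type_id; const char *label; } plugin_header;
typedef struct file_plugin { plugin_header h; /* methods would follow */ } file_plugin;
typedef union reiser4_plugin { plugin_header h; file_plugin file; } reiser4_plugin;

int main(void)
{
	file_plugin fp = { .h = { .type_id = 0, .label = "unix-file" } };
	/* plugin_header is the first member of every plugin type, so generic
	   code can read the header through the union without a type-cast. */
	reiser4_plugin *p = (reiser4_plugin *)&fp;
	printf("%s\n", p->h.label);	/* prints "unix-file" */
	return 0;
}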
60239+struct reiser4_plugin_ops {
60240+ /* called when plugin is initialized */
60241+ int (*init) (reiser4_plugin * plugin);
60242+ /* called when plugin is unloaded */
60243+ int (*done) (reiser4_plugin * plugin);
60244+ /* load given plugin from disk */
60245+ int (*load) (struct inode * inode,
60246+ reiser4_plugin * plugin, char **area, int *len);
60247+ /* how much space is required to store this plugin's state
60248+ in stat-data */
60249+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
60250+ /* save persistent plugin-data to disk */
60251+ int (*save) (struct inode * inode, reiser4_plugin * plugin,
60252+ char **area);
60253+ /* alignment requirement for on-disk state of this plugin
60254+ in number of bytes */
60255+ int alignment;
60256+ /* install itself into the given inode. This can return an error
60257+ (e.g., you cannot change the hash plugin of a non-empty directory). */
60258+ int (*change) (struct inode * inode, reiser4_plugin * plugin,
60259+ pset_member memb);
60260+ /* inherit the plugin from the parent inode. This can also
60261+ return an error. */
60262+ int (*inherit) (struct inode * inode, struct inode * parent,
60263+ reiser4_plugin * plugin);
60264+};
60265+
60266+/* functions implemented in fs/reiser4/plugin/plugin.c */
60267+
60268+/* stores plugin reference in reiser4-specific part of inode */
60269+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
60270+extern int init_plugins(void);
60271+
60272+/* builtin plugins */
60273+
60274+/* builtin hash-plugins */
60275+
60276+typedef enum {
60277+ RUPASOV_HASH_ID,
60278+ R5_HASH_ID,
60279+ TEA_HASH_ID,
60280+ FNV1_HASH_ID,
60281+ DEGENERATE_HASH_ID,
60282+ LAST_HASH_ID
60283+} reiser4_hash_id;
60284+
60285+/* builtin cipher plugins */
60286+
60287+typedef enum {
60288+ NONE_CIPHER_ID,
60289+ LAST_CIPHER_ID
60290+} reiser4_cipher_id;
60291+
60292+/* builtin digest plugins */
60293+
60294+typedef enum {
60295+ SHA256_32_DIGEST_ID,
60296+ LAST_DIGEST_ID
60297+} reiser4_digest_id;
60298+
60299+/* builtin compression mode plugins */
60300+typedef enum {
60301+ NONE_COMPRESSION_MODE_ID,
60302+ LATTD_COMPRESSION_MODE_ID,
60303+ ULTIM_COMPRESSION_MODE_ID,
60304+ FORCE_COMPRESSION_MODE_ID,
60305+ CONVX_COMPRESSION_MODE_ID,
60306+ LAST_COMPRESSION_MODE_ID
60307+} reiser4_compression_mode_id;
60308+
60309+/* builtin cluster plugins */
60310+typedef enum {
60311+ CLUSTER_64K_ID,
60312+ CLUSTER_32K_ID,
60313+ CLUSTER_16K_ID,
60314+ CLUSTER_8K_ID,
60315+ CLUSTER_4K_ID,
60316+ LAST_CLUSTER_ID
60317+} reiser4_cluster_id;
60318+
60319+/* builtin tail-plugins */
60320+
60321+typedef enum {
60322+ NEVER_TAILS_FORMATTING_ID,
60323+ ALWAYS_TAILS_FORMATTING_ID,
60324+ SMALL_FILE_FORMATTING_ID,
60325+ LAST_TAIL_FORMATTING_ID
60326+} reiser4_formatting_id;
60327+
60328+/* data type used to pack parameters that we pass to vfs object creation
60329+ function create_object() */
60330+struct reiser4_object_create_data {
60331+ /* plugin to control created object */
60332+ reiser4_file_id id;
60333+ /* mode of regular file, directory or special file */
60334+/* what happens if some other sort of perm plugin is in use? */
60335+ int mode;
60336+ /* rdev of special file */
60337+ dev_t rdev;
60338+ /* symlink target */
60339+ const char *name;
60340+ /* add here something for non-standard objects you invent, like
60341+ query for interpolation file etc. */
60342+
60343+ struct reiser4_crypto_info * crypto;
60344+
60345+ struct inode *parent;
60346+ struct dentry *dentry;
60347+};
60348+
60349+/* description of directory entry being created/destroyed/sought for
60350+
60351+ It is passed down to the directory plugin and further to the
60352+ directory item plugin methods. Creation of a new directory entry is done in
60353+ several stages: first we search for an entry with the same name, then
60354+ create a new one. reiser4_dir_entry_desc is used to store information
60355+ collected at one stage of this process and required later: the key of the
60356+ item that we want to insert/delete and a pointer to the object that will
60357+ be bound by the new directory entry. Probably some more fields will
60358+ be added here.
60359+
60360+*/
60361+struct reiser4_dir_entry_desc {
60362+ /* key of directory entry */
60363+ reiser4_key key;
60364+ /* object bound by this entry. */
60365+ struct inode *obj;
60366+};
60367+
60368+#define MAX_PLUGIN_TYPE_LABEL_LEN 32
60369+#define MAX_PLUGIN_PLUG_LABEL_LEN 32
60370+
60371+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
60372+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
60373+{ \
60374+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
60375+ return plugin ? & plugin -> FIELD : NULL; \
60376+} \
60377+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
60378+{ \
60379+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
60380+ return plugin ? & plugin -> FIELD : NULL; \
60381+} \
60382+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
60383+{ \
60384+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
60385+ return plugin ? & plugin -> FIELD : NULL; \
60386+} \
60387+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
60388+{ \
60389+ return ( reiser4_plugin * ) plugin; \
60390+} \
60391+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
60392+{ \
60393+ return TYPE ## _to_plugin (plugin) -> h.id; \
60394+} \
60395+typedef struct { int foo; } TYPE ## _plugin_dummy
60396+
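For reference, expanding PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE,
hash) mechanically yields accessors of the following shape (the trailing dummy
typedef only exists to force a semicolon at the instantiation site):

static inline hash_plugin *hash_plugin_by_id(reiser4_plugin_id id)
{
	reiser4_plugin *plugin = plugin_by_id(REISER4_HASH_PLUGIN_TYPE, id);
	return plugin ? &plugin->hash : NULL;
}
/* ...plus hash_plugin_by_disk_id(), hash_plugin_by_unsafe_id(),
   hash_plugin_to_plugin() and hash_plugin_id(), built the same way. */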
60397+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
60398+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
60399+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
60400+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
60401+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
60402+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
60403+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
60404+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
60405+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
60406+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
60407+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
60408+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
60409+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
60410+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
60411+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60412+ compression_mode);
60413+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
60414+
60415+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
60416+
60417+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
60418+
60419+#define for_all_plugins(ptype, plugin) \
60420+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
60421+ get_plugin_list(ptype) != &plugin->h.linkage; \
60422+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
60423+
60424+
60425+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
60426+extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
60427+extern int finish_pset(struct inode *inode);
60428+
60429+/* defined in fs/reiser4/plugin/object.c */
60430+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60431+/* defined in fs/reiser4/plugin/object.c */
60432+extern dir_plugin dir_plugins[LAST_DIR_ID];
60433+/* defined in fs/reiser4/plugin/item/static_stat.c */
60434+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
60435+/* defined in fs/reiser4/plugin/hash.c */
60436+extern hash_plugin hash_plugins[LAST_HASH_ID];
60437+/* defined in fs/reiser4/plugin/fibration.c */
60438+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
60439+/* defined in fs/reiser4/plugin/crypt.c */
60440+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
60441+/* defined in fs/reiser4/plugin/digest.c */
60442+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
60443+/* defined in fs/reiser4/plugin/compress/compress.c */
60444+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
60445+/* defined in fs/reiser4/plugin/compress/compression_mode.c */
60446+extern compression_mode_plugin
60447+compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
60448+/* defined in fs/reiser4/plugin/cluster.c */
60449+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
60450+/* defined in fs/reiser4/plugin/tail.c */
60451+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
60452+/* defined in fs/reiser4/plugin/security/security.c */
60453+extern perm_plugin perm_plugins[LAST_PERM_ID];
60454+/* defined in fs/reiser4/plugin/item/item.c */
60455+extern item_plugin item_plugins[LAST_ITEM_ID];
60456+/* defined in fs/reiser4/plugin/node/node.c */
60457+extern node_plugin node_plugins[LAST_NODE_ID];
60458+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
60459+extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
60460+
60461+/* __FS_REISER4_PLUGIN_TYPES_H__ */
60462+#endif
60463+
60464+/* Make Linus happy.
60465+ Local variables:
60466+ c-indentation-style: "K&R"
60467+ mode-name: "LC"
60468+ c-basic-offset: 8
60469+ tab-width: 8
60470+ fill-column: 120
60471+ End:
60472+*/
60473diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.22/fs/reiser4/plugin/plugin_header.h
60474--- linux-2.6.22.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
60475+++ linux-2.6.22/fs/reiser4/plugin/plugin_header.h 2007-07-29 00:25:34.996727537 +0400
60476@@ -0,0 +1,155 @@
60477+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60478+
60479+/* plugin header. Data structures required by all plugin types. */
60480+
60481+#if !defined( __PLUGIN_HEADER_H__ )
60482+#define __PLUGIN_HEADER_H__
60483+
60484+/* plugin data-types and constants */
60485+
60486+#include "../debug.h"
60487+#include "../dformat.h"
60488+
60489+/* Every plugin type can be considered as a class of virtual objects
60490+ {(type, i) | i = 0, 1, ...}, which has one of the following categories
60491+ of virtualization:
60492+ A - no virtualization;
60493+ F - per-file virtualization;
60494+ S - per-superblock virtualization;
60495+ FIXME-EDWARD: Define every such category */
60496+
60497+/* Supported plugin types: (id, (virtualization category), short description) */
60498+typedef enum {
60499+ REISER4_FILE_PLUGIN_TYPE, /* (F) service VFS entry-points */
60500+ REISER4_DIR_PLUGIN_TYPE, /* (F) service VFS entry-points */
60501+ REISER4_ITEM_PLUGIN_TYPE, /* (F) manage items */
60502+ REISER4_NODE_PLUGIN_TYPE, /* (S) manage formatted nodes */
60503+ REISER4_HASH_PLUGIN_TYPE, /* (F) compute hash */
60504+ REISER4_FIBRATION_PLUGIN_TYPE, /* (F) directory fibrations */
60505+ REISER4_FORMATTING_PLUGIN_TYPE, /* (F) tail-packing policy */
60506+ REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */
60507+ REISER4_SD_EXT_PLUGIN_TYPE, /* (A) stat-data extensions */
60508+ REISER4_FORMAT_PLUGIN_TYPE, /* (S) specify disk format */
60509+ REISER4_JNODE_PLUGIN_TYPE, /* (A) in-memory node headers */
60510+ REISER4_CIPHER_PLUGIN_TYPE, /* (F) cipher transform algs */
60511+ REISER4_DIGEST_PLUGIN_TYPE, /* (F) digest transform algs */
60512+ REISER4_COMPRESSION_PLUGIN_TYPE, /* (F) compression tfm algs */
60513+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
60514+ REISER4_CLUSTER_PLUGIN_TYPE, /* (F) size of logical cluster */
60515+ REISER4_PLUGIN_TYPES
60516+} reiser4_plugin_type;
60517+
60518+/* Supported plugin groups */
60519+typedef enum {
60520+ REISER4_DIRECTORY_FILE,
60521+ REISER4_REGULAR_FILE,
60522+ REISER4_SYMLINK_FILE,
60523+ REISER4_SPECIAL_FILE,
60524+} file_plugin_group;
60525+
60526+struct reiser4_plugin_ops;
60527+/* generic plugin operations, supported by each
60528+ plugin type. */
60529+typedef struct reiser4_plugin_ops reiser4_plugin_ops;
60530+
60531+/* the common part of all plugin instances. */
60532+typedef struct plugin_header {
60533+ /* plugin type */
60534+ reiser4_plugin_type type_id;
60535+ /* id of this plugin */
60536+ reiser4_plugin_id id;
60537+ /* bitmask of groups the plugin belongs to. */
60538+ reiser4_plugin_groups groups;
60539+ /* plugin operations */
60540+ reiser4_plugin_ops *pops;
60541+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
60542+ /* short label of this plugin */
60543+ const char *label;
60544+ /* descriptive string */
60545+ const char *desc;
60546+ /* list linkage */
60547+ struct list_head linkage;
60548+} plugin_header;
60549+
60550+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
60551+
60552+/* PRIVATE INTERFACES */
60553+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
60554+/* plugin type representation. */
60555+struct reiser4_plugin_type_data {
60556+ /* internal plugin type identifier. Should coincide with
60557+ index of this item in plugins[] array. */
60558+ reiser4_plugin_type type_id;
60559+ /* short symbolic label of this plugin type. Should be no longer
60560+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
60561+ const char *label;
60562+ /* plugin type description longer than .label */
60563+ const char *desc;
60564+
60565+/* NIKITA-FIXME-HANS: define built-in */
60566+ /* number of built-in plugin instances of this type */
60567+ int builtin_num;
60568+ /* array of built-in plugins */
60569+ void *builtin;
60570+ struct list_head plugins_list;
60571+ size_t size;
60572+};
60573+
60574+extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
60575+
60576+int is_plugin_type_valid(reiser4_plugin_type type);
60577+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
60578+
60579+static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data * ptype,
60580+ int i)
60581+{
60582+ char *builtin;
60583+
60584+ builtin = ptype->builtin;
60585+ return (reiser4_plugin *) (builtin + i * ptype->size);
60586+}
60587+
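The pointer arithmetic in plugin_at() indexes an array whose element size is
known only at runtime. A self-contained sketch of the same idiom (illustrative
names, not the patch's types):

#include <stdio.h>
#include <stddef.h>

struct header { int id; };
struct record { struct header h; double payload; };

/* i-th element of an array of size-`size` records, given an untyped base. */
static struct header *record_at(void *builtin, size_t size, int i)
{
	return (struct header *)((char *)builtin + i * size);
}

int main(void)
{
	struct record table[3] = { {{0}, 0.5}, {{1}, 1.5}, {{2}, 2.5} };
	printf("%d\n", record_at(table, sizeof(struct record), 2)->id); /* 2 */
	return 0;
}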
60588+/* return plugin by its @type_id and @id */
60589+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
60590+ reiser4_plugin_id id)
44254afd 60591+{
60592+ assert("nikita-1651", is_plugin_type_valid(type));
60593+ assert("nikita-1652", is_plugin_id_valid(type, id));
60594+ return plugin_at(&plugins[type], id);
60595+}
60596+
60597+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
60598+ reiser4_plugin_id id);
60599+
60600+/**
60601+ * plugin_by_disk_id - get reiser4_plugin
60602+ * @type_id: plugin type id
60603+ * @plugin_id: plugin id in disk format
60604+ *
60605+ * Returns reiser4_plugin by plugin type id and plugin id.
60606+ */
60607+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
60608+ reiser4_plugin_type type_id,
60609+ __le16 *plugin_id)
60610+{
60611+ /*
60612+ * what we should do properly is to maintain within each file-system a
60613+ * dictionary that maps on-disk plugin ids to "universal" ids. This
60614+ * dictionary will be resolved on mount time, so that this function
60615+ * will perform just one additional array lookup.
60616+ */
60617+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
60618+}
60619+
60620+/* __PLUGIN_HEADER_H__ */
60621+#endif
60622+
60623+/*
60624+ * Local variables:
60625+ * c-indentation-style: "K&R"
60626+ * mode-name: "LC"
60627+ * c-basic-offset: 8
60628+ * tab-width: 8
60629+ * fill-column: 79
60630+ * End:
60631+ */
60632diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.22/fs/reiser4/plugin/plugin_set.c
60633--- linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 03:00:00.000000000 +0300
60634+++ linux-2.6.22/fs/reiser4/plugin/plugin_set.c 2007-07-29 00:25:35.000728572 +0400
60635@@ -0,0 +1,379 @@
60636+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60637+ * reiser4/README */
60638+/* This file contains Reiser4 plugin set operations */
60639+
60640+/* plugin sets
60641+ *
60642+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
60643+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
60644+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This
60645+ * set of plugins (so called pset) is described by structure plugin_set (see
60646+ * plugin/plugin_set.h), which contains pointers to all required plugins.
60647+ *
60648+ * Children can inherit some pset members from their parent, but sometimes
60649+ * it is useful to specify members different from the parent's. Since an
60650+ * object's pset cannot be easily changed without fatal consequences, we use
60651+ * for this purpose another special plugin table (the so-called hset, or heir
60652+ * set) described by the same structure.
60653+ *
60654+ * An inode only stores pointers to its pset and hset. Different inodes with
60655+ * the same set of pset (hset) members point to the same pset (hset). This is
60656+ * achieved by storing psets and hsets in a global hash table. Races are
60657+ * avoided by the simple (and so far efficient) solution of never recycling
60658+ * psets, even when the last inode pointing to one is destroyed.
60659+ */
60660+
60661+#include "../debug.h"
60662+#include "../super.h"
60663+#include "plugin_set.h"
60664+
60665+#include <linux/slab.h>
60666+#include <linux/stddef.h>
60667+
60668+/* slab for plugin sets */
60669+static struct kmem_cache *plugin_set_slab;
60670+
60671+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
60672+ [0 ... 7] = SPIN_LOCK_UNLOCKED
60673+};
60674+
60675+/* hash table support */
60676+
60677+#define PS_TABLE_SIZE (32)
60678+
60679+static inline plugin_set *cast_to(const unsigned long *a)
60680+{
60681+ return container_of(a, plugin_set, hashval);
60682+}
60683+
60684+static inline int pseq(const unsigned long *a1, const unsigned long *a2)
60685+{
60686+ plugin_set *set1;
60687+ plugin_set *set2;
60688+
60689+ /* make sure fields are not missed in the code below */
60690+ cassert(sizeof *set1 ==
60691+ sizeof set1->hashval +
60692+ sizeof set1->link +
60693+ sizeof set1->file +
60694+ sizeof set1->dir +
60695+ sizeof set1->perm +
60696+ sizeof set1->formatting +
60697+ sizeof set1->hash +
60698+ sizeof set1->fibration +
60699+ sizeof set1->sd +
60700+ sizeof set1->dir_item +
60701+ sizeof set1->cipher +
60702+ sizeof set1->digest +
60703+ sizeof set1->compression +
60704+ sizeof set1->compression_mode +
60705+ sizeof set1->cluster +
60706+ sizeof set1->create);
60707+
60708+ set1 = cast_to(a1);
60709+ set2 = cast_to(a2);
60710+ return
60711+ set1->hashval == set2->hashval &&
60712+ set1->file == set2->file &&
60713+ set1->dir == set2->dir &&
60714+ set1->perm == set2->perm &&
60715+ set1->formatting == set2->formatting &&
60716+ set1->hash == set2->hash &&
60717+ set1->fibration == set2->fibration &&
60718+ set1->sd == set2->sd &&
60719+ set1->dir_item == set2->dir_item &&
60720+ set1->cipher == set2->cipher &&
60721+ set1->digest == set2->digest &&
60722+ set1->compression == set2->compression &&
60723+ set1->compression_mode == set2->compression_mode &&
60724+ set1->cluster == set2->cluster &&
60725+ set1->create == set2->create;
60726+}
60727+
60728+#define HASH_FIELD(hash, set, field) \
60729+({ \
60730+ (hash) += (unsigned long)(set)->field >> 2; \
60731+})
60732+
60733+static inline unsigned long calculate_hash(const plugin_set * set)
60734+{
60735+ unsigned long result;
60736+
60737+ result = 0;
60738+ HASH_FIELD(result, set, file);
60739+ HASH_FIELD(result, set, dir);
60740+ HASH_FIELD(result, set, perm);
60741+ HASH_FIELD(result, set, formatting);
60742+ HASH_FIELD(result, set, hash);
60743+ HASH_FIELD(result, set, fibration);
60744+ HASH_FIELD(result, set, sd);
60745+ HASH_FIELD(result, set, dir_item);
60746+ HASH_FIELD(result, set, cipher);
60747+ HASH_FIELD(result, set, digest);
60748+ HASH_FIELD(result, set, compression);
60749+ HASH_FIELD(result, set, compression_mode);
60750+ HASH_FIELD(result, set, cluster);
60751+ HASH_FIELD(result, set, create);
60752+ return result & (PS_TABLE_SIZE - 1);
60753+}
60754+
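Each pset member is a pointer, so HASH_FIELD folds the pointer value (shifted
right by 2 to drop alignment zeroes) into a running sum, and the final mask
keeps log2(PS_TABLE_SIZE) = 5 bits. A quick userspace check of that folding,
with made-up pointer values:

#include <stdio.h>

#define PS_TABLE_SIZE 32

int main(void)
{
	/* pretend these are two plugin pointers stored in a pset */
	unsigned long file_plug = 0xc0401240UL, dir_plug = 0xc0401380UL;
	unsigned long hash = 0;

	hash += file_plug >> 2;
	hash += dir_plug >> 2;
	printf("bucket = %lu\n", hash & (PS_TABLE_SIZE - 1)); /* bucket = 16 */
	return 0;
}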
60755+static inline unsigned long
60756+pshash(ps_hash_table * table, const unsigned long *a)
60757+{
60758+ return *a;
60759+}
60760+
60761+/* The hash table definition */
60762+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
60763+#define KFREE(ptr, size) kfree(ptr)
60764+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
60765+ pseq);
60766+#undef KFREE
60767+#undef KMALLOC
60768+
60769+static ps_hash_table ps_table;
60770+static plugin_set empty_set = {
60771+ .hashval = 0,
60772+ .file = NULL,
60773+ .dir = NULL,
60774+ .perm = NULL,
60775+ .formatting = NULL,
60776+ .hash = NULL,
60777+ .fibration = NULL,
60778+ .sd = NULL,
60779+ .dir_item = NULL,
60780+ .cipher = NULL,
60781+ .digest = NULL,
60782+ .compression = NULL,
60783+ .compression_mode = NULL,
60784+ .cluster = NULL,
60785+ .create = NULL,
60786+ .link = {NULL}
60787+};
60788+
60789+plugin_set *plugin_set_get_empty(void)
60790+{
60791+ return &empty_set;
60792+}
60793+
60794+void plugin_set_put(plugin_set * set)
60795+{
60796+}
60797+
60798+static inline unsigned long *pset_field(plugin_set * set, int offset)
60799+{
60800+ return (unsigned long *)(((char *)set) + offset);
60801+}
60802+
60803+static int plugin_set_field(plugin_set ** set, const unsigned long val,
60804+ const int offset)
60805+{
60806+ unsigned long *spot;
60807+ spinlock_t *lock;
60808+ plugin_set replica;
60809+ plugin_set *twin;
60810+ plugin_set *psal;
60811+ plugin_set *orig;
60812+
60813+ assert("nikita-2902", set != NULL);
60814+ assert("nikita-2904", *set != NULL);
60815+
60816+ spot = pset_field(*set, offset);
60817+ if (unlikely(*spot == val))
60818+ return 0;
60819+
60820+ replica = *(orig = *set);
60821+ *pset_field(&replica, offset) = val;
60822+ replica.hashval = calculate_hash(&replica);
60823+ rcu_read_lock();
60824+ twin = ps_hash_find(&ps_table, &replica.hashval);
60825+ if (unlikely(twin == NULL)) {
60826+ rcu_read_unlock();
60827+ psal = kmem_cache_alloc(plugin_set_slab,
60828+ reiser4_ctx_gfp_mask_get());
60829+ if (psal == NULL)
60830+ return RETERR(-ENOMEM);
60831+ *psal = replica;
60832+ lock = &plugin_set_lock[replica.hashval & 7];
60833+ spin_lock(lock);
60834+ twin = ps_hash_find(&ps_table, &replica.hashval);
60835+ if (likely(twin == NULL)) {
60836+ *set = psal;
60837+ ps_hash_insert_rcu(&ps_table, psal);
60838+ } else {
60839+ *set = twin;
60840+ kmem_cache_free(plugin_set_slab, psal);
60841+ }
60842+ spin_unlock(lock);
60843+ } else {
60844+ rcu_read_unlock();
60845+ *set = twin;
60846+ }
60847+ return 0;
60848+}
60849+
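Setting aside the RCU read side and the striped spinlocks, the routine above
is interning applied to plugin sets: build the modified copy, look it up,
insert it if absent, and share the existing twin otherwise, never freeing a
published set. A simplified single-threaded sketch of that contract
(illustrative, not the kernel code):

#include <stdlib.h>
#include <string.h>

struct pset { unsigned long hashval; void *member[4]; struct pset *next; };
static struct pset *table;		/* toy intern table; never shrinks */

static struct pset *intern(const struct pset *want)
{
	struct pset *p;

	for (p = table; p != NULL; p = p->next)
		if (p->hashval == want->hashval &&
		    memcmp(p->member, want->member, sizeof(p->member)) == 0)
			return p;	/* share the existing twin */
	p = malloc(sizeof(*p));
	if (p == NULL)
		return NULL;
	*p = *want;
	p->next = table;
	table = p;			/* publish; never recycled */
	return p;
}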
60850+static struct {
60851+ int offset;
60852+ reiser4_plugin_groups groups;
60853+ reiser4_plugin_type type;
60854+} pset_descr[PSET_LAST] = {
60855+ [PSET_FILE] = {
60856+ .offset = offsetof(plugin_set, file),
60857+ .type = REISER4_FILE_PLUGIN_TYPE,
60858+ .groups = 0
60859+ },
60860+ [PSET_DIR] = {
60861+ .offset = offsetof(plugin_set, dir),
60862+ .type = REISER4_DIR_PLUGIN_TYPE,
60863+ .groups = 0
60864+ },
60865+ [PSET_PERM] = {
60866+ .offset = offsetof(plugin_set, perm),
60867+ .type = REISER4_PERM_PLUGIN_TYPE,
60868+ .groups = 0
60869+ },
60870+ [PSET_FORMATTING] = {
60871+ .offset = offsetof(plugin_set, formatting),
60872+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
60873+ .groups = 0
60874+ },
60875+ [PSET_HASH] = {
60876+ .offset = offsetof(plugin_set, hash),
60877+ .type = REISER4_HASH_PLUGIN_TYPE,
60878+ .groups = 0
60879+ },
60880+ [PSET_FIBRATION] = {
60881+ .offset = offsetof(plugin_set, fibration),
60882+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
60883+ .groups = 0
60884+ },
60885+ [PSET_SD] = {
60886+ .offset = offsetof(plugin_set, sd),
60887+ .type = REISER4_ITEM_PLUGIN_TYPE,
60888+ .groups = (1 << STAT_DATA_ITEM_TYPE)
60889+ },
60890+ [PSET_DIR_ITEM] = {
60891+ .offset = offsetof(plugin_set, dir_item),
60892+ .type = REISER4_ITEM_PLUGIN_TYPE,
60893+ .groups = (1 << DIR_ENTRY_ITEM_TYPE)
60894+ },
60895+ [PSET_CIPHER] = {
60896+ .offset = offsetof(plugin_set, cipher),
60897+ .type = REISER4_CIPHER_PLUGIN_TYPE,
60898+ .groups = 0
60899+ },
60900+ [PSET_DIGEST] = {
60901+ .offset = offsetof(plugin_set, digest),
60902+ .type = REISER4_DIGEST_PLUGIN_TYPE,
60903+ .groups = 0
60904+ },
60905+ [PSET_COMPRESSION] = {
60906+ .offset = offsetof(plugin_set, compression),
60907+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
60908+ .groups = 0
60909+ },
60910+ [PSET_COMPRESSION_MODE] = {
60911+ .offset = offsetof(plugin_set, compression_mode),
60912+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60913+ .groups = 0
60914+ },
60915+ [PSET_CLUSTER] = {
60916+ .offset = offsetof(plugin_set, cluster),
60917+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
60918+ .groups = 0
60919+ },
60920+ [PSET_CREATE] = {
60921+ .offset = offsetof(plugin_set, create),
60922+ .type = REISER4_FILE_PLUGIN_TYPE,
60923+ .groups = (1 << REISER4_REGULAR_FILE)
60924+ }
60925+};
60926+
60927+#define DEFINE_PSET_OPS(PREFIX) \
60928+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
60929+{ \
60930+ if (memb >= PSET_LAST) \
60931+ return REISER4_PLUGIN_TYPES; \
60932+ return pset_descr[memb].type; \
60933+} \
60934+ \
60935+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
60936+ reiser4_plugin * plugin) \
60937+{ \
60938+ assert("nikita-3492", set != NULL); \
60939+ assert("nikita-3493", *set != NULL); \
60940+ assert("nikita-3494", plugin != NULL); \
60941+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
60942+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
60943+ \
60944+ if (pset_descr[memb].groups) \
60945+ if (!(pset_descr[memb].groups & plugin->h.groups)) \
60946+ return -EINVAL; \
60947+ \
60948+ return plugin_set_field(set, \
60949+ (unsigned long)plugin, pset_descr[memb].offset); \
60950+} \
60951+ \
60952+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
60953+{ \
60954+ assert("nikita-3497", set != NULL); \
60955+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
60956+ \
60957+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
60958+}
60959+
60960+DEFINE_PSET_OPS(aset);
60961+
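Instantiated with PREFIX = aset, the macro above emits exactly the three
functions declared in plugin_set.h further below:

reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
int aset_set_unsafe(plugin_set **set, pset_member memb, reiser4_plugin *plugin);
reiser4_plugin *aset_get(plugin_set *set, pset_member memb);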
60962+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
60963+ return plugin_set_field(set,
60964+ (unsigned long)plugin, pset_descr[memb].offset);
60965+}
60966+
60967+/**
60968+ * init_plugin_set - create plugin set cache and hash table
60969+ *
60970+ * Initializes slab cache of plugin_set-s and their hash table. It is part of
60971+ * reiser4 module initialization.
60972+ */
60973+int init_plugin_set(void)
60974+{
60975+ int result;
60976+
60977+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
60978+ if (result == 0) {
60979+ plugin_set_slab = kmem_cache_create("plugin_set",
60980+ sizeof(plugin_set), 0,
60981+ SLAB_HWCACHE_ALIGN,
60982+ NULL, NULL);
60983+ if (plugin_set_slab == NULL)
60984+ result = RETERR(-ENOMEM);
60985+ }
60986+ return result;
60987+}
60988+
60989+/**
60990+ * done_plugin_set - delete plugin_set cache and plugin_set hash table
60991+ *
60992+ * This is called on reiser4 module unloading or system shutdown.
60993+ */
60994+void done_plugin_set(void)
60995+{
60996+ plugin_set *cur, *next;
60997+
60998+ for_all_in_htable(&ps_table, ps, cur, next) {
60999+ ps_hash_remove(&ps_table, cur);
61000+ kmem_cache_free(plugin_set_slab, cur);
61001+ }
61002+ destroy_reiser4_cache(&plugin_set_slab);
61003+ ps_hash_done(&ps_table);
61004+}
61005+
61006+/*
61007+ * Local variables:
61008+ * c-indentation-style: "K&R"
61009+ * mode-name: "LC"
61010+ * c-basic-offset: 8
61011+ * tab-width: 8
61012+ * fill-column: 120
61013+ * End:
61014+ */
61015diff -urN linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.22/fs/reiser4/plugin/plugin_set.h
61016--- linux-2.6.22.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 03:00:00.000000000 +0300
61017+++ linux-2.6.22/fs/reiser4/plugin/plugin_set.h 2007-07-29 00:25:35.000728572 +0400
61018@@ -0,0 +1,77 @@
61019+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61020+
61021+/* Reiser4 plugin set definition.
61022+ See fs/reiser4/plugin/plugin_set.c for details */
61023+
61024+#if !defined( __PLUGIN_SET_H__ )
61025+#define __PLUGIN_SET_H__
61026+
61027+#include "../type_safe_hash.h"
61028+#include "plugin.h"
61029+
61030+#include <linux/rcupdate.h>
61031+
61032+struct plugin_set;
61033+typedef struct plugin_set plugin_set;
61034+
61035+TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61036+
61037+struct plugin_set {
61038+ unsigned long hashval;
61039+ /* plugin of file */
61040+ file_plugin *file;
61041+ /* plugin of dir */
61042+ dir_plugin *dir;
61043+ /* perm plugin for this file */
61044+ perm_plugin *perm;
61045+ /* tail policy plugin. Only meaningful for regular files */
61046+ formatting_plugin *formatting;
61047+ /* hash plugin. Only meaningful for directories. */
61048+ hash_plugin *hash;
61049+ /* fibration plugin. Only meaningful for directories. */
61050+ fibration_plugin *fibration;
61051+ /* plugin of stat-data */
61052+ item_plugin *sd;
61053+ /* plugin of items a directory is built of */
61054+ item_plugin *dir_item;
61055+ /* cipher plugin */
61056+ cipher_plugin *cipher;
61057+ /* digest plugin */
61058+ digest_plugin *digest;
61059+ /* compression plugin */
61060+ compression_plugin *compression;
61061+ /* compression mode plugin */
61062+ compression_mode_plugin *compression_mode;
61063+ /* cluster plugin */
61064+ cluster_plugin *cluster;
61065+ /* this specifies file plugin of regular children.
61066+ only meaningful for directories */
61067+ file_plugin *create;
61068+ ps_hash_link link;
61069+};
61070+
61071+extern plugin_set *plugin_set_get_empty(void);
61072+extern void plugin_set_put(plugin_set * set);
61073+
61074+extern int init_plugin_set(void);
61075+extern void done_plugin_set(void);
61076+
61077+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
61078+extern int set_plugin(plugin_set ** set, pset_member memb,
61079+ reiser4_plugin * plugin);
61080+extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
61081+ reiser4_plugin * plugin);
61082+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
61083+
61084+/* __PLUGIN_SET_H__ */
61085+#endif
61086+
61087+/* Make Linus happy.
61088+ Local variables:
61089+ c-indentation-style: "K&R"
61090+ mode-name: "LC"
61091+ c-basic-offset: 8
61092+ tab-width: 8
61093+ fill-column: 120
61094+ End:
61095+*/
61096diff -urN linux-2.6.22.orig/fs/reiser4/plugin/security/Makefile linux-2.6.22/fs/reiser4/plugin/security/Makefile
61097--- linux-2.6.22.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 03:00:00.000000000 +0300
61098+++ linux-2.6.22/fs/reiser4/plugin/security/Makefile 2007-07-29 00:25:35.000728572 +0400
61099@@ -0,0 +1,4 @@
61100+obj-$(CONFIG_REISER4_FS) += security_plugins.o
61101+
61102+security_plugins-objs := \
61103+ perm.o
61104diff -urN linux-2.6.22.orig/fs/reiser4/plugin/security/perm.c linux-2.6.22/fs/reiser4/plugin/security/perm.c
61105--- linux-2.6.22.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
61106+++ linux-2.6.22/fs/reiser4/plugin/security/perm.c 2007-07-29 00:25:35.000728572 +0400
61107@@ -0,0 +1,33 @@
61108+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61109+
61110+/*
61111+ * This file contains implementation of permission plugins.
61112+ * See the comments in perm.h
61113+ */
61114+
61115+#include "../plugin.h"
61116+#include "../plugin_header.h"
61117+#include "../../debug.h"
61118+
61119+perm_plugin perm_plugins[LAST_PERM_ID] = {
61120+ [NULL_PERM_ID] = {
61121+ .h = {
61122+ .type_id = REISER4_PERM_PLUGIN_TYPE,
61123+ .id = NULL_PERM_ID,
61124+ .pops = NULL,
61125+ .label = "null",
61126+ .desc = "stub permission plugin",
61127+ .linkage = {NULL, NULL}
61128+ }
61129+ }
61130+};
61131+
61132+/*
61133+ * Local variables:
61134+ * c-indentation-style: "K&R"
61135+ * mode-name: "LC"
61136+ * c-basic-offset: 8
61137+ * tab-width: 8
61138+ * fill-column: 79
61139+ * End:
61140+ */
61141diff -urN linux-2.6.22.orig/fs/reiser4/plugin/security/perm.h linux-2.6.22/fs/reiser4/plugin/security/perm.h
61142--- linux-2.6.22.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
61143+++ linux-2.6.22/fs/reiser4/plugin/security/perm.h 2007-07-29 00:25:35.000728572 +0400
61144@@ -0,0 +1,38 @@
61145+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61146+
61147+/* Perm (short for "permissions") plugins common stuff. */
61148+
61149+#if !defined( __REISER4_PERM_H__ )
61150+#define __REISER4_PERM_H__
61151+
61152+#include "../../forward.h"
61153+#include "../plugin_header.h"
61154+
61155+#include <linux/types.h>
44254afd 61156+
61157+/* Definition of permission plugin */
61158+/* NIKITA-FIXME-HANS: define what this is targeted for.
61159+ It does not seem to be intended for use with sys_reiser4. Explain. */
61160+
61161+/* NOTE-EDWARD: This seems to be intended for the deprecated sys_reiser4.
61162+ Consider it a temporary "seam" and a reserved pset member.
61163+ If you have something useful to add, then rename this plugin and add it here */
61164+typedef struct perm_plugin {
61165+ /* generic plugin fields */
61166+ plugin_header h;
61167+} perm_plugin;
61168+
61169+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
61170+
61171+/* __REISER4_PERM_H__ */
61172+#endif
61173+
61174+/* Make Linus happy.
61175+ Local variables:
61176+ c-indentation-style: "K&R"
61177+ mode-name: "LC"
61178+ c-basic-offset: 8
61179+ tab-width: 8
61180+ fill-column: 120
61181+ End:
61182+*/
61183diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.22/fs/reiser4/plugin/space/bitmap.c
61184--- linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 03:00:00.000000000 +0300
61185+++ linux-2.6.22/fs/reiser4/plugin/space/bitmap.c 2007-07-29 00:25:35.004729608 +0400
61186@@ -0,0 +1,1585 @@
61187+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61188+
61189+#include "../../debug.h"
61190+#include "../../dformat.h"
61191+#include "../../txnmgr.h"
61192+#include "../../jnode.h"
61193+#include "../../block_alloc.h"
61194+#include "../../tree.h"
61195+#include "../../super.h"
61196+#include "../plugin.h"
61197+#include "space_allocator.h"
61198+#include "bitmap.h"
61199+
61200+#include <linux/types.h>
61201+#include <linux/fs.h> /* for struct super_block */
61202+#include <linux/mutex.h>
61203+#include <asm/div64.h>
61204+
61205+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
61206+ * blocks
61207+
61208+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap
61209+ blocks loading/unloading which is different from v3.x where all bitmap
61210+ blocks are loaded at mount time.
61211+
61212+ To implement bitmap blocks unloading we need to count bitmap block usage
61213+ and detect currently unused blocks allowing them to be unloaded. It is not
61214+ a simple task since we allow several threads to modify one bitmap block
61215+ simultaneously.
61216+
61217+ Briefly speaking, the following scheme is proposed: we keep a counter in a
61218+ special variable associated with each bitmap block, counting block
61219+ alloc/dealloc operations on that bitmap block. With the deferred block
61220+ deallocation feature of reiser4, all those operations will be represented in
61221+ atom dirty/deleted lists as jnodes for freshly allocated or deleted
61222+ nodes.
61223+
61224+ So, we increment usage counter for each new node allocated or deleted, and
61225+ decrement it at atom commit one time for each node from the dirty/deleted
61226+ atom's list. Of course, deletion of freshly allocated nodes, and node reuse
61227+ from the atom's deleted list (if we do that), should also decrement the
61228+ bitmap usage counter.
61229+
61230+ This scheme seems to be workable, but such reference counting is
61231+ not easy to debug. I think we should agree with Hans and not implement
61232+ it in v4.0. The current code implements "on-demand" bitmap block loading only.
61233+
61234+ For simplicity, all bitmap nodes (both commit and working bitmap blocks) are
61235+ either loaded into memory at fs mount time, or each bitmap node is loaded at
61236+ the first access to it; the "dont_load_bitmap" mount option controls whether
61237+ bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
61238+ nodes is currently not supported. */
61239+
61240+#define CHECKSUM_SIZE 4
61241+
61242+#define BYTES_PER_LONG (sizeof(long))
61243+
61244+#if BITS_PER_LONG == 64
61245+# define LONG_INT_SHIFT (6)
61246+#else
61247+# define LONG_INT_SHIFT (5)
61248+#endif
61249+
61250+#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
61251+
61252+typedef unsigned long ulong_t;
61253+
61254+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
61255+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
61256+
61257+/* Block allocation/deallocation are done through special bitmap objects which
61258+ are allocated in an array at fs mount. */
61259+struct bitmap_node {
61260+ struct mutex mutex; /* long term lock object */
61261+
61262+ jnode *wjnode; /* j-nodes for WORKING ... */
61263+ jnode *cjnode; /* ... and COMMIT bitmap blocks */
61264+
61265+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */
61266+
61267+ atomic_t loaded; /* a flag which shows that bnode is loaded
61268+ * already */
61269+};
61270+
61271+static inline char *bnode_working_data(struct bitmap_node *bnode)
61272+{
61273+ char *data;
61274+
61275+ data = jdata(bnode->wjnode);
61276+ assert("zam-429", data != NULL);
61277+
61278+ return data + CHECKSUM_SIZE;
61279+}
61280+
61281+static inline char *bnode_commit_data(const struct bitmap_node *bnode)
61282+{
61283+ char *data;
61284+
61285+ data = jdata(bnode->cjnode);
61286+ assert("zam-430", data != NULL);
61287+
61288+ return data + CHECKSUM_SIZE;
61289+}
61290+
61291+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
61292+{
61293+ char *data;
61294+
61295+ data = jdata(bnode->cjnode);
61296+ assert("vpf-261", data != NULL);
61297+
61298+ return le32_to_cpu(get_unaligned((d32 *)data));
61299+}
61300+
61301+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
61302+{
61303+ char *data;
61304+
61305+ data = jdata(bnode->cjnode);
61306+ assert("vpf-261", data != NULL);
61307+
61308+ put_unaligned(cpu_to_le32(crc), (d32 *)data);
61309+}
61310+
61311+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
61312+ * written the code, does this added abstraction still have */
61313+/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
61314+ * reiser4_space_allocator structure) */
61315+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
61316+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
61317+ * someday?". What they about? If there is a reason to have a union, it should
61318+ * be a union, if not, it should not be a union. "..might be someday" means no
61319+ * reason. */
61320+struct bitmap_allocator_data {
61321+ /* an array for bitmap blocks direct access */
61322+ struct bitmap_node *bitmap;
61323+};
61324+
61325+#define get_barray(super) \
61326+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
61327+
61328+#define get_bnode(super, i) (get_barray(super) + i)
61329+
61330+/* allocate and initialize jnode with JNODE_BITMAP type */
61331+static jnode *bnew(void)
61332+{
61333+ jnode *jal = jalloc();
61334+
61335+ if (jal)
61336+ jnode_init(jal, current_tree, JNODE_BITMAP);
61337+
61338+ return jal;
61339+}
61340+
61341+/* this file contains:
61342+ - bitmap based implementation of space allocation plugin
61343+ - all the helper functions like set bit, find_first_zero_bit, etc */
61344+
61345+/* Audited by: green(2002.06.12) */
61346+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
61347+{
61348+ ulong_t mask = 1UL << start_bit;
61349+ int i = start_bit;
61350+
61351+ while ((word & mask) != 0) {
61352+ mask <<= 1;
61353+ if (++i >= BITS_PER_LONG)
61354+ break;
61355+ }
61356+
61357+ return i;
61358+}
61359+
61360+#include <asm/bitops.h>
61361+
61362+#if BITS_PER_LONG == 64
61363+
61364+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
61365+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
61366+
61367+static inline void reiser4_set_bit(int nr, void *addr)
61368+{
61369+ ext2_set_bit(nr + OFF(addr), BASE(addr));
61370+}
61371+
61372+static inline void reiser4_clear_bit(int nr, void *addr)
61373+{
61374+ ext2_clear_bit(nr + OFF(addr), BASE(addr));
61375+}
61376+
61377+static inline int reiser4_test_bit(int nr, void *addr)
61378+{
61379+ return ext2_test_bit(nr + OFF(addr), BASE(addr));
61380+}
61381+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
61382+ int offset)
61383+{
61384+ int off = OFF(addr);
61385+
61386+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
61387+ offset + off) - off;
61388+}
61389+
61390+#else
61391+
61392+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
61393+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
61394+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
61395+
61396+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
61397+ext2_find_next_zero_bit(addr, maxoffset, offset)
61398+#endif
61399+
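The 64-bit wrappers above exist because the ext2_* bit operations expect a
long-aligned base. OFF() is the bit offset of @addr inside its containing
long and BASE() rounds the pointer down to that long. A quick check of the
arithmetic with a made-up address (the pointer is only printed, never
dereferenced):

#include <stdio.h>

#define BYTES_PER_LONG (sizeof(long))
#define OFF(addr) (((unsigned long)(addr) & (BYTES_PER_LONG - 1)) << 3)
#define BASE(addr) ((unsigned long *)((unsigned long)(addr) & ~(BYTES_PER_LONG - 1)))

int main(void)
{
	char *addr = (char *)0x1003;	/* 3 bytes past an 8-byte boundary */

	printf("off=%lu base=%p\n", OFF(addr), (void *)BASE(addr));
	/* on a 64-bit machine: off=24 base=0x1000 */
	return 0;
}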
61400+/* Search for a set bit in the bit array [@start_offset, @max_offset); offsets
61401+ * are counted from @addr. Return the offset of the first set bit if one is
61402+ * found, @max_offset otherwise. */
61403+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61404+ bmap_off_t start_offset)
61405+{
61406+ ulong_t *base = addr;
61407+ /* start_offset is in bits; convert it to a word index within the bitmap. */
61408+ int word_nr = start_offset >> LONG_INT_SHIFT;
61409+ /* bit number within the word. */
61410+ int bit_nr = start_offset & LONG_INT_MASK;
61411+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
61412+
61413+ assert("zam-387", max_offset != 0);
61414+
61415+ /* Unaligned @start_offset case. */
61416+ if (bit_nr != 0) {
61417+ bmap_nr_t nr;
61418+
61419+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
61420+
61421+ if (nr < BITS_PER_LONG)
61422+ return (word_nr << LONG_INT_SHIFT) + nr;
61423+
61424+ ++word_nr;
61425+ }
61426+
61427+ /* Fast scan through aligned words. */
61428+ while (word_nr <= max_word_nr) {
61429+ if (base[word_nr] != 0) {
61430+ return (word_nr << LONG_INT_SHIFT)
61431+ + find_next_zero_bit_in_word(~(base[word_nr]), 0);
61432+ }
61433+
61434+ ++word_nr;
61435+ }
61436+
61437+ return max_offset;
61438+}
61439+
61440+#if BITS_PER_LONG == 64
61441+
61442+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
61443+ bmap_off_t start_offset)
61444+{
61445+ bmap_off_t off = OFF(addr);
61446+
61447+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
61448+ start_offset + off) - off;
61449+}
61450+
61451+#else
61452+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
61453+ __reiser4_find_next_set_bit(addr, max_offset, start_offset)
61454+#endif
61455+
61456+/* search a single word downward from @start_bit for a set bit. */
61457+static int find_last_set_bit_in_word(ulong_t word, int start_bit)
61458+{
61459+ ulong_t bit_mask;
61460+ int nr = start_bit;
61461+
61462+ assert("zam-965", start_bit < BITS_PER_LONG);
61463+ assert("zam-966", start_bit >= 0);
61464+
61465+ bit_mask = (1UL << nr);
61466+
61467+ while (bit_mask != 0) {
61468+ if (bit_mask & word)
61469+ return nr;
61470+ bit_mask >>= 1;
61471+ nr--;
61472+ }
61473+ return BITS_PER_LONG;
61474+}
61475+
61476+/* Search bitmap for a set bit in backward direction from the end to the
61477+ * beginning of given region
61478+ *
61479+ * @result: result offset of the last set bit
61480+ * @addr: base memory address,
61481+ * @low_off: low end of the search region, edge bit included into the region,
61482+ * @high_off: high end of the search region, edge bit included into the region,
61483+ *
61484+ * @return: 0 - set bit was found, -1 otherwise.
61485+ */
61486+static int
61487+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61488+ bmap_off_t high_off)
61489+{
61490+ ulong_t *base = addr;
61491+ int last_word;
61492+ int first_word;
61493+ int last_bit;
61494+ int nr;
61495+
61496+ assert("zam-962", high_off >= low_off);
61497+
61498+ last_word = high_off >> LONG_INT_SHIFT;
61499+ last_bit = high_off & LONG_INT_MASK;
61500+ first_word = low_off >> LONG_INT_SHIFT;
61501+
61502+ if (last_bit < BITS_PER_LONG) {
61503+ nr = find_last_set_bit_in_word(base[last_word], last_bit);
61504+ if (nr < BITS_PER_LONG) {
61505+ *result = (last_word << LONG_INT_SHIFT) + nr;
61506+ return 0;
61507+ }
61508+ --last_word;
61509+ }
61510+ while (last_word >= first_word) {
61511+ if (base[last_word] != 0x0) {
61512+ last_bit =
61513+ find_last_set_bit_in_word(base[last_word],
61514+ BITS_PER_LONG - 1);
61515+ assert("zam-972", last_bit < BITS_PER_LONG);
61516+ *result = (last_word << LONG_INT_SHIFT) + last_bit;
61517+ return 0;
61518+ }
61519+ --last_word;
61520+ }
61521+
61522+ return -1; /* set bit not found */
61523+}
61524+
61525+/* Search bitmap for a clear bit in backward direction from the end to the
61526+ * beginning of given region */
61527+static int
61528+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
61529+ bmap_off_t high_off)
61530+{
61531+ ulong_t *base = addr;
61532+ int last_word;
61533+ int first_word;
61534+ int last_bit;
61535+ int nr;
61536+
61537+ last_word = high_off >> LONG_INT_SHIFT;
61538+ last_bit = high_off & LONG_INT_MASK;
61539+ first_word = low_off >> LONG_INT_SHIFT;
61540+
61541+ if (last_bit < BITS_PER_LONG) {
61542+ nr = find_last_set_bit_in_word(~base[last_word], last_bit);
61543+ if (nr < BITS_PER_LONG) {
61544+ *result = (last_word << LONG_INT_SHIFT) + nr;
61545+ return 0;
61546+ }
61547+ --last_word;
61548+ }
61549+ while (last_word >= first_word) {
61550+ if (base[last_word] != (ulong_t) (-1)) {
61551+ *result = (last_word << LONG_INT_SHIFT) +
61552+ find_last_set_bit_in_word(~base[last_word],
61553+ BITS_PER_LONG - 1);
61554+ return 0;
61555+ }
61556+ --last_word;
61557+ }
61558+
61559+ return -1; /* zero bit not found */
61560+}
61561+
61562+/* Audited by: green(2002.06.12) */
61563+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
61564+{
61565+ int first_byte;
61566+ int last_byte;
61567+
61568+ unsigned char first_byte_mask = 0xFF;
61569+ unsigned char last_byte_mask = 0xFF;
61570+
61571+ assert("zam-410", start < end);
61572+
61573+ first_byte = start >> 3;
61574+ last_byte = (end - 1) >> 3;
61575+
61576+ if (last_byte > first_byte + 1)
61577+ memset(addr + first_byte + 1, 0,
61578+ (size_t) (last_byte - first_byte - 1));
61579+
61580+ first_byte_mask >>= 8 - (start & 0x7);
61581+ last_byte_mask <<= ((end - 1) & 0x7) + 1;
61582+
61583+ if (first_byte == last_byte) {
61584+ addr[first_byte] &= (first_byte_mask | last_byte_mask);
61585+ } else {
61586+ addr[first_byte] &= first_byte_mask;
61587+ addr[last_byte] &= last_byte_mask;
61588+ }
61589+}
61590+
61591+/* Audited by: green(2002.06.12) */
61592+/* ZAM-FIXME-HANS: comment this */
61593+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
61594+{
61595+ int first_byte;
61596+ int last_byte;
61597+
61598+ unsigned char first_byte_mask = 0xFF;
61599+ unsigned char last_byte_mask = 0xFF;
61600+
61601+ assert("zam-386", start < end);
61602+
61603+ first_byte = start >> 3;
61604+ last_byte = (end - 1) >> 3;
61605+
61606+ if (last_byte > first_byte + 1)
61607+ memset(addr + first_byte + 1, 0xFF,
61608+ (size_t) (last_byte - first_byte - 1));
61609+
61610+ first_byte_mask <<= start & 0x7;
61611+ last_byte_mask >>= 7 - ((end - 1) & 0x7);
61612+
61613+ if (first_byte == last_byte) {
61614+ addr[first_byte] |= (first_byte_mask & last_byte_mask);
61615+ } else {
61616+ addr[first_byte] |= first_byte_mask;
61617+ addr[last_byte] |= last_byte_mask;
61618+ }
61619+}
61620+
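Answering the FIXME above in prose: both helpers treat [@start, @end) as a
byte-granular region, memset() the whole bytes strictly inside it, and patch
the partial first and last bytes with masks. A worked instance of
reiser4_set_bits() assuming start = 3 and end = 11:

/* first_byte = 3 >> 3 = 0, last_byte = (11 - 1) >> 3 = 1, no full middle byte
 * first_byte_mask = 0xFF << (3 & 0x7)        = 0xF8  (bits 3..7 of byte 0)
 * last_byte_mask  = 0xFF >> (7 - (10 & 0x7)) = 0x07  (bits 0..2 of byte 1)
 * addr[0] |= 0xF8; addr[1] |= 0x07;  i.e. exactly bits 3..10 become set
 */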
61621+#define ADLER_BASE 65521
61622+#define ADLER_NMAX 5552
61623+
61624+/* Calculates the adler32 checksum for the data pointed by `data` of the
61625+ length `len`. This function was originally taken from zlib, version 1.1.3,
61626+ July 9th, 1998.
61627+
61628+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
61629+
61630+ This software is provided 'as-is', without any express or implied
61631+ warranty. In no event will the authors be held liable for any damages
61632+ arising from the use of this software.
61633+
61634+ Permission is granted to anyone to use this software for any purpose,
61635+ including commercial applications, and to alter it and redistribute it
61636+ freely, subject to the following restrictions:
61637+
61638+ 1. The origin of this software must not be misrepresented; you must not
61639+ claim that you wrote the original software. If you use this software
61640+ in a product, an acknowledgment in the product documentation would be
61641+ appreciated but is not required.
61642+ 2. Altered source versions must be plainly marked as such, and must not be
61643+ misrepresented as being the original software.
61644+ 3. This notice may not be removed or altered from any source distribution.
61645+
61646+ Jean-loup Gailly Mark Adler
61647+ jloup@gzip.org madler@alumni.caltech.edu
61648+
61649+ The above comment applies only to the reiser4_adler32 function.
61650+*/
61651+
61652+__u32 reiser4_adler32(char *data, __u32 len)
61653+{
61654+ unsigned char *t = data;
61655+ __u32 s1 = 1;
61656+ __u32 s2 = 0;
61657+ int k;
61658+
61659+ while (len > 0) {
61660+ k = len < ADLER_NMAX ? len : ADLER_NMAX;
61661+ len -= k;
61662+
61663+ while (k--) {
61664+ s1 += *t++;
61665+ s2 += s1;
61666+ }
61667+
61668+ s1 %= ADLER_BASE;
61669+ s2 %= ADLER_BASE;
61670+ }
61671+ return (s2 << 16) | s1;
61672+}
61673+
61674+#define sb_by_bnode(bnode) \
61675+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
61676+
61677+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
61678+{
61679+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
61680+}
61681+
61682+static int
61683+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
61684+{
61685+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
61686+ bmap_nr_t bmap;
61687+
61688+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
61689+
61690+ warning("vpf-263",
61691+ "Checksum for the bitmap block %llu is incorrect",
61692+ bmap);
61693+
61694+ return RETERR(-EIO);
61695+ }
61696+
61697+ return 0;
61698+}
61699+
61700+#define REISER4_CHECK_BMAP_CRC (0)
61701+
61702+#if REISER4_CHECK_BMAP_CRC
61703+static int bnode_check_crc(const struct bitmap_node *bnode)
61704+{
61705+ return bnode_check_adler32(bnode,
61706+ bmap_size(sb_by_bnode(bnode)->s_blocksize));
61707+}
61708+
61709+/* REISER4_CHECK_BMAP_CRC */
61710+#else
61711+
61712+#define bnode_check_crc(bnode) (0)
61713+
61714+/* REISER4_CHECK_BMAP_CRC */
61715+#endif
61716+
61717+/* Recalculates the adler32 checksum for a single-byte change.
61718+ adler - previous adler checksum
61719+ old_data, data - old and new byte values.
61720+ tail == (chunk - offset): the length the checksum was calculated for, minus
61721+ the offset of the changed byte within this chunk.
61722+ This function can be used to optimise checksum calculation.
61723+*/
61724+
61725+static __u32
61726+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
61727+ __u32 tail)
61728+{
61729+ __u32 delta = data - old_data + 2 * ADLER_BASE;
61730+ __u32 s1 = adler & 0xffff;
61731+ __u32 s2 = (adler >> 16) & 0xffff;
61732+
61733+ s1 = (delta + s1) % ADLER_BASE;
61734+ s2 = (delta * tail + s2) % ADLER_BASE;
61735+
61736+ return (s2 << 16) | s1;
61737+}
61738+
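Because s1 is a plain byte sum mod 65521 and s2 weights each byte by its
distance from the end, changing one byte by delta shifts s1 by delta and s2
by tail * delta, which is all adler32_recalc() applies. A userspace self-check
of that identity (simplified full checksum, small buffer):

#include <assert.h>

#define ADLER_BASE 65521

static unsigned adler32(const unsigned char *d, unsigned len)
{
	unsigned s1 = 1, s2 = 0, i;

	for (i = 0; i < len; i++) {
		s1 = (s1 + d[i]) % ADLER_BASE;
		s2 = (s2 + s1) % ADLER_BASE;
	}
	return (s2 << 16) | s1;
}

static unsigned recalc(unsigned adler, unsigned char old, unsigned char new,
		       unsigned tail)
{
	unsigned delta = new - old + 2 * ADLER_BASE;
	unsigned s1 = adler & 0xffff;
	unsigned s2 = (adler >> 16) & 0xffff;

	s1 = (delta + s1) % ADLER_BASE;
	s2 = (delta * tail + s2) % ADLER_BASE;
	return (s2 << 16) | s1;
}

int main(void)
{
	unsigned char buf[16] = "reiser4 bitmaps";
	unsigned before = adler32(buf, sizeof(buf));

	buf[6] = 'X';	/* single-byte change at offset 6, tail = 16 - 6 */
	assert(recalc(before, '4', 'X', sizeof(buf) - 6) ==
	       adler32(buf, sizeof(buf)));
	return 0;
}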
61739+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
61740+
61741+/**
61742+ * get_nr_bmap - calculate number of bitmap blocks
61743+ * @super: super block with initialized blocksize and block count
61744+ *
61745+ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps to
61746+ * maintain free disk space. It assumes that each bitmap addresses the same
61747+ * number of blocks, which is calculated by the bmap_bit_count macro defined
61748+ * above. The number of blocks in the filesystem has to be initialized in the
61749+ * reiser4 private data of the super block already so that it can be obtained
61750+ * via reiser4_block_count(). Unfortunately, the number of blocks addressed by
61751+ * a bitmap is not a power of 2 because 4 bytes are used for the checksum.
61752+ * Therefore, we have to use a special function to divide and modulo 64-bit
61753+ * filesystem block counters.
61754+ *
61755+ * Example: suppose the filesystem has 32768 blocks, blocksize 4096. Each bitmap
61756+ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
61757+ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
61758+ */
61759+static bmap_nr_t get_nr_bmap(const struct super_block *super)
61760+{
61761+ u64 quotient;
61762+
61763+ assert("zam-393", reiser4_block_count(super) != 0);
61764+
61765+ quotient = reiser4_block_count(super) - 1;
61766+ do_div(quotient, bmap_bit_count(super->s_blocksize));
61767+ return quotient + 1;
61768+}
61769+
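A quick check of the worked example in the comment above, using plain 64-bit
division in place of the kernel's do_div():

#include <stdio.h>

int main(void)
{
	unsigned long long blocks = 32768, blocksize = 4096;
	unsigned long long bits_per_bmap = (blocksize - 4) * 8;	/* 32736 */

	printf("%llu\n", (blocks - 1) / bits_per_bmap + 1);	/* prints 2 */
	return 0;
}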
61770+/**
61771+ * parse_blocknr - calculate bitmap number and offset in it by block number
61772+ * @block: pointer to block number to calculate location in bitmap of
61773+ * @bmap: pointer where to store bitmap block number
61774+ * @offset: pointer where to store offset within bitmap block
61775+ *
61776+ * Calculates location of bit which is responsible for allocation/freeing of
61777+ * block @*block. That location is represented by bitmap block number and offset
61778+ * within that bitmap block.
61779+ */
61780+static void
61781+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
61782+ bmap_off_t *offset)
61783+{
61784+ struct super_block *super = get_current_context()->super;
61785+ u64 quotient = *block;
61786+
61787+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
61788+ *bmap = quotient;
61789+
61790+ assert("zam-433", *bmap < get_nr_bmap(super));
61791+ assert("", *offset < bmap_bit_count(super->s_blocksize));
61792+}
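
Both get_nr_bmap() and parse_blocknr() reduce to the same 64-by-32 division. A user-space sketch of the arithmetic, with do_div() replaced by plain C division and bmap_bit_count() expanded to its assumed definition (blocksize - 4) * 8, reproducing the worked example above:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const uint64_t bits_per_bmap = (4096 - 4) * 8;  /* 32736 */
            const uint64_t block_count = 32768;
            const uint64_t block = 32740;

            /* get_nr_bmap(): bitmap blocks needed to cover all blocks */
            uint64_t nr_bmap = (block_count - 1) / bits_per_bmap + 1;

            /* parse_blocknr(): which bitmap, and which bit inside it */
            uint64_t bmap = block / bits_per_bmap;
            uint32_t offset = (uint32_t)(block % bits_per_bmap);

            /* prints: 2 bitmaps; block 32740 -> bmap 1, bit 4 */
            printf("%llu bitmaps; block %llu -> bmap %llu, bit %u\n",
                   (unsigned long long)nr_bmap, (unsigned long long)block,
                   (unsigned long long)bmap, offset);
            return 0;
    }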
61793+
61794+#if REISER4_DEBUG
61795+/* Audited by: green(2002.06.12) */
61796+static void
61797+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
61798+{
61799+ struct super_block *sb = reiser4_get_current_sb();
61800+
61801+ assert("zam-436", sb != NULL);
61802+
61803+ assert("zam-455", start != NULL);
61804+ assert("zam-437", *start != 0);
61805+ assert("zam-541", !reiser4_blocknr_is_fake(start));
61806+ assert("zam-441", *start < reiser4_block_count(sb));
61807+
61808+ if (len != NULL) {
61809+ assert("zam-438", *len != 0);
61810+ assert("zam-442", *start + *len <= reiser4_block_count(sb));
61811+ }
61812+}
61813+
61814+static void check_bnode_loaded(const struct bitmap_node *bnode)
61815+{
61816+ assert("zam-485", bnode != NULL);
61817+ assert("zam-483", jnode_page(bnode->wjnode) != NULL);
61818+ assert("zam-484", jnode_page(bnode->cjnode) != NULL);
61819+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
61820+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
61821+}
61822+
61823+#else
61824+
61825+# define check_block_range(start, len) do { /* nothing */} while(0)
61826+# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
61827+
61828+#endif
61829+
61830+/* modify bnode->first_zero_bit (if we free bits before); bnode should be
61831+ spin-locked */
61832+static inline void
61833+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
61834+{
61835+ if (offset < bnode->first_zero_bit)
61836+ bnode->first_zero_bit = offset;
61837+}
61838+
61839+/* return a physical disk address for logical bitmap number @bmap */
61840+/* FIXME-VS: this is somehow related to disk layout? */
61841+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
61842+ * per block allocation so that performance is not affected. Probably this
61843+ * whole file should be considered part of the disk layout plugin, and other
61844+ * disk layouts can use other defines and efficiency will not be significantly
61845+ * affected. */
61846+
61847+#define REISER4_FIRST_BITMAP_BLOCK \
61848+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
61849+
61850+/* Audited by: green(2002.06.12) */
61851+static void
61852+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
61853+ reiser4_block_nr * bnr)
61854+{
61855+
61856+ assert("zam-390", bmap < get_nr_bmap(super));
61857+
61858+#ifdef CONFIG_REISER4_BADBLOCKS
61859+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
61860+ /* Check if the diskmap has this already, first. */
61861+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
61862+ return; /* Found it in diskmap */
61863+#endif
61864+ /* FIXME_ZAM: until disk layouts and disk format plugins are
61865+ discussed, I implement a bitmap location scheme which is close to
61866+ the scheme used in reiser 3.6 */
61867+ if (bmap == 0) {
61868+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
61869+ } else {
61870+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
61871+ }
61872+}
61873+
61874+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
61875+/* Audited by: green(2002.06.12) */
61876+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
61877+{
61878+ *bnr =
61879+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
61880+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
61881+}
61882+
61883+/* bnode structure initialization */
61884+static void
61885+init_bnode(struct bitmap_node *bnode,
61886+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
61887+{
61888+ memset(bnode, 0, sizeof(struct bitmap_node));
61889+
61890+ mutex_init(&bnode->mutex);
61891+ atomic_set(&bnode->loaded, 0);
61892+}
61893+
61894+static void release(jnode * node)
61895+{
61896+ jrelse(node);
61897+ JF_SET(node, JNODE_HEARD_BANSHEE);
61898+ jput(node);
61899+}
61900+
61901+/* This function is for internal bitmap.c use because it assumes that the
61902+ jnode is under full control of this thread */
61903+static void done_bnode(struct bitmap_node *bnode)
61904+{
61905+ if (bnode) {
61906+ atomic_set(&bnode->loaded, 0);
61907+ if (bnode->wjnode != NULL)
61908+ release(bnode->wjnode);
61909+ if (bnode->cjnode != NULL)
61910+ release(bnode->cjnode);
61911+ bnode->wjnode = bnode->cjnode = NULL;
61912+ }
61913+}
61914+
61915+/* Allocate and initialize jnodes for @bnode's commit and working bitmap blocks. Called only by load_and_lock_bnode(). */
61916+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
61917+ jnode **wjnode_ret)
61918+{
61919+ struct super_block *super;
61920+ jnode *cjnode;
61921+ jnode *wjnode;
61922+ bmap_nr_t bmap;
61923+ int ret;
61924+
61925+ super = reiser4_get_current_sb();
61926+
61927+ *wjnode_ret = wjnode = bnew();
61928+ if (wjnode == NULL) {
61929+ *cjnode_ret = NULL;
61930+ return RETERR(-ENOMEM);
61931+ }
61932+
61933+ *cjnode_ret = cjnode = bnew();
61934+ if (cjnode == NULL)
61935+ return RETERR(-ENOMEM);
61936+
61937+ bmap = bnode - get_bnode(super, 0);
61938+
61939+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
61940+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
61941+
61942+ jref(cjnode);
61943+ jref(wjnode);
61944+
61945+ /* load commit bitmap */
61946+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
61947+
61948+ if (ret)
61949+ goto error;
61950+
61951+ /* allocate memory for working bitmap block. Note that for
61952+ * bitmaps jinit_new() doesn't actually modify node content,
61953+ * so parallel calls to this are ok. */
61954+ ret = jinit_new(wjnode, GFP_NOFS);
61955+
61956+ if (ret != 0) {
61957+ jrelse(cjnode);
61958+ goto error;
61959+ }
61960+
61961+ return 0;
61962+
61963+ error:
61964+ jput(cjnode);
61965+ jput(wjnode);
61966+ *wjnode_ret = *cjnode_ret = NULL;
61967+ return ret;
61968+
61969+}
61970+
61971+/* Check the bnode data on read. */
61972+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
61973+{
61974+ void *data;
61975+ int ret;
61976+
61977+ /* Check CRC */
61978+ ret = bnode_check_adler32(bnode, blksize);
61979+
61980+ if (ret) {
61981+ return ret;
61982+ }
61983+
61984+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
61985+
61986+ /* Check the very first bit -- it must be busy. */
61987+ if (!reiser4_test_bit(0, data)) {
61988+ warning("vpf-1362", "The allocator block %llu is not marked "
61989+ "as used.", (unsigned long long)bnode->cjnode->blocknr);
61990+
61991+ return -EINVAL;
61992+ }
61993+
61994+ return 0;
61995+}
61996+
61997+/* load bitmap blocks "on-demand" */
61998+static int load_and_lock_bnode(struct bitmap_node *bnode)
61999+{
62000+ int ret;
62001+
62002+ jnode *cjnode;
62003+ jnode *wjnode;
62004+
62005+ assert("nikita-3040", reiser4_schedulable());
62006+
62007+/* Since bitmaps are never unloaded, this check does not need to be
62008+ * atomic. If bitmaps ever became unloadable, this loaded test would
62009+ * have to be atomic. */
62010+ if (atomic_read(&bnode->loaded)) {
62011+ /* bitmap is already loaded, nothing to do */
62012+ check_bnode_loaded(bnode);
62013+ mutex_lock(&bnode->mutex);
62014+ assert("nikita-2827", atomic_read(&bnode->loaded));
62015+ return 0;
62016+ }
62017+
62018+ ret = prepare_bnode(bnode, &cjnode, &wjnode);
62019+ if (ret == 0) {
62020+ mutex_lock(&bnode->mutex);
62021+
62022+ if (!atomic_read(&bnode->loaded)) {
62023+ assert("nikita-2822", cjnode != NULL);
62024+ assert("nikita-2823", wjnode != NULL);
62025+ assert("nikita-2824", jnode_is_loaded(cjnode));
62026+ assert("nikita-2825", jnode_is_loaded(wjnode));
62027+
62028+ bnode->wjnode = wjnode;
62029+ bnode->cjnode = cjnode;
62030+
62031+ ret = check_struct_bnode(bnode, current_blocksize);
62032+ if (!ret) {
62033+ cjnode = wjnode = NULL;
62034+ atomic_set(&bnode->loaded, 1);
62035+ /* working bitmap is initialized by on-disk
62036+ * commit bitmap. This should be performed
62037+ * under mutex. */
62038+ memcpy(bnode_working_data(bnode),
62039+ bnode_commit_data(bnode),
62040+ bmap_size(current_blocksize));
62041+ } else
62042+ mutex_unlock(&bnode->mutex);
62043+ } else
62044+ /* race: someone already loaded bitmap while we were
62045+ * busy initializing data. */
62046+ check_bnode_loaded(bnode);
62047+ }
62048+
62049+ if (wjnode != NULL) {
62050+ release(wjnode);
62051+ bnode->wjnode = NULL;
62052+ }
62053+ if (cjnode != NULL) {
62054+ release(cjnode);
62055+ bnode->cjnode = NULL;
62056+ }
62057+
62058+ return ret;
62059+}
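
load_and_lock_bnode() is an instance of double-checked loading: the loaded flag is tested before taking the mutex, the potentially redundant jnode preparation is done unlocked, and the flag is re-tested under the mutex so that a racing loader's work is discarded rather than applied twice. A minimal sketch of the same shape (lazy_object and its kzalloc'ed payload are hypothetical stand-ins for bitmap_node and its jnodes):

    #include <linux/atomic.h>
    #include <linux/errno.h>
    #include <linux/mutex.h>
    #include <linux/slab.h>

    struct lazy_object {            /* hypothetical bitmap_node-like item */
            struct mutex lock;
            atomic_t loaded;
            void *payload;
    };

    /* on success returns 0 with obj->lock held, like load_and_lock_bnode() */
    static int load_and_lock(struct lazy_object *obj)
    {
            void *tmp = NULL;

            if (!atomic_read(&obj->loaded)) {
                    /* possibly redundant work, done without the lock held */
                    tmp = kzalloc(4096, GFP_KERNEL);
                    if (tmp == NULL)
                            return -ENOMEM;
            }
            mutex_lock(&obj->lock);
            if (!atomic_read(&obj->loaded) && tmp != NULL) {
                    obj->payload = tmp;     /* publish under the mutex */
                    tmp = NULL;
                    atomic_set(&obj->loaded, 1);
            }
            kfree(tmp);             /* non-NULL only if we lost the race */
            return 0;
    }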
62060+
62061+static void release_and_unlock_bnode(struct bitmap_node *bnode)
62062+{
62063+ check_bnode_loaded(bnode);
62064+ mutex_unlock(&bnode->mutex);
62065+}
62066+
62067+/* This function does all block allocation work but only for one bitmap
62068+ block.*/
62069+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62070+ block responsibility zone boundaries. This made no sense in v3.6 but may
62071+ in v4.x */
62072+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
62073+static int
62074+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62075+ bmap_off_t max_offset, int min_len, int max_len)
62076+{
62077+ struct super_block *super = get_current_context()->super;
62078+ struct bitmap_node *bnode = get_bnode(super, bmap);
62079+
62080+ char *data;
62081+
62082+ bmap_off_t search_end;
62083+ bmap_off_t start;
62084+ bmap_off_t end;
62085+
62086+ int set_first_zero_bit = 0;
62087+
62088+ int ret;
62089+
62090+ assert("zam-364", min_len > 0);
62091+ assert("zam-365", max_len >= min_len);
62092+ assert("zam-366", *offset <= max_offset);
62093+
62094+ ret = load_and_lock_bnode(bnode);
62095+
62096+ if (ret)
62097+ return ret;
62098+
62099+ data = bnode_working_data(bnode);
62100+
62101+ start = *offset;
62102+
62103+ if (bnode->first_zero_bit >= start) {
62104+ start = bnode->first_zero_bit;
62105+ set_first_zero_bit = 1;
62106+ }
62107+
62108+ while (start + min_len < max_offset) {
62109+
62110+ start =
62111+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
62112+ if (set_first_zero_bit) {
62113+ bnode->first_zero_bit = start;
62114+ set_first_zero_bit = 0;
62115+ }
62116+ if (start >= max_offset)
62117+ break;
62118+
62119+ search_end = LIMIT(start + max_len, max_offset);
62120+ end =
62121+ reiser4_find_next_set_bit((long *)data, search_end, start);
62122+ if (end >= start + min_len) {
62123+ /* we can't trust the find_next_set_bit result if a set
62124+ bit was not found: the result may be bigger than
62125+ max_offset */
62126+ if (end > search_end)
62127+ end = search_end;
62128+
62129+ ret = end - start;
62130+ *offset = start;
62131+
62132+ reiser4_set_bits(data, start, end);
62133+
62134+ /* FIXME: we may advance first_zero_bit if [start,
62135+ end] region overlaps the first_zero_bit point */
62136+
62137+ break;
62138+ }
62139+
62140+ start = end + 1;
62141+ }
62142+
62143+ release_and_unlock_bnode(bnode);
62144+
62145+ return ret;
62146+}
62147+
62148+static int
62149+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
62150+ bmap_off_t end_offset, int min_len, int max_len)
62151+{
62152+ struct super_block *super = get_current_context()->super;
62153+ struct bitmap_node *bnode = get_bnode(super, bmap);
62154+ char *data;
62155+ bmap_off_t start;
62156+ int ret;
62157+
62158+ assert("zam-958", min_len > 0);
62159+ assert("zam-959", max_len >= min_len);
62160+ assert("zam-960", *start_offset >= end_offset);
62161+
62162+ ret = load_and_lock_bnode(bnode);
62163+ if (ret)
62164+ return ret;
62165+
62166+ data = bnode_working_data(bnode);
62167+ start = *start_offset;
62168+
62169+ while (1) {
62170+ bmap_off_t end, search_end;
62171+
62172+ /* Find the beginning of the zero filled region */
62173+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
62174+ break;
62175+ /* Is there more than `min_len' bits from `start' to
62176+ * `end_offset'? */
62177+ if (start < end_offset + min_len - 1)
62178+ break;
62179+
62180+ /* Do not search to `end_offset' if we need to find less than
62181+ * `max_len' zero bits. */
62182+ if (end_offset + max_len - 1 < start)
62183+ search_end = start - max_len + 1;
62184+ else
62185+ search_end = end_offset;
62186+
62187+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
62188+ end = search_end;
62189+ else
62190+ end++;
62191+
62192+ if (end + min_len <= start + 1) {
62193+ if (end < search_end)
62194+ end = search_end;
62195+ ret = start - end + 1;
62196+ *start_offset = end; /* `end' is lowest offset */
62197+ assert("zam-987",
62198+ reiser4_find_next_set_bit(data, start + 1,
62199+ end) >= start + 1);
62200+ reiser4_set_bits(data, end, start + 1);
62201+ break;
62202+ }
62203+
62204+ if (end <= end_offset)
62205+ /* left search boundary reached. */
62206+ break;
62207+ start = end - 1;
62208+ }
62209+
62210+ release_and_unlock_bnode(bnode);
62211+ return ret;
62212+}
62213+
62214+/* allocate contiguous range of blocks in bitmap */
62215+static int bitmap_alloc_forward(reiser4_block_nr * start,
62216+ const reiser4_block_nr * end, int min_len,
62217+ int max_len)
62218+{
62219+ bmap_nr_t bmap, end_bmap;
62220+ bmap_off_t offset, end_offset;
62221+ int len;
62222+
62223+ reiser4_block_nr tmp;
62224+
62225+ struct super_block *super = get_current_context()->super;
62226+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62227+
62228+ parse_blocknr(start, &bmap, &offset);
62229+
62230+ tmp = *end - 1;
62231+ parse_blocknr(&tmp, &end_bmap, &end_offset);
62232+ ++end_offset;
62233+
62234+ assert("zam-358", end_bmap >= bmap);
62235+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
62236+
62237+ for (; bmap < end_bmap; bmap++, offset = 0) {
62238+ len =
62239+ search_one_bitmap_forward(bmap, &offset, max_offset,
62240+ min_len, max_len);
62241+ if (len != 0)
62242+ goto out;
62243+ }
62244+
62245+ len =
62246+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
62247+ max_len);
62248+ out:
62249+ *start = bmap * max_offset + offset;
62250+ return len;
62251+}
62252+
62253+/* allocate contiguous range of blocks in bitmap (from @start to @end in
62254+ * backward direction) */
62255+static int bitmap_alloc_backward(reiser4_block_nr * start,
62256+ const reiser4_block_nr * end, int min_len,
62257+ int max_len)
62258+{
62259+ bmap_nr_t bmap, end_bmap;
62260+ bmap_off_t offset, end_offset;
62261+ int len;
62262+ struct super_block *super = get_current_context()->super;
62263+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62264+
62265+ parse_blocknr(start, &bmap, &offset);
62266+ parse_blocknr(end, &end_bmap, &end_offset);
62267+
62268+ assert("zam-961", end_bmap <= bmap);
62269+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
62270+
62271+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
62272+ len =
62273+ search_one_bitmap_backward(bmap, &offset, 0, min_len,
62274+ max_len);
62275+ if (len != 0)
62276+ goto out;
62277+ }
62278+
62279+ len =
62280+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
62281+ max_len);
62282+ out:
62283+ *start = bmap * max_offset + offset;
62284+ return len;
62285+}
62286+
62287+/* plugin->u.space_allocator.alloc_blocks() */
62288+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
62289+ reiser4_block_nr *start, reiser4_block_nr *len)
62290+{
62291+ struct super_block *super = get_current_context()->super;
62292+ int actual_len;
62293+
62294+ reiser4_block_nr search_start;
62295+ reiser4_block_nr search_end;
62296+
62297+ assert("zam-398", super != NULL);
62298+ assert("zam-412", hint != NULL);
62299+ assert("zam-397", hint->blk <= reiser4_block_count(super));
62300+
62301+ if (hint->max_dist == 0)
62302+ search_end = reiser4_block_count(super);
62303+ else
62304+ search_end =
62305+ LIMIT(hint->blk + hint->max_dist,
62306+ reiser4_block_count(super));
62307+
62308+ /* We use @hint->blk as the search start and search from it to the end
62309+ of the disk, or within the given region if @hint->max_dist is not zero */
62310+ search_start = hint->blk;
62311+
62312+ actual_len =
62313+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62314+
62315+ /* There is only one bitmap search if max_dist was specified or first
62316+ pass was from the beginning of the bitmap. We also do one pass for
62317+ scanning bitmap in backward direction. */
62318+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
62319+ /* next step is a scanning from 0 to search_start */
62320+ search_end = search_start;
62321+ search_start = 0;
62322+ actual_len =
62323+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
62324+ }
62325+ if (actual_len == 0)
62326+ return RETERR(-ENOSPC);
62327+ if (actual_len < 0)
62328+ return RETERR(actual_len);
62329+ *len = actual_len;
62330+ *start = search_start;
62331+ return 0;
62332+}
62333+
62334+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
62335+ reiser4_block_nr * start,
62336+ reiser4_block_nr * len)
62337+{
62338+ reiser4_block_nr search_start;
62339+ reiser4_block_nr search_end;
62340+ int actual_len;
62341+
62342+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
62343+
62344+ assert("zam-969", super != NULL);
62345+ assert("zam-970", hint != NULL);
62346+ assert("zam-971", hint->blk <= reiser4_block_count(super));
62347+
62348+ search_start = hint->blk;
62349+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
62350+ search_end = 0;
62351+ else
62352+ search_end = search_start - hint->max_dist;
62353+
62354+ actual_len =
62355+ bitmap_alloc_backward(&search_start, &search_end, 1, needed);
62356+ if (actual_len == 0)
62357+ return RETERR(-ENOSPC);
62358+ if (actual_len < 0)
62359+ return RETERR(actual_len);
62360+ *len = actual_len;
62361+ *start = search_start;
62362+ return 0;
62363+}
62364+
62365+/* plugin->u.space_allocator.alloc_blocks() */
62366+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
62367+ reiser4_blocknr_hint * hint, int needed,
62368+ reiser4_block_nr * start, reiser4_block_nr * len)
62369+{
62370+ if (hint->backward)
62371+ return alloc_blocks_backward(hint, needed, start, len);
62372+ return alloc_blocks_forward(hint, needed, start, len);
62373+}
62374+
62375+/* plugin->u.space_allocator.dealloc_blocks(). */
62376+/* It just frees blocks in the WORKING BITMAP. Usually formatted and
62377+ unformatted node deletion is deferred until transaction commit. However,
62378+ deallocation of temporary objects like wandered blocks and transaction
62379+ commit records requires immediate node deletion from the WORKING BITMAP.*/
62380+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
62381+ reiser4_block_nr start, reiser4_block_nr len)
62382+{
62383+ struct super_block *super = reiser4_get_current_sb();
62384+
62385+ bmap_nr_t bmap;
62386+ bmap_off_t offset;
62387+
62388+ struct bitmap_node *bnode;
62389+ int ret;
62390+
62391+ assert("zam-468", len != 0);
62392+ check_block_range(&start, &len);
62393+
62394+ parse_blocknr(&start, &bmap, &offset);
62395+
62396+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
62397+
62398+ bnode = get_bnode(super, bmap);
62399+
62400+ assert("zam-470", bnode != NULL);
62401+
62402+ ret = load_and_lock_bnode(bnode);
62403+ assert("zam-481", ret == 0);
62404+
62405+ reiser4_clear_bits(bnode_working_data(bnode), offset,
62406+ (bmap_off_t) (offset + len));
62407+
62408+ adjust_first_zero_bit(bnode, offset);
62409+
62410+ release_and_unlock_bnode(bnode);
62411+}
62412+
62413+/* plugin->u.space_allocator.check_blocks(). */
62414+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
62415+ const reiser4_block_nr * len, int desired)
62416+{
62417+#if REISER4_DEBUG
62418+ struct super_block *super = reiser4_get_current_sb();
62419+
62420+ bmap_nr_t bmap;
62421+ bmap_off_t start_offset;
62422+ bmap_off_t end_offset;
62423+
62424+ struct bitmap_node *bnode;
62425+ int ret;
62426+
62427+ assert("zam-622", len != NULL);
62428+ check_block_range(start, len);
62429+ parse_blocknr(start, &bmap, &start_offset);
62430+
62431+ end_offset = start_offset + *len;
62432+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
62433+
62434+ bnode = get_bnode(super, bmap);
62435+
62436+ assert("nikita-2215", bnode != NULL);
62437+
62438+ ret = load_and_lock_bnode(bnode);
62439+ assert("zam-626", ret == 0);
62440+
62441+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
62442+
62443+ if (desired) {
62444+ assert("zam-623",
62445+ reiser4_find_next_zero_bit(bnode_working_data(bnode),
62446+ end_offset, start_offset)
62447+ >= end_offset);
62448+ } else {
62449+ assert("zam-624",
62450+ reiser4_find_next_set_bit(bnode_working_data(bnode),
62451+ end_offset, start_offset)
62452+ >= end_offset);
62453+ }
62454+
62455+ release_and_unlock_bnode(bnode);
62456+#endif
62457+}
62458+
62459+/* conditional insertion of @node into atom's overwrite set if it was not there */
62460+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
62461+{
62462+ assert("zam-546", atom != NULL);
62463+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
62464+ assert("zam-548", node != NULL);
62465+
62466+ spin_lock_atom(atom);
62467+ spin_lock_jnode(node);
62468+
62469+ if (node->atom == NULL) {
62470+ JF_SET(node, JNODE_OVRWR);
62471+ insert_into_atom_ovrwr_list(atom, node);
62472+ } else {
62473+ assert("zam-549", node->atom == atom);
62474+ }
62475+
62476+ spin_unlock_jnode(node);
62477+ spin_unlock_atom(atom);
62478+}
62479+
62480+/* an actor which applies delete set to COMMIT bitmap pages and link modified
62481+ pages in a single-linked list */
62482+static int
62483+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
62484+ const reiser4_block_nr * len, void *data)
62485+{
62486+
62487+ bmap_nr_t bmap;
62488+ bmap_off_t offset;
62489+ int ret;
62490+
62491+ long long *blocks_freed_p = data;
62492+
62493+ struct bitmap_node *bnode;
62494+
62495+ struct super_block *sb = reiser4_get_current_sb();
62496+
62497+ check_block_range(start, len);
62498+
62499+ parse_blocknr(start, &bmap, &offset);
62500+
62501+ /* FIXME-ZAM: we assume that all block ranges are allocated by this
62502+ bitmap-based allocator and each block range can't go over a zone of
62503+ responsibility of one bitmap block; same assumption is used in
62504+ other journal hooks in bitmap code. */
62505+ bnode = get_bnode(sb, bmap);
62506+ assert("zam-448", bnode != NULL);
62507+
62508+ /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
62509+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
62510+ ret = load_and_lock_bnode(bnode);
62511+ if (ret)
62512+ return ret;
62513+
62514+ /* put bnode into atom's overwrite set */
62515+ cond_add_to_overwrite_set(atom, bnode->cjnode);
62516+
62517+ data = bnode_commit_data(bnode);
62518+
62519+ ret = bnode_check_crc(bnode);
62520+ if (ret != 0) {
62521+ release_and_unlock_bnode(bnode); return ret; }
62522+
62523+ if (len != NULL) {
62524+ /* FIXME-ZAM: a check that all bits are set should be there */
62525+ assert("zam-443",
62526+ offset + *len <= bmap_bit_count(sb->s_blocksize));
62527+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
62528+
62529+ (*blocks_freed_p) += *len;
62530+ } else {
62531+ reiser4_clear_bit(offset, data);
62532+ (*blocks_freed_p)++;
62533+ }
62534+
62535+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
62536+
62537+ release_and_unlock_bnode(bnode);
62538+
62539+ return 0;
62540+}
62541+
62542+/* plugin->u.space_allocator.pre_commit_hook(). */
62543+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
62544+ rest is done by transaction manager (allocate wandered locations for COMMIT
62545+ BITMAP blocks, copy COMMIT BITMAP blocks data). */
62546+/* Only one instance of this function can be running at one given time, because
62547+ only one transaction can be committed a time, therefore it is safe to access
62548+ some global variables without any locking */
62549+
62550+int reiser4_pre_commit_hook_bitmap(void)
62551+{
62552+ struct super_block *super = reiser4_get_current_sb();
62553+ txn_atom *atom;
62554+
62555+ long long blocks_freed = 0;
62556+
62557+ atom = get_current_atom_locked();
62558+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
62559+ spin_unlock_atom(atom);
62560+
62561+ { /* scan atom's captured list and find all freshly allocated nodes,
62562+ * mark the corresponding bits in the COMMIT BITMAP as used */
62563+ struct list_head *head = ATOM_CLEAN_LIST(atom);
62564+ jnode *node = list_entry(head->next, jnode, capture_link);
62565+
62566+ while (head != &node->capture_link) {
62567+ /* we detect freshly allocated jnodes */
62568+ if (JF_ISSET(node, JNODE_RELOC)) {
62569+ int ret;
62570+ bmap_nr_t bmap;
62571+
62572+ bmap_off_t offset;
62573+ bmap_off_t index;
62574+ struct bitmap_node *bn;
62575+ __u32 size = bmap_size(super->s_blocksize);
62576+ __u32 crc;
62577+ char byte;
62578+
62579+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
62580+ assert("zam-460",
62581+ !reiser4_blocknr_is_fake(&node->blocknr));
62582+
62583+ parse_blocknr(&node->blocknr, &bmap, &offset);
62584+ bn = get_bnode(super, bmap);
62585+
62586+ index = offset >> 3;
62587+ assert("vpf-276", index < size);
62588+
62589+ ret = bnode_check_crc(bn);
62590+ if (ret != 0)
62591+ return ret;
62592+
62593+ check_bnode_loaded(bn);
62594+ load_and_lock_bnode(bn);
62595+
62596+ byte = *(bnode_commit_data(bn) + index);
62597+ reiser4_set_bit(offset, bnode_commit_data(bn));
62598+
62599+ crc = adler32_recalc(bnode_commit_crc(bn), byte,
62600+ *(bnode_commit_data(bn) +
62601+ index),
62602+ size - index);
62603+ bnode_set_commit_crc(bn, crc);
62604+
62605+ release_and_unlock_bnode(bn);
62606+
62607+ ret = bnode_check_crc(bn);
62608+ if (ret != 0)
62609+ return ret;
62610+
62611+ /* the correctness of this depends on where the new
62612+ jnode is inserted into the clean list, because we
62613+ are scanning that same list now. It is OK if
62614+ insertion is done at the list front */
62615+ cond_add_to_overwrite_set(atom, bn->cjnode);
62616+ }
62617+
62618+ node = list_entry(node->capture_link.next, jnode, capture_link);
62619+ }
62620+ }
62621+
62622+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
62623+ &blocks_freed, 0);
62624+
62625+ blocks_freed -= atom->nr_blocks_allocated;
62626+
62627+ {
62628+ reiser4_super_info_data *sbinfo;
62629+
62630+ sbinfo = get_super_private(super);
62631+
62632+ spin_lock_reiser4_super(sbinfo);
62633+ sbinfo->blocks_free_committed += blocks_freed;
62634+ spin_unlock_reiser4_super(sbinfo);
62635+ }
62636+
62637+ return 0;
62638+}
62639+
62640+/* plugin->u.space_allocator.init_allocator
62641+ constructor of reiser4_space_allocator object. It is called on fs mount */
62642+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
62643+ struct super_block *super, void *arg)
62644+{
62645+ struct bitmap_allocator_data *data = NULL;
62646+ bmap_nr_t bitmap_blocks_nr;
62647+ bmap_nr_t i;
62648+
62649+ assert("nikita-3039", reiser4_schedulable());
62650+
62651+ /* getting memory for bitmap allocator private data holder */
62652+ data =
62653+ kmalloc(sizeof(struct bitmap_allocator_data),
62654+ reiser4_ctx_gfp_mask_get());
62655+
62656+ if (data == NULL)
62657+ return RETERR(-ENOMEM);
62658+
62659+ /* allocation and initialization for the array of bnodes */
62660+ bitmap_blocks_nr = get_nr_bmap(super);
62661+
62662+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
62663+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
62664+ may I never meet someone who still uses the ia32 architecture when
62665+ storage devices of that size enter the market, and wants to use ia32
62666+ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
62667+ probably, another dynamic data structure should replace a static
62668+ array of bnodes. */
62669+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
62670+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
62671+ if (data->bitmap == NULL) {
62672+ kfree(data);
62673+ return RETERR(-ENOMEM);
62674+ }
62675+
62676+ for (i = 0; i < bitmap_blocks_nr; i++)
62677+ init_bnode(data->bitmap + i, super, i);
62678+
62679+ allocator->u.generic = data;
62680+
62681+#if REISER4_DEBUG
62682+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
62683+#endif
62684+
62685+ /* Load all bitmap blocks at mount time. */
62686+ if (!test_bit
62687+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
62688+ __u64 start_time, elapsed_time;
62689+ struct bitmap_node *bnode;
62690+ int ret;
62691+
62692+ if (REISER4_DEBUG)
62693+ printk(KERN_INFO "loading reiser4 bitmap...");
62694+ start_time = jiffies;
62695+
62696+ for (i = 0; i < bitmap_blocks_nr; i++) {
62697+ bnode = data->bitmap + i;
62698+ ret = load_and_lock_bnode(bnode);
62699+ if (ret) {
62700+ reiser4_destroy_allocator_bitmap(allocator,
62701+ super);
62702+ return ret;
62703+ }
62704+ release_and_unlock_bnode(bnode);
62705+ }
62706+
62707+ elapsed_time = jiffies - start_time;
62708+ if (REISER4_DEBUG)
62709+ printk("...done (%llu jiffies)\n",
62710+ (unsigned long long)elapsed_time);
62711+ }
62712+
62713+ return 0;
62714+}
62715+
62716+/* plugin->u.space_allocator.destroy_allocator
62717+ destructor. It is called on fs unmount */
62718+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
62719+ struct super_block *super)
62720+{
62721+ bmap_nr_t bitmap_blocks_nr;
62722+ bmap_nr_t i;
62723+
62724+ struct bitmap_allocator_data *data = allocator->u.generic;
62725+
62726+ assert("zam-414", data != NULL);
62727+ assert("zam-376", data->bitmap != NULL);
62728+
62729+ bitmap_blocks_nr = get_nr_bmap(super);
62730+
62731+ for (i = 0; i < bitmap_blocks_nr; i++) {
62732+ struct bitmap_node *bnode = data->bitmap + i;
62733+
62734+ mutex_lock(&bnode->mutex);
62735+
62736+#if REISER4_DEBUG
62737+ if (atomic_read(&bnode->loaded)) {
62738+ jnode *wj = bnode->wjnode;
62739+ jnode *cj = bnode->cjnode;
62740+
62741+ assert("zam-480", jnode_page(cj) != NULL);
62742+ assert("zam-633", jnode_page(wj) != NULL);
62743+
62744+ assert("zam-634",
62745+ memcmp(jdata(wj), jdata(cj),
62746+ bmap_size(super->s_blocksize)) == 0);
62747+
62748+ }
62749+#endif
62750+ done_bnode(bnode);
62751+ mutex_unlock(&bnode->mutex);
62752+ }
62753+
62754+ vfree(data->bitmap);
62755+ kfree(data);
62756+
62757+ allocator->u.generic = NULL;
62758+
62759+ return 0;
62760+}
62761+
62762+/*
62763+ * Local variables:
62764+ * c-indentation-style: "K&R"
62765+ * mode-name: "LC"
62766+ * c-basic-offset: 8
62767+ * tab-width: 8
62768+ * fill-column: 79
62769+ * scroll-step: 1
62770+ * End:
62771+ */
62772diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.22/fs/reiser4/plugin/space/bitmap.h
62773--- linux-2.6.22.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 03:00:00.000000000 +0300
62774+++ linux-2.6.22/fs/reiser4/plugin/space/bitmap.h 2007-07-29 00:25:35.004729608 +0400
62775@@ -0,0 +1,47 @@
62776+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62777+
62778+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
62779+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
62780+
62781+#include "../../dformat.h"
62782+#include "../../block_alloc.h"
62783+
62784+#include <linux/types.h> /* for __u?? */
62785+#include <linux/fs.h> /* for struct super_block */
62786+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
62787+/* declarations of functions implementing methods of space allocator plugin for
62788+ bitmap based allocator. The functions themselves are in bitmap.c */
62789+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
62790+ struct super_block *, void *);
62791+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
62792+ struct super_block *);
62793+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
62794+ reiser4_blocknr_hint *, int needed,
62795+ reiser4_block_nr * start,
62796+ reiser4_block_nr * len);
62797+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
62798+ const reiser4_block_nr *, int);
62799+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
62800+ reiser4_block_nr,
62801+ reiser4_block_nr);
62802+extern int reiser4_pre_commit_hook_bitmap(void);
62803+
62804+#define reiser4_post_commit_hook_bitmap() do{}while(0)
62805+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
62806+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
62807+
62808+typedef __u64 bmap_nr_t;
62809+typedef __u32 bmap_off_t;
62810+
62811+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
62812+
62813+/* Make Linus happy.
62814+ Local variables:
62815+ c-indentation-style: "K&R"
62816+ mode-name: "LC"
62817+ c-basic-offset: 8
62818+ tab-width: 8
62819+ fill-column: 120
62820+ scroll-step: 1
62821+ End:
62822+*/
62823diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/Makefile linux-2.6.22/fs/reiser4/plugin/space/Makefile
62824--- linux-2.6.22.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 03:00:00.000000000 +0300
62825+++ linux-2.6.22/fs/reiser4/plugin/space/Makefile 2007-07-29 00:25:35.004729608 +0400
62826@@ -0,0 +1,4 @@
62827+obj-$(CONFIG_REISER4_FS) += space_plugins.o
62828+
62829+space_plugins-objs := \
62830+ bitmap.o
62831diff -urN linux-2.6.22.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.22/fs/reiser4/plugin/space/space_allocator.h
62832--- linux-2.6.22.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
62833+++ linux-2.6.22/fs/reiser4/plugin/space/space_allocator.h 2007-07-29 00:25:35.004729608 +0400
62834@@ -0,0 +1,80 @@
62835+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62836+
62837+#ifndef __SPACE_ALLOCATOR_H__
62838+#define __SPACE_ALLOCATOR_H__
62839+
62840+#include "../../forward.h"
62841+#include "bitmap.h"
62842+/* Bitmap is the only space allocator implemented so far. This macro generates
62843+ * thin static inline wrappers (sa_*) that dispatch to the named allocator. */
62844+#define DEF_SPACE_ALLOCATOR(allocator) \
62845+ \
62846+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
62847+{ \
62848+ return reiser4_init_allocator_##allocator (al, s, opaque); \
62849+} \
62850+ \
62851+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
62852+{ \
62853+ reiser4_destroy_allocator_##allocator (al, s); \
62854+} \
62855+ \
62856+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
62857+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
62858+{ \
62859+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
62860+} \
62861+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
62862+{ \
62863+ reiser4_dealloc_blocks_##allocator (al, start, len); \
62864+} \
62865+ \
62866+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
62867+{ \
62868+ reiser4_check_blocks_##allocator (start, end, desired); \
62869+} \
62870+ \
62871+static inline void sa_pre_commit_hook (void) \
62872+{ \
62873+ reiser4_pre_commit_hook_##allocator (); \
62874+} \
62875+ \
62876+static inline void sa_post_commit_hook (void) \
62877+{ \
62878+ reiser4_post_commit_hook_##allocator (); \
62879+} \
62880+ \
62881+static inline void sa_post_write_back_hook (void) \
62882+{ \
62883+ reiser4_post_write_back_hook_##allocator(); \
62884+} \
62885+ \
62886+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
62887+{ \
62888+ reiser4_print_info_##allocator (prefix, al); \
62889+}
62890+
62891+DEF_SPACE_ALLOCATOR(bitmap)
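
For reference, DEF_SPACE_ALLOCATOR(bitmap) pastes the allocator name into each wrapper via the ## operator, so the first inline above expands to roughly:

    static inline int sa_init_allocator(reiser4_space_allocator *al,
                                        struct super_block *s, void *opaque)
    {
            return reiser4_init_allocator_bitmap(al, s, opaque);
    }

Likewise sa_alloc_blocks() dispatches to reiser4_alloc_blocks_bitmap(), sa_pre_commit_hook() to reiser4_pre_commit_hook_bitmap(), and so on; note that the int returned by reiser4_pre_commit_hook_bitmap() is silently discarded by the void sa_pre_commit_hook() wrapper.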
62892+
62893+/* this object is part of reiser4 private in-core super block */
62894+struct reiser4_space_allocator {
62895+ union {
62896+ /* space allocators might use this pointer to reference their
62897+ * data. */
62898+ void *generic;
62899+ } u;
62900+};
62901+
62902+/* __SPACE_ALLOCATOR_H__ */
62903+#endif
62904+
62905+/* Make Linus happy.
62906+ Local variables:
62907+ c-indentation-style: "K&R"
62908+ mode-name: "LC"
62909+ c-basic-offset: 8
62910+ tab-width: 8
62911+ fill-column: 120
62912+ scroll-step: 1
62913+ End:
62914+*/
62915diff -urN linux-2.6.22.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.22/fs/reiser4/plugin/tail_policy.c
62916--- linux-2.6.22.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 03:00:00.000000000 +0300
62917+++ linux-2.6.22/fs/reiser4/plugin/tail_policy.c 2007-07-29 00:25:35.008730643 +0400
62918@@ -0,0 +1,113 @@
62919+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62920+ * reiser4/README */
62921+
62922+/* Formatting policy plugins */
62923+
62924+/*
62925+ * A formatting policy plugin is used by the object plugin (of a regular
62926+ * file) to convert a file between two representations.
62927+ *
62928+ * Currently the following policies are implemented:
62929+ * never store file in formatted nodes
62930+ * always store file in formatted nodes
62931+ * store file in formatted nodes if file is smaller than 4 blocks (default)
62932+ */
62933+
62934+#include "../tree.h"
62935+#include "../inode.h"
62936+#include "../super.h"
62937+#include "object.h"
62938+#include "plugin.h"
62939+#include "node/node.h"
62940+#include "plugin_header.h"
62941+
62942+#include <linux/pagemap.h>
62943+#include <linux/fs.h> /* For struct inode */
62944+
62945+/**
62946+ * have_formatting_never - formatting policy which never stores tails
62947+ * @inode: inode to operate on
62948+ * @size: new object size
62949+ *
62950+ * Always returns 0: the file body is never stored in formatted (tail) items.
62951+ */
62952+/* Never store file's tail as direct item */
62953+/* Audited by: green(2002.06.12) */
62954+static int have_formatting_never(const struct inode *inode UNUSED_ARG
62955+ /* inode to operate on */ ,
62956+ loff_t size UNUSED_ARG /* new object size */ )
62957+{
62958+ return 0;
62959+}
62960+
62961+/* Always store file's tail as direct item */
62962+/* Audited by: green(2002.06.12) */
62963+static int
62964+have_formatting_always(const struct inode *inode UNUSED_ARG
62965+ /* inode to operate on */ ,
62966+ loff_t size UNUSED_ARG /* new object size */ )
62967+{
62968+ return 1;
62969+}
62970+
62971+/* This function tests whether the file denoted by @inode should be stored
62972+ as tail items only or as extents only. */
62973+static int
62974+have_formatting_default(const struct inode *inode UNUSED_ARG
62975+ /* inode to operate on */ ,
62976+ loff_t size /* new object size */ )
62977+{
62978+ assert("umka-1253", inode != NULL);
62979+
62980+ if (size > inode->i_sb->s_blocksize * 4)
62981+ return 0;
62982+
62983+ return 1;
62984+}
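
With the default policy and a 4096-byte blocksize the cutoff is therefore 4 * 4096 = 16384 bytes: a file of exactly 16384 bytes is still stored in tail items, while a 16385-byte file goes to extents. A trivial user-space check of the predicate (the superblock dereference is replaced by a plain blocksize parameter):

    #include <assert.h>

    /* mirrors have_formatting_default(): tails iff size <= 4 blocks */
    static int have_tail(long long size, long blocksize)
    {
            return size > blocksize * 4 ? 0 : 1;
    }

    int main(void)
    {
            assert(have_tail(16384, 4096) == 1);    /* exactly 4 blocks: tails */
            assert(have_tail(16385, 4096) == 0);    /* larger: extents */
            return 0;
    }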
62985+
62986+/* tail plugins */
62987+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
62988+ [NEVER_TAILS_FORMATTING_ID] = {
62989+ .h = {
62990+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
62991+ .id = NEVER_TAILS_FORMATTING_ID,
62992+ .pops = NULL,
62993+ .label = "never",
62994+ .desc = "Never store file's tail",
62995+ .linkage = {NULL, NULL}
62996+ },
62997+ .have_tail = have_formatting_never
62998+ },
62999+ [ALWAYS_TAILS_FORMATTING_ID] = {
63000+ .h = {
63001+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63002+ .id = ALWAYS_TAILS_FORMATTING_ID,
63003+ .pops = NULL,
63004+ .label = "always",
63005+ .desc = "Always store file's tail",
63006+ .linkage = {NULL, NULL}
63007+ },
63008+ .have_tail = have_formatting_always
63009+ },
63010+ [SMALL_FILE_FORMATTING_ID] = {
63011+ .h = {
63012+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63013+ .id = SMALL_FILE_FORMATTING_ID,
63014+ .pops = NULL,
63015+ .label = "4blocks",
63016+ .desc = "store files shorter than 4 blocks in tail items",
63017+ .linkage = {NULL, NULL}
63018+ },
63019+ .have_tail = have_formatting_default
63020+ }
63021+};
63022+
63023+/*
63024+ * Local variables:
63025+ * c-indentation-style: "K&R"
63026+ * mode-name: "LC"
63027+ * c-basic-offset: 8
63028+ * tab-width: 8
63029+ * fill-column: 79
63030+ * End:
63031+ */
63032diff -urN linux-2.6.22.orig/fs/reiser4/pool.c linux-2.6.22/fs/reiser4/pool.c
63033--- linux-2.6.22.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
63034+++ linux-2.6.22/fs/reiser4/pool.c 2007-07-29 00:25:35.008730643 +0400
63035@@ -0,0 +1,231 @@
63036+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63037+ * reiser4/README */
63038+
63039+/* Fast pool allocation.
63040+
63041+ There are situations when a sub-system normally asks the memory allocator
63042+ for only a few objects, but under some circumstances could require many
63043+ more. The typical and actually motivating example is tree balancing. It
63044+ needs to keep track of nodes that were involved in it, and it is well
63045+ known that in a reasonably packed balanced tree most (92.938121%) of all
63046+ balancings end up after working with only a few nodes (3.141592 on
63047+ average). But in rare cases balancing can involve many more nodes
63048+ (3*tree_height+1 in the extremal situation).
63049+
63050+ On the one hand, we don't want to resort to dynamic allocation (slab,
63051+ malloc(), etc.) to allocate data structures required to keep track of
63052+ nodes during balancing. On the other hand, we cannot statically allocate
63053+ the required amount of space on the stack, because first: it is a useless
63054+ wastage of a precious resource, and second: this amount is unknown in
63055+ advance (tree height can change).
63056+
63057+ Pools, implemented in this file, are a solution to this problem:
63058+
63059+ - some configurable amount of objects is statically preallocated on the
63060+ stack
63061+
63062+ - if this preallocated pool is exhausted and more objects are requested,
63063+ they are allocated dynamically.
63064+
63065+ Pools encapsulate the distinction between statically and dynamically
63066+ allocated objects. Both allocation and recycling look exactly the same.
63067+
63068+ To keep track of dynamically allocated objects, the pool adds its own
63069+ linkage to each object.
63070+
63071+ NOTE-NIKITA This linkage also contains some balancing-specific data. This
63072+ is not perfect. On the other hand, balancing is currently the only client
63073+ of pool code.
63074+
63075+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63076+ functions in the style of tslist/tshash, i.e., make them unreadable, but
63077+ type-safe.
63078+
63079+*/
63080+
63081+#include "debug.h"
63082+#include "pool.h"
63083+#include "super.h"
63084+
63085+#include <linux/types.h>
63086+#include <linux/err.h>
63087+
63088+/* initialize new pool object @h */
63089+static void reiser4_init_pool_obj(struct reiser4_pool_header * h)
63090+{
63091+ INIT_LIST_HEAD(&h->usage_linkage);
63092+ INIT_LIST_HEAD(&h->level_linkage);
63093+ INIT_LIST_HEAD(&h->extra_linkage);
63094+}
63095+
63096+/* initialize new pool */
63097+void reiser4_init_pool(struct reiser4_pool * pool /* pool to initialize */ ,
63098+ size_t obj_size /* size of objects in @pool */ ,
63099+ int num_of_objs /* number of preallocated objects */ ,
63100+ char *data /* area for preallocated objects */ )
63101+{
63102+ struct reiser4_pool_header *h;
63103+ int i;
63104+
63105+ assert("nikita-955", pool != NULL);
63106+ assert("nikita-1044", obj_size > 0);
63107+ assert("nikita-956", num_of_objs >= 0);
63108+ assert("nikita-957", data != NULL);
63109+
63110+ memset(pool, 0, sizeof *pool);
63111+ pool->obj_size = obj_size;
63112+ pool->data = data;
63113+ INIT_LIST_HEAD(&pool->free);
63114+ INIT_LIST_HEAD(&pool->used);
63115+ INIT_LIST_HEAD(&pool->extra);
63116+ memset(data, 0, obj_size * num_of_objs);
63117+ for (i = 0; i < num_of_objs; ++i) {
63118+ h = (struct reiser4_pool_header *) (data + i * obj_size);
63119+ reiser4_init_pool_obj(h);
63120+ /* add pool header to the end of pool's free list */
63121+ list_add_tail(&h->usage_linkage, &pool->free);
63122+ }
63123+}
63124+
63125+/* release pool resources
63126+
63127+ Release all resources acquired by this pool, specifically, dynamically
63128+ allocated objects.
63129+
63130+*/
63131+void reiser4_done_pool(struct reiser4_pool * pool UNUSED_ARG)
63132+{
63133+}
63134+
63135+/* allocate carry object from @pool
63136+
63137+ First, try to get preallocated object. If this fails, resort to dynamic
63138+ allocation.
63139+
63140+*/
63141+static void *reiser4_pool_alloc(struct reiser4_pool * pool)
63142+{
63143+ struct reiser4_pool_header *result;
63144+
63145+ assert("nikita-959", pool != NULL);
63146+
63147+ if (!list_empty(&pool->free)) {
63148+ struct list_head *linkage;
63149+
63150+ linkage = pool->free.next;
63151+ list_del(linkage);
63152+ INIT_LIST_HEAD(linkage);
63153+ result = list_entry(linkage, struct reiser4_pool_header,
63154+ usage_linkage);
63155+ BUG_ON(!list_empty(&result->level_linkage) ||
63156+ !list_empty(&result->extra_linkage));
63157+ } else {
63158+ /* pool is empty. Extra allocations don't deserve dedicated
63159+ slab to be served from, as they are expected to be rare. */
63160+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
63161+ if (result != NULL) {
63162+ reiser4_init_pool_obj(result);
63163+ list_add(&result->extra_linkage, &pool->extra);
63164+ } else
63165+ return ERR_PTR(RETERR(-ENOMEM));
63166+ BUG_ON(!list_empty(&result->usage_linkage) ||
63167+ !list_empty(&result->level_linkage));
63168+ }
63169+ ++pool->objs;
63170+ list_add(&result->usage_linkage, &pool->used);
63171+ memset(result + 1, 0, pool->obj_size - sizeof *result);
63172+ return result;
63173+}
63174+
63175+/* return object back to the pool */
63176+void reiser4_pool_free(struct reiser4_pool * pool,
63177+ struct reiser4_pool_header * h)
63178+{
63179+ assert("nikita-961", h != NULL);
63180+ assert("nikita-962", pool != NULL);
63181+
63182+ --pool->objs;
63183+ assert("nikita-963", pool->objs >= 0);
63184+
63185+ list_del_init(&h->usage_linkage);
63186+ list_del_init(&h->level_linkage);
63187+
63188+ if (list_empty(&h->extra_linkage))
63189+ /*
63190+ * pool header is not an extra one. Push it onto free list
63191+ * using usage_linkage
63192+ */
63193+ list_add(&h->usage_linkage, &pool->free);
63194+ else {
63195+ /* remove pool header from pool's extra list and kfree it */
63196+ list_del(&h->extra_linkage);
63197+ kfree(h);
63198+ }
63199+}
63200+
63201+/* add new object to the carry level list
63202+
63203+ Carry level is FIFO most of the time, but not always. Complications arise
63204+ when the make_space() function tries to go to the left neighbor and thus
63205+ adds a carry node before existing nodes, and also because, when updating
63206+ delimiting keys after moving data between two nodes, we want the left
63207+ node to be locked before the right node.
63208+
63209+ The latter case is confusing at first glance. The problem is that the
63210+ COP_UPDATE operation that updates delimiting keys is sometimes called with
63211+ two nodes (when data are moved between two nodes) and sometimes with only
63212+ one node (when the leftmost item is deleted in a node). In any case the
63213+ operation is supplied with at least the node whose left delimiting key is
63214+ to be updated (that is, the "right" node).
63215+
63216+ @pool - from which to allocate new object;
63217+ @list - where to add object;
63218+ @reference - after (or before) which existing object to add
63219+*/
63220+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
63221+ struct list_head *list,
63222+ pool_ordering order,
63223+ struct reiser4_pool_header * reference)
63224+{
63225+ struct reiser4_pool_header *result;
63226+
63227+ assert("nikita-972", pool != NULL);
63228+
63229+ result = reiser4_pool_alloc(pool);
63230+ if (IS_ERR(result))
63231+ return result;
63232+
63233+ assert("nikita-973", result != NULL);
63234+
63235+ switch (order) {
63236+ case POOLO_BEFORE:
63237+ __list_add(&result->level_linkage,
63238+ reference->level_linkage.prev,
63239+ &reference->level_linkage);
63240+ break;
63241+ case POOLO_AFTER:
63242+ __list_add(&result->level_linkage,
63243+ &reference->level_linkage,
63244+ reference->level_linkage.next);
63245+ break;
63246+ case POOLO_LAST:
63247+ list_add_tail(&result->level_linkage, list);
63248+ break;
63249+ case POOLO_FIRST:
63250+ list_add(&result->level_linkage, list);
63251+ break;
63252+ default:
63253+ wrong_return_value("nikita-927", "order");
63254+ }
63255+ return result;
63256+}
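
Putting the pieces together: a client embeds struct reiser4_pool_header as the first member of its object (reiser4_pool_alloc() zeroes everything after the header, so the header must come first), carves the preallocated area out of its own storage, and allocates through reiser4_add_obj(). A hypothetical usage sketch (my_item, the level list and the sizes are illustrative, not from the patch):

    #include <linux/err.h>
    #include <linux/list.h>

    struct my_item {
            struct reiser4_pool_header header;      /* must come first */
            int payload;
    };

    static void pool_usage_example(void)
    {
            struct reiser4_pool pool;
            struct list_head level;
            char area[4 * sizeof(struct my_item)];  /* 4 preallocated objects */
            struct reiser4_pool_header *h;

            INIT_LIST_HEAD(&level);
            reiser4_init_pool(&pool, sizeof(struct my_item), 4, area);

            /* first four allocations come from 'area'; a fifth would kmalloc */
            h = reiser4_add_obj(&pool, &level, POOLO_LAST, NULL);
            if (IS_ERR(h))
                    return;
            ((struct my_item *)h)->payload = 42;

            reiser4_pool_free(&pool, h);
            reiser4_done_pool(&pool);
    }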
63257+
63258+/* Make Linus happy.
63259+ Local variables:
63260+ c-indentation-style: "K&R"
63261+ mode-name: "LC"
63262+ c-basic-offset: 8
63263+ tab-width: 8
63264+ fill-column: 120
63265+ End:
63266+*/
63267diff -urN linux-2.6.22.orig/fs/reiser4/pool.h linux-2.6.22/fs/reiser4/pool.h
63268--- linux-2.6.22.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
63269+++ linux-2.6.22/fs/reiser4/pool.h 2007-07-29 00:25:35.008730643 +0400
63270@@ -0,0 +1,56 @@
63271+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63272+
63273+/* Fast pool allocation */
63274+
63275+#ifndef __REISER4_POOL_H__
63276+#define __REISER4_POOL_H__
63277+
63278+#include <linux/types.h>
63279+
63280+struct reiser4_pool {
63281+ size_t obj_size;
63282+ int objs;
63283+ char *data;
63284+ struct list_head free;
63285+ struct list_head used;
63286+ struct list_head extra;
63287+};
63288+
63289+struct reiser4_pool_header {
63290+ /* object is either on free or "used" lists */
63291+ struct list_head usage_linkage;
63292+ struct list_head level_linkage;
63293+ struct list_head extra_linkage;
63294+};
63295+
63296+typedef enum {
63297+ POOLO_BEFORE,
63298+ POOLO_AFTER,
63299+ POOLO_LAST,
63300+ POOLO_FIRST
63301+} pool_ordering;
63302+
63303+/* pool manipulation functions */
63304+
63305+extern void reiser4_init_pool(struct reiser4_pool * pool, size_t obj_size,
63306+ int num_of_objs, char *data);
63307+extern void reiser4_done_pool(struct reiser4_pool * pool);
63308+extern void reiser4_pool_free(struct reiser4_pool * pool,
63309+ struct reiser4_pool_header * h);
63310+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
63311+ struct list_head * list,
63312+ pool_ordering order,
63313+ struct reiser4_pool_header *reference);
63314+
63315+/* __REISER4_POOL_H__ */
63316+#endif
63317+
63318+/* Make Linus happy.
63319+ Local variables:
63320+ c-indentation-style: "K&R"
63321+ mode-name: "LC"
63322+ c-basic-offset: 8
63323+ tab-width: 8
63324+ fill-column: 120
63325+ End:
63326+*/
63327diff -urN linux-2.6.22.orig/fs/reiser4/readahead.c linux-2.6.22/fs/reiser4/readahead.c
63328--- linux-2.6.22.orig/fs/reiser4/readahead.c 1970-01-01 03:00:00.000000000 +0300
63329+++ linux-2.6.22/fs/reiser4/readahead.c 2007-07-29 00:25:35.008730643 +0400
63330@@ -0,0 +1,138 @@
63331+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63332+ * reiser4/README */
63333+
63334+#include "forward.h"
63335+#include "tree.h"
63336+#include "tree_walk.h"
63337+#include "super.h"
63338+#include "inode.h"
63339+#include "key.h"
63340+#include "znode.h"
63341+
63342+#include <linux/swap.h> /* for totalram_pages */
63343+
63344+void reiser4_init_ra_info(ra_info_t * rai)
63345+{
63346+ rai->key_to_stop = *reiser4_min_key();
63347+}
63348+
63349+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
63350+static inline int ra_adjacent_only(int flags)
63351+{
63352+ return flags & RA_ADJACENT_ONLY;
63353+}
63354+
63355+/* this is used by formatted_readahead to decide whether a read for the right neighbor of a node is to be issued;
63356+ it returns 1 if the right neighbor's first key is less than or equal to the readahead stop key */
63357+static int should_readahead_neighbor(znode * node, ra_info_t * info)
63358+{
63359+ int result;
63360+
63361+ read_lock_dk(znode_get_tree(node));
63362+ result = keyle(znode_get_rd_key(node), &info->key_to_stop);
63363+ read_unlock_dk(znode_get_tree(node));
63364+ return result;
63365+}
63366+
63367+#define LOW_MEM_PERCENTAGE (5)
63368+
63369+static int low_on_memory(void)
63370+{
63371+ unsigned int freepages;
63372+
63373+ freepages = nr_free_pages();
63374+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
63375+}
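
As a worked example: on a machine with 1 GiB of RAM and 4 KiB pages (totalram_pages = 262144), low_on_memory() reports true, and readahead is suppressed, once fewer than 262144 * 5 / 100 = 13107 pages (about 51 MiB) remain free.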
63376+
63377+/* start read for @node and for a few of its right neighbors */
63378+void formatted_readahead(znode * node, ra_info_t * info)
63379+{
63380+ struct formatted_ra_params *ra_params;
63381+ znode *cur;
63382+ int i;
63383+ int grn_flags;
63384+ lock_handle next_lh;
63385+
63386+ /* do nothing if no block number has been assigned to the node yet (i.e. it exists only in memory and has no on-disk location). */
63387+ if (reiser4_blocknr_is_fake(znode_get_block(node)))
63388+ return;
63389+
63390+ ra_params = get_current_super_ra_params();
63391+
63392+ if (znode_page(node) == NULL)
63393+ jstartio(ZJNODE(node));
63394+
63395+ if (znode_get_level(node) != LEAF_LEVEL)
63396+ return;
63397+
63398+ /* don't waste memory for read-ahead when low on memory */
63399+ if (low_on_memory())
63400+ return;
63401+
63402+ /* We can have locked nodes on upper tree levels, in this situation lock
63403+ priorities do not help to resolve deadlocks, we have to use TRY_LOCK
63404+ here. */
63405+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
63406+
63407+ i = 0;
63408+ cur = zref(node);
63409+ init_lh(&next_lh);
63410+ while (i < ra_params->max) {
63411+ const reiser4_block_nr *nextblk;
63412+
63413+ if (!should_readahead_neighbor(cur, info))
63414+ break;
63415+
63416+ if (reiser4_get_right_neighbor
63417+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
63418+ break;
63419+
63420+ nextblk = znode_get_block(next_lh.node);
63421+ if (reiser4_blocknr_is_fake(nextblk) ||
63422+ (ra_adjacent_only(ra_params->flags)
63423+ && *nextblk != *znode_get_block(cur) + 1)) {
63424+ break;
63425+ }
63426+
63427+ zput(cur);
63428+ cur = zref(next_lh.node);
63429+ done_lh(&next_lh);
63430+ if (znode_page(cur) == NULL)
63431+ jstartio(ZJNODE(cur));
63432+ else
63433+ /* Do not scan read-ahead window if pages already
63434+ * allocated (and i/o already started). */
63435+ break;
63436+
63437+ i++;
63438+ }
63439+ zput(cur);
63440+ done_lh(&next_lh);
63441+}
63442+
63443+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
63444+{
63445+ reiser4_key *stop_key;
63446+
63447+ assert("nikita-3542", dir != NULL);
63448+ assert("nikita-3543", tap != NULL);
63449+
63450+ stop_key = &tap->ra_info.key_to_stop;
63451+ /* initialize readdir readahead information: include into readahead
63452+ * stat data of all files of the directory */
63453+ set_key_locality(stop_key, get_inode_oid(dir));
63454+ set_key_type(stop_key, KEY_SD_MINOR);
63455+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
63456+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
63457+ set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
63458+}
63459+
63460+/*
63461+ Local variables:
63462+ c-indentation-style: "K&R"
63463+ mode-name: "LC"
63464+ c-basic-offset: 8
63465+ tab-width: 8
63466+ fill-column: 80
63467+ End:
63468+*/
63469diff -urN linux-2.6.22.orig/fs/reiser4/readahead.h linux-2.6.22/fs/reiser4/readahead.h
63470--- linux-2.6.22.orig/fs/reiser4/readahead.h 1970-01-01 03:00:00.000000000 +0300
63471+++ linux-2.6.22/fs/reiser4/readahead.h 2007-07-29 00:25:35.008730643 +0400
63472@@ -0,0 +1,51 @@
63473+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63474+
63475+#ifndef __READAHEAD_H__
63476+#define __READAHEAD_H__
63477+
63478+#include "key.h"
63479+
63480+typedef enum {
63481+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent.
63482+ Default is NO (not only adjacent) */
63483+} ra_global_flags;
63484+
63485+/* reiser4 super block has a field of this type.
63486+ It controls readahead during tree traversals */
63487+struct formatted_ra_params {
63488+ unsigned long max; /* request not more than this amount of nodes.
63489+ Default is totalram_pages / 4 */
44254afd 63490+ int flags;
71430cf6 63491+};
63492+
63493+typedef struct {
63494+ reiser4_key key_to_stop;
63495+} ra_info_t;
63496+
63497+void formatted_readahead(znode *, ra_info_t *);
71430cf6 63498+void reiser4_init_ra_info(ra_info_t * rai);
63499+
63500+struct reiser4_file_ra_state {
63501+ loff_t start; /* Current window */
63502+ loff_t size;
63503+ loff_t next_size; /* Next window size */
63504+ loff_t ahead_start; /* Ahead window */
63505+ loff_t ahead_size;
63506+ loff_t max_window_size; /* Maximum readahead window */
63507+ loff_t slow_start; /* enlarging r/a size algorithm. */
63508+};
63509+
63510+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
63511+
63512+/* __READAHEAD_H__ */
63513+#endif
63514+
63515+/*
63516+ Local variables:
63517+ c-indentation-style: "K&R"
63518+ mode-name: "LC"
63519+ c-basic-offset: 8
63520+ tab-width: 8
63521+ fill-column: 120
63522+ End:
63523+*/
63524diff -urN linux-2.6.22.orig/fs/reiser4/README linux-2.6.22/fs/reiser4/README
63525--- linux-2.6.22.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
63526+++ linux-2.6.22/fs/reiser4/README 2007-07-29 00:25:35.008730643 +0400
63527@@ -0,0 +1,128 @@
63528+[LICENSING]
63529+
63530+Reiser4 is hereby licensed under the GNU General
63531+Public License version 2.
63532+
63533+Source code files that contain the phrase "licensing governed by
63534+reiser4/README" are "governed files" throughout this file. Governed
63535+files are licensed under the GPL. The portions of them owned by Hans
63536+Reiser, or authorized to be licensed by him, have been in the past,
63537+and likely will be in the future, licensed to other parties under
63538+other licenses. If you add your code to governed files, and don't
63539+want it to be owned by Hans Reiser, put your copyright label on that
63540+code so the poor blight and his customers can keep things straight.
63541+All portions of governed files not labeled otherwise are owned by Hans
63542+Reiser, and by adding your code to it, widely distributing it to
63543+others or sending us a patch, and leaving the sentence in stating that
63544+licensing is governed by the statement in this file, you accept this.
63545+It will be a kindness if you identify whether Hans Reiser is allowed
63546+to license code labeled as owned by you on your behalf other than
63547+under the GPL, because he wants to know if it is okay to do so and put
63548+a check in the mail to you (for non-trivial improvements) when he
63549+makes his next sale. He makes no guarantees as to the amount if any,
63550+though he feels motivated to motivate contributors, and you can surely
63551+discuss this with him before or after contributing. You have the
63552+right to decline to allow him to license your code contribution other
63553+than under the GPL.
63554+
63555+Further licensing options are available for commercial and/or other
63556+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
63557+the GPL as not allowing those additional licensing options, you read
63558+it wrongly, and Richard Stallman agrees with me, when carefully read
63559+you can see that those restrictions on additional terms do not apply
63560+to the owner of the copyright, and my interpretation of this shall
63561+govern for this license.
63562+
63563+[END LICENSING]
63564+
63565+Reiser4 is a file system based on dancing tree algorithms, and is
63566+described at http://www.namesys.com
63567+
63568+mkfs.reiser4 and other utilities are on our webpage or wherever your
63569+Linux provider put them. You really want to be running the latest
63570+version off the website if you use fsck.
63571+
63572+Yes, if you update your reiser4 kernel module you do have to
63573+recompile your kernel, most of the time. The errors you get will be
63574+quite cryptic if you forget to do so.
63575+
63576+Hideous Commercial Pitch: Spread your development costs across other OS
63577+vendors. Select from the best in the world, not the best in your
63578+building, by buying from third party OS component suppliers. Leverage
63579+the software component development power of the internet. Be the most
63580+aggressive in taking advantage of the commercial possibilities of
63581+decentralized internet development, and add value through your branded
63582+integration that you sell as an operating system. Let your competitors
63583+be the ones to compete against the entire internet by themselves. Be
63584+hip, get with the new economic trend, before your competitors do. Send
63585+email to reiser@namesys.com
63586+
63587+Hans Reiser was the primary architect of Reiser4, but a whole team
63588+chipped their ideas in. He invested everything he had into Namesys
63589+for 5.5 dark years of no money before Reiser3 finally started to work well
63590+enough to bring in money. He owns the copyright.
63591+
63592+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
63593+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
63594+opinion, unique in its willingness to invest into things more
63595+theoretical than the VC community can readily understand, and more
63596+longterm than allows them to be sure that they will be the ones to
63597+extract the economic benefits from. DARPA also integrated us into a
63598+security community that transformed our security worldview.
63599+
63600+Vladimir Saveliev is our lead programmer, with us from the beginning,
63601+and he worked long hours writing the cleanest code. This is why he is
63602+now the lead programmer after years of commitment to our work. He
63603+always made the effort to be the best he could be, and to make his
63604+code the best that it could be. What resulted was quite remarkable. I
63605+don't think that money can ever motivate someone to work the way he
63606+did, he is one of the most selfless men I know.
63607+
63608+Alexander Lyamin was our sysadmin, and helped to educate us in
63609+security issues. Moscow State University and IMT were very generous
63610+in the internet access they provided us, and in lots of other little
63611+ways that a generous institution can be.
63612+
63613+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
63614+locking code, the block allocator, and finished the flushing code.
63615+His code is always crystal clean and well structured.
63616+
63617+Nikita Danilov wrote the core of the balancing code, the core of the
63618+plugins code, and the directory code. He worked a steady pace of long
63619+hours that produced a whole lot of well abstracted code. He is our
63620+senior computer scientist.
63621+
63622+Vladimir Demidov wrote the parser. Writing an in kernel parser is
63623+something very few persons have the skills for, and it is thanks to
63624+him that we can say that the parser is really not so big compared to
63625+various bits of our other code, and making a parser work in the kernel
63626+was not so complicated as everyone would imagine mainly because it was
63627+him doing it...
63628+
63629+Joshua McDonald wrote the transaction manager, and the flush code.
63630+The flush code unexpectedly turned out to be extremely hairy for reasons
63631+you can read about on our web page, and he did a great job on an
63632+extremely difficult task.
63633+
63634+Nina Reiser handled our accounting, government relations, and much
63635+more.
63636+
63637+Ramon Reiser developed our website.
63638+
63639+Beverly Palmer drew our graphics.
63640+
63641+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
63642+and worked with Umka on developing libreiser4 and userspace plugins.
63643+
63644+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
63645+userspace tools (reiser4progs).
63646+
63647+Oleg Drokin (aka Green) is the release manager who fixes everything.
63648+It is so nice to have someone like that on the team. He (plus Chris
63649+and Jeff) make it possible for the entire rest of the Namesys team to
63650+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
63651+is just amazing to watch his talent for spotting bugs in action.
63652+
63653+Edward Shishkin wrote the cryptcompress file plugin (which manages files
63654+built of encrypted and/or compressed bodies) and other plugins related
63655+to transparent encryption and compression support.
63656diff -urN linux-2.6.22.orig/fs/reiser4/reiser4.h linux-2.6.22/fs/reiser4/reiser4.h
63657--- linux-2.6.22.orig/fs/reiser4/reiser4.h 1970-01-01 03:00:00.000000000 +0300
63658+++ linux-2.6.22/fs/reiser4/reiser4.h 2007-07-29 00:25:35.012731678 +0400
63659@@ -0,0 +1,269 @@
63660+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63661+ * reiser4/README */
63662+
63663+/* definitions of common constants used by reiser4 */
63664+
63665+#if !defined( __REISER4_H__ )
63666+#define __REISER4_H__
63667+
63668+#include <asm/param.h> /* for HZ */
63669+#include <linux/errno.h>
63670+#include <linux/types.h>
63671+#include <linux/fs.h>
63672+#include <linux/hardirq.h>
63673+#include <linux/sched.h>
63674+
63675+/*
63676+ * reiser4 compilation options.
63677+ */
63678+
63679+#if defined(CONFIG_REISER4_DEBUG)
63680+/* turn on assertion checks */
63681+#define REISER4_DEBUG (1)
63682+#else
63683+#define REISER4_DEBUG (0)
63684+#endif
63685+
63686+#if defined(CONFIG_ZLIB_INFLATE)
63687+/* turn on zlib */
63688+#define REISER4_ZLIB (1)
63689+#else
63690+#define REISER4_ZLIB (0)
63691+#endif
63692+
63693+#if defined(CONFIG_CRYPTO_SHA256)
63694+#define REISER4_SHA256 (1)
63695+#else
63696+#define REISER4_SHA256 (0)
63697+#endif
63698+
63699+/*
63700+ * Turn on large keys mode. In this mode (which is the default), a reiser4
63701+ * key has four 8-byte components. In the old "small key" mode, it has three
63702+ * 8-byte components. The additional component, referred to as "ordering", is
63703+ * used to order the items of which a given object is composed. As such,
63704+ * ordering is placed between locality and objectid. For directory items,
63705+ * ordering contains an initial prefix of the file name the item is for. This
63706+ * sorts all directory items within a given directory lexicographically (but
63707+ * see fibration.[ch]). For file body and stat-data items, ordering contains
63708+ * an initial prefix of the name the file was initially created with. In the
63709+ * common case (files with a single name) this orders file bodies and
63710+ * stat-data in the same order as their respective directory entries, thus
63711+ * speeding up readdir.
63712+ *
63713+ * Note that the kernel can only mount a file system with the same key size
63714+ * as the one it was compiled for, so flipping this option may render your
63715+ * data inaccessible.
63716+ */
63717+#define REISER4_LARGE_KEY (1)
63718+/*#define REISER4_LARGE_KEY (0)*/
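/*
 * Editorial illustration (not part of the original patch): with
 * REISER4_LARGE_KEY the four 8-byte key words are laid out roughly as
 *
 *	word 0: locality (low bits hold the key type)
 *	word 1: ordering
 *	word 2: objectid
 *	word 3: offset
 *
 * while the small-key format drops the ordering word. See key.h elsewhere in
 * this patch for the authoritative layout; the split shown here is inferred
 * from the comment above and from the safe-link key diagram in safe_link.c.
 */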
63719+
63720+/*#define GUESS_EXISTS 1*/
63721+
63722+/*
63723+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
63724+ * option
63725+ */
63726+
63727+extern const char *REISER4_SUPER_MAGIC_STRING;
63728+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
63729+ * beginning of device */
63730+
63731+/* here go tunable parameters that are not worth special entry in kernel
63732+ configuration */
63733+
63734+/* default number of slots in coord-by-key caches */
63735+#define CBK_CACHE_SLOTS (16)
63736+/* how many elementary tree operations to carry on the next level */
63737+#define CARRIES_POOL_SIZE (5)
63738+/* size of pool of preallocated nodes for carry process. */
63739+#define NODES_LOCKED_POOL_SIZE (5)
63740+
63741+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63742+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
63743+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
63744+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
63745+
63746+/* we are supporting reservation of disk space on uid basis */
63747+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
63748+/* we are supporting reservation of disk space for groups */
63749+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
63750+/* we are supporting reservation of disk space for root */
63751+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
63752+/* we use rapid flush mode, see flush.c for comments. */
63753+#define REISER4_USE_RAPID_FLUSH (1)
63754+
63755+/*
63756+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
63757+ */
63758+#define REISER4_USE_ENTD (1)
63759+
63760+/* key allocation is Plan-A */
63761+#define REISER4_PLANA_KEY_ALLOCATION (1)
63762+/* key allocation follows good old 3.x scheme */
63763+#define REISER4_3_5_KEY_ALLOCATION (0)
63764+
63765+/* size of hash-table for znodes */
63766+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
63767+
63768+/* number of buckets in lnode hash-table */
63769+#define LNODE_HTABLE_BUCKETS (1024)
63770+
63771+/* some ridiculously high maximal limit on height of znode tree. This
63772+ is used in declaration of various per-level arrays and
63773+ to allocate statistics gathering array for per-level stats. */
63774+#define REISER4_MAX_ZTREE_HEIGHT (8)
63775+
63776+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
63777+
63778+/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
63779+ sequential search is on average faster than binary. This is because
63780+ of better optimization and because sequential search is more CPU
63781+ cache friendly. This number (25) was found by experiments on a dual AMD
63782+ Athlon(tm), 1400MHz.
63783+
63784+ NOTE: testing in kernel has shown that binary search is more effective than
63785+ implied by results of the user level benchmarking. Probably because in a
63786+ node, keys are separated by other data. So the value was adjusted after a
63787+ few tests. More thorough tuning is needed.
63788+*/
63789+#define REISER4_SEQ_SEARCH_BREAK (3)
63790+
63791+/* don't allow tree to be lower than this */
63792+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
63793+
63794+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
63795+ * available memory. */
63796+/* Default value of maximal atom size. Can be overwritten by the
63797+ tmgr.atom_max_size mount option. By default infinity. */
63798+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
63799+
63800+/* Default value of maximal atom age (in jiffies). After reaching this age
63801+ atom will be forced to commit, either synchronously or asynchronously. Can
63802+ be overwritten by tmgr.atom_max_age mount option. */
63803+#define REISER4_ATOM_MAX_AGE (600 * HZ)
63804+
63805+/* sleeping period for ktxnmgrd */
63806+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
63807+
63808+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
63809+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
63810+
63811+/* start complaining after that many restarts in coord_by_key().
63812+
63813+ This either means incredibly heavy contention for this part of a tree, or
63814+ some corruption or bug.
63815+*/
63816+#define REISER4_CBK_ITERATIONS_LIMIT (100)
63817+
63818+/* return -EIO after that many iterations in coord_by_key().
63819+
63820+ I have witnessed more than 800 iterations (in 30 thread test) before cbk
63821+ finished. --nikita
63822+*/
63823+#define REISER4_MAX_CBK_ITERATIONS 500000
63824+
63825+/* put a per-inode limit on maximal number of directory entries with identical
63826+ keys in hashed directory.
63827+
63828+ Disable this until inheritance interfaces stabilize: we need some way to
63829+ set per directory limit.
63830+*/
63831+#define REISER4_USE_COLLISION_LIMIT (0)
63832+
63833+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level
63834+ blocks, it will force them to be relocated. */
63835+#define FLUSH_RELOCATE_THRESHOLD 64
63836+/* If flush can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
63837+ from the preceder, it will relocate to that position. */
63838+#define FLUSH_RELOCATE_DISTANCE 64
63839+
63840+/* If we have written this many or more blocks before encountering a busy jnode
63841+ in the flush list - abort flushing, hoping that next time we get called
63842+ this jnode will already be clean, and we will save some seeks. */
63843+#define FLUSH_WRITTEN_THRESHOLD 50
63844+
63845+/* The maximum number of nodes to scan left on a level during flush. */
63846+#define FLUSH_SCAN_MAXNODES 10000
63847+
63848+/* per-atom limit of flushers */
63849+#define ATOM_MAX_FLUSHERS (1)
63850+
63851+/* default tracing buffer size */
63852+#define REISER4_TRACE_BUF_SIZE (1 << 15)
63853+
63854+/* what size units of IO we would like cp, etc., to use, in writing to
63855+ reiser4. In bytes.
63856+
63857+ Can be overwritten by optimal_io_size mount option.
63858+*/
63859+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
63860+
63861+/* see comments in inode.c:oid_to_uino() */
63862+#define REISER4_UINO_SHIFT (1 << 30)
63863+
63864+/* Mark function argument as unused to avoid compiler warnings. */
63865+#define UNUSED_ARG __attribute__((unused))
63866+
63867+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
63868+#define NONNULL __attribute__((nonnull))
63869+#else
63870+#define NONNULL
63871+#endif
63872+
63873+/* master super block offset in bytes.*/
63874+#define REISER4_MASTER_OFFSET 65536
63875+
63876+/* size of VFS block */
63877+#define VFS_BLKSIZE 512
63878+/* number of bits in size of VFS block (512==2^9) */
63879+#define VFS_BLKSIZE_BITS 9
63880+
63881+#define REISER4_I reiser4_inode_data
63882+
63883+/* implication */
63884+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
63885+/* logical equivalence */
63886+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
63887+
63888+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
63889+
63890+#define NOT_YET (0)
63891+
63892+/** Reiser4 specific error codes **/
63893+
63894+#define REISER4_ERROR_CODE_BASE 500
63895+
63896+/* Neighbor is not available (side neighbor or parent) */
63897+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
63898+
63899+/* Node was not found in cache */
63900+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
63901+
63902+/* node does not have enough free space to complete the balancing operation */
63903+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
63904+
63905+/* repeat operation */
63906+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
63907+
63908+/* deadlock happens */
63909+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
63910+
63911+/* operation cannot be performed, because it would block and non-blocking mode
63912+ * was requested. */
63913+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
63914+
63915+/* wait some event (depends on context), then repeat */
63916+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
63917+
63918+#endif /* __REISER4_H__ */
63919+
63920+/* Make Linus happy.
63921+ Local variables:
63922+ c-indentation-style: "K&R"
63923+ mode-name: "LC"
63924+ c-basic-offset: 8
63925+ tab-width: 8
63926+ fill-column: 120
63927+ End:
63928+*/
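
As a quick editorial illustration of the ergo()/equi() helpers defined above
(the assertion labels and variables below are hypothetical, not from the
patch):

	/* ergo(A, B) reads "A implies B"; it is vacuously true when A
	 * is false. */
	assert("editorial-1", ergo(page != NULL, node_is_loaded));

	/* equi(A, B): A and B are either both true or both false. */
	assert("editorial-2", equi(count == 0, list_empty(&head)));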
63929diff -urN linux-2.6.22.orig/fs/reiser4/safe_link.c linux-2.6.22/fs/reiser4/safe_link.c
63930--- linux-2.6.22.orig/fs/reiser4/safe_link.c 1970-01-01 03:00:00.000000000 +0300
63931+++ linux-2.6.22/fs/reiser4/safe_link.c 2007-07-29 00:25:35.012731678 +0400
63932@@ -0,0 +1,352 @@
63933+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
63934+ * reiser4/README */
63935+
63936+/* Safe-links. */
63937+
63938+/*
63939+ * Safe-links are used to maintain file system consistency during operations
63940+ * that spawn multiple transactions. For example:
63941+ *
63942+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files
63943+ * without user-visible names in the file system, but still opened by some
63944+ * active process. What happens here is that unlink proper (i.e., removal
63945+ * of the last file name) and file deletion (truncate of file body to zero
63946+ * and deletion of stat-data, that happens when last file descriptor is
63947+ * closed), may belong to different transactions T1 and T2. If a crash
63948+ * happens after T1 commit, but before T2 commit, on-disk file system has
63949+ * a file without name, that is, disk space leak.
63950+ *
63951+ * 2. Truncate. Truncate of a large file may spawn multiple transactions.
63952+ * If the system crashes while the truncate was in progress, the file is
63953+ * left partially truncated, which violates the "atomicity guarantees" of
63954+ * reiser4, viz. that every system call is atomic.
63955+ *
63956+ * Safe-links address both of the above cases. Basically, a safe-link is a
63957+ * way to post some operation to be executed during commit of some
63958+ * transaction other than the current one. (Another way to look at the
63959+ * safe-link is to interpret it as logical logging.)
63960+ *
63961+ * Specifically, at the beginning of unlink a safe-link is inserted in the
63962+ * tree. This safe-link is normally removed by the file deletion code (during
63963+ * transaction T2 in the above terms). Truncate also inserts a safe-link that
63964+ * is normally removed when the truncate operation finishes.
63965+ *
63966+ * This means that in the case of a "clean umount" there are no safe-links in
63967+ * the tree. If safe-links are observed during mount, it means that (a) the
63968+ * system was terminated abnormally, and (b) the safe-links correspond to
63969+ * "pending" (i.e., not finished) operations that were in progress during
63970+ * system termination. Each safe-link records enough information to complete
63971+ * the corresponding operation, and mount simply "replays" them (hence the
63972+ * analogy with logical logging).
63973+ *
63974+ * Safe-links are implemented as blackbox items (see
63975+ * plugin/item/blackbox.[ch]).
63976+ *
63977+ * For the reference: ext3 also has similar mechanism, it's called "an orphan
63978+ * list" there.
63979+ */
63980+
63981+#include "safe_link.h"
63982+#include "debug.h"
63983+#include "inode.h"
63984+
63985+#include "plugin/item/blackbox.h"
63986+
63987+#include <linux/fs.h>
63988+
63989+/*
63990+ * On-disk format of safe-link.
63991+ */
63992+typedef struct safelink {
63993+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
63994+ * for */
63995+ d64 size; /* size to which file should be truncated */
63996+} safelink_t;
63997+
63998+/*
63999+ * locality where safe-link items are stored. Next to the objectid of root
64000+ * directory.
64001+ */
64002+static oid_t safe_link_locality(reiser4_tree * tree)
64003+{
64004+ return get_key_objectid(get_super_private(tree->super)->df_plug->
64005+ root_dir_key(tree->super)) + 1;
64006+}
64007+
64008+/*
64009+ Construct a key for the safe-link. Key has the following format:
64010+
64011+|       60      | 4 |        64        | 4 |       60      |        64        |
64012++---------------+---+------------------+---+---------------+------------------+
64013+|   locality    | 0 |         0        | 0 |   objectid    |    link type     |
64014++---------------+---+------------------+---+---------------+------------------+
64015+|                   |                  |                   |                  |
64016+|      8 bytes      |      8 bytes     |      8 bytes      |      8 bytes     |
64017+
64018+ This is the large keys format. In the small keys format the second 8-byte
64019+ chunk is omitted. Locality is a constant returned by safe_link_locality().
64020+ objectid is the oid of the file on which the operation protected by this
64021+ safe-link is performed. link type is used to distinguish safe-links for
64022+ different operations.
64023+
64024+ */
64025+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64026+ reiser4_safe_link_t link, reiser4_key * key)
64027+{
64028+ reiser4_key_init(key);
64029+ set_key_locality(key, safe_link_locality(tree));
64030+ set_key_objectid(key, oid);
64031+ set_key_offset(key, link);
64032+ return key;
64033+}
64034+
64035+/*
64036+ * how much disk space is necessary to insert and remove (in the
64037+ * error-handling path) safe-link.
64038+ */
64039+static __u64 safe_link_tograb(reiser4_tree * tree)
64040+{
64041+ return
64042+ /* insert safe link */
64043+ estimate_one_insert_item(tree) +
64044+ /* remove safe link */
64045+ estimate_one_item_removal(tree) +
64046+ /* drill to the leaf level during insertion */
64047+ 1 + estimate_one_insert_item(tree) +
64048+ /*
64049+ * possible update of existing safe-link. Actually, if
64050+ * safe-link existed already (we failed to remove it), then no
64051+ * insertion is necessary, so this term is already "covered",
64052+	 * but for simplicity let's leave it.
64053+ */
64054+ 1;
64055+}
64056+
64057+/*
64058+ * grab enough disk space to insert and remove (in the error-handling path)
64059+ * safe-link.
64060+ */
64061+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64062+{
64063+ int result;
64064+
64065+ grab_space_enable();
64066+	/* The sbinfo->delete_mutex can be taken here.
64067+ * safe_link_release() should be called before leaving reiser4
64068+ * context. */
64069+ result =
64070+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64071+ grab_space_enable();
64072+ return result;
64073+}
64074+
64075+/*
64076+ * release unused disk space reserved by safe_link_grab().
64077+ */
64078+void safe_link_release(reiser4_tree * tree)
64079+{
64080+ reiser4_release_reserved(tree->super);
64081+}
64082+
64083+/*
64084+ * insert into tree safe-link for operation @link on inode @inode.
64085+ */
64086+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64087+{
64088+ reiser4_key key;
64089+ safelink_t sl;
64090+ int length;
64091+ int result;
64092+ reiser4_tree *tree;
64093+
64094+ build_sd_key(inode, &sl.sdkey);
64095+ length = sizeof sl.sdkey;
64096+
64097+ if (link == SAFE_TRUNCATE) {
64098+ /*
64099+ * for truncate we have to store final file length also,
64100+ * expand item.
64101+ */
64102+ length += sizeof(sl.size);
64103+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64104+ }
64105+	tree = reiser4_tree_by_inode(inode);
64106+ build_link_key(tree, get_inode_oid(inode), link, &key);
64107+
64108+ result = store_black_box(tree, &key, &sl, length);
64109+ if (result == -EEXIST)
64110+ result = update_black_box(tree, &key, &sl, length);
64111+ return result;
64112+}
64113+
64114+/*
64115+ * remove the safe-link corresponding to the operation @link on the object
64116+ * with oid @oid from the tree.
64117+ */
64118+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64119+{
64120+ reiser4_key key;
64121+
64122+ return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64123+}
64124+
64125+/*
64126+ * in-memory structure to keep information extracted from safe-link. This is
64127+ * used to iterate over all safe-links.
64128+ */
64129+struct safe_link_context {
64130+ reiser4_tree *tree; /* internal tree */
64131+ reiser4_key key; /* safe-link key */
64132+ reiser4_key sdkey; /* key of object stat-data */
64133+ reiser4_safe_link_t link; /* safe-link type */
64134+ oid_t oid; /* object oid */
64135+ __u64 size; /* final size for truncate */
64136+};
64137+
64138+/*
64139+ * start iterating over all safe-links.
64140+ */
64141+static void safe_link_iter_begin(reiser4_tree * tree,
64142+				 struct safe_link_context * ctx)
64143+{
64144+ ctx->tree = tree;
64145+ reiser4_key_init(&ctx->key);
64146+ set_key_locality(&ctx->key, safe_link_locality(tree));
64147+	set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
64148+	set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
64149+}
64150+
64151+/*
64152+ * return next safe-link.
64153+ */
64154+static int safe_link_iter_next(struct safe_link_context * ctx)
64155+{
64156+ int result;
64157+ safelink_t sl;
64158+
64159+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64160+ if (result == 0) {
64161+ ctx->oid = get_key_objectid(&ctx->key);
64162+ ctx->link = get_key_offset(&ctx->key);
64163+ ctx->sdkey = sl.sdkey;
64164+ if (ctx->link == SAFE_TRUNCATE)
64165+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64166+ }
64167+ return result;
64168+}
64169+
64170+/*
64171+ * check whether there are any more safe-links left in the tree.
64172+ */
64173+static int safe_link_iter_finished(struct safe_link_context * ctx)
64174+{
64175+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64176+}
64177+
64178+/*
64179+ * finish safe-link iteration.
64180+ */
64181+static void safe_link_iter_end(struct safe_link_context * ctx)
64182+{
64183+ /* nothing special */
64184+}
64185+
64186+/*
64187+ * process single safe-link.
64188+ */
64189+static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64190+ reiser4_key * sdkey, oid_t oid, __u64 size)
64191+{
64192+ struct inode *inode;
64193+ int result;
64194+
64195+ /*
64196+ * obtain object inode by reiser4_iget(), then call object plugin
64197+ * ->safelink() method to do actual work, then delete safe-link on
64198+ * success.
64199+ */
64200+ inode = reiser4_iget(super, sdkey, 1);
64201+ if (!IS_ERR(inode)) {
64202+ file_plugin *fplug;
64203+
64204+ fplug = inode_file_plugin(inode);
64205+ assert("nikita-3428", fplug != NULL);
64206+ assert("", oid == get_inode_oid(inode));
64207+ if (fplug->safelink != NULL) {
64208+			/* reiser4_txn_restart_current is not necessary because
64209+			 * mounting is single-threaded. However, without it
64210+			 * deadlock detection code will complain (see
64211+			 * nikita-3361). */
64212+			reiser4_txn_restart_current();
64213+ result = fplug->safelink(inode, link, size);
64214+ } else {
64215+ warning("nikita-3430",
64216+ "Cannot handle safelink for %lli",
64217+ (unsigned long long)oid);
64218+			reiser4_print_key("key", sdkey);
64219+ result = 0;
64220+ }
64221+ if (result != 0) {
64222+ warning("nikita-3431",
64223+ "Error processing safelink for %lli: %i",
64224+ (unsigned long long)oid, result);
64225+ }
64226+ reiser4_iget_complete(inode);
64227+ iput(inode);
64228+ if (result == 0) {
64229+			result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
64230+ if (result == 0)
64231+ result =
64232+				    safe_link_del(reiser4_get_tree(super), oid, link);
64233+			safe_link_release(reiser4_get_tree(super));
64234+ /*
64235+			 * restart transaction: if there was a large number of
64236+			 * safe-links, their processing may fail to fit into a
64237+			 * single transaction.
64238+ */
64239+ if (result == 0)
64240+				reiser4_txn_restart_current();
64241+ }
64242+ } else
64243+ result = PTR_ERR(inode);
64244+ return result;
64245+}
64246+
64247+/*
64248+ * iterate over all safe-links in the file-system processing them one by one.
64249+ */
64250+int process_safelinks(struct super_block *super)
64251+{
64252+	struct safe_link_context ctx;
64253+ int result;
64254+
64255+ if (rofs_super(super))
64256+ /* do nothing on the read-only file system */
64257+ return 0;
64258+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
64259+ result = 0;
64260+ do {
64261+ result = safe_link_iter_next(&ctx);
64262+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
64263+ result = 0;
64264+ break;
64265+ }
64266+ if (result == 0)
64267+ result = process_safelink(super, ctx.link,
64268+ &ctx.sdkey, ctx.oid,
64269+ ctx.size);
64270+ } while (result == 0);
64271+ safe_link_iter_end(&ctx);
64272+ return result;
64273+}
64274+
64275+/* Make Linus happy.
64276+ Local variables:
64277+ c-indentation-style: "K&R"
64278+ mode-name: "LC"
64279+ c-basic-offset: 8
64280+ tab-width: 8
64281+ fill-column: 120
64282+ scroll-step: 1
64283+ End:
64284+*/
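
For orientation, an editorial sketch of the expected calling sequence for the
API above, modeled on the truncate case from the leading comment (variable
names and call sites are assumptions; the real callers live in the file
plugins):

	reiser4_tree *tree = reiser4_tree_by_inode(inode);

	/* transaction T1: reserve space, record the pending truncate */
	if (safe_link_grab(tree, BA_CAN_COMMIT) == 0)
		safe_link_add(inode, SAFE_TRUNCATE);
	safe_link_release(tree);

	/* ... the truncate itself may span several transactions ... */

	/* transaction T2: operation finished, drop the safe-link */
	if (safe_link_grab(tree, BA_CAN_COMMIT) == 0)
		safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
	safe_link_release(tree);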
64285diff -urN linux-2.6.22.orig/fs/reiser4/safe_link.h linux-2.6.22/fs/reiser4/safe_link.h
64286--- linux-2.6.22.orig/fs/reiser4/safe_link.h 1970-01-01 03:00:00.000000000 +0300
64287+++ linux-2.6.22/fs/reiser4/safe_link.h 2007-07-29 00:25:35.012731678 +0400
64288@@ -0,0 +1,29 @@
64289+/* Copyright 2003 by Hans Reiser, licensing governed by
64290+ * reiser4/README */
64291+
64292+/* Safe-links. See safe_link.c for details. */
64293+
64294+#if !defined( __FS_SAFE_LINK_H__ )
64295+#define __FS_SAFE_LINK_H__
64296+
64297+#include "tree.h"
64298+
64299+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
64300+void safe_link_release(reiser4_tree * tree);
64301+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
64302+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
64303+
64304+int process_safelinks(struct super_block *super);
64305+
64306+/* __FS_SAFE_LINK_H__ */
64307+#endif
64308+
64309+/* Make Linus happy.
64310+ Local variables:
64311+ c-indentation-style: "K&R"
64312+ mode-name: "LC"
64313+ c-basic-offset: 8
64314+ tab-width: 8
64315+ fill-column: 120
64316+ End:
64317+*/
64318diff -urN linux-2.6.22.orig/fs/reiser4/seal.c linux-2.6.22/fs/reiser4/seal.c
64319--- linux-2.6.22.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
64320+++ linux-2.6.22/fs/reiser4/seal.c 2007-07-29 00:25:35.012731678 +0400
64321@@ -0,0 +1,218 @@
64322+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64323+/* Seals implementation. */
64324+/* Seals are "weak" tree pointers. They are analogous to tree coords in
64325+   allowing one to bypass tree traversal. But normal usage of coords implies
64326+   that the node pointed to by a coord is locked, whereas seals don't keep a
64327+   lock (or even a reference) to the znode. Instead, each znode contains a
64328+   version number, increased on each znode modification. This version number
64329+   is copied into a seal when the seal is created. Later, one can "validate"
64330+   the seal by calling reiser4_seal_validate(). If the znode is in cache and
64331+   its version number is still the same, the seal is "pristine" and the coord
64332+   associated with it can be re-used immediately.
64333+
64334+   If, on the other hand, the znode is out of cache, or is obviously a
64335+   different one from the znode the seal was initially attached to (for example,
64336+   it is on a different level, or is being removed from the tree), the seal is
64337+   irreparably invalid ("burned") and tree traversal has to be repeated.
64338+
64339+   Otherwise, there is some hope that while the znode was modified (and the
64340+   seal was "broken" as a result), the key attached to the seal is still in the
64341+   node. This is checked by first comparing this key with the delimiting keys
64342+   of the node and, if the key is ok, doing an intra-node lookup.
64343+
64344+ Znode version is maintained in the following way:
64345+
64346+   there is a reiser4_tree.znode_epoch counter. Whenever a new znode is
64347+   created, znode_epoch is incremented and its new value is stored in the
64348+   ->version field of the new znode. Whenever a znode is dirtied (which means
64349+   it was probably modified), znode_epoch is also incremented and its new
64350+   value is stored in znode->version. This is done because just incrementing
64351+   znode->version on each update is not enough: it may happen that a znode
64352+   gets deleted, a new znode is allocated for the same disk block and gets the
64353+   same version counter, tricking the seal code into a false positive.
64354+*/
64355+
64356+#include "forward.h"
64357+#include "debug.h"
64358+#include "key.h"
64359+#include "coord.h"
64360+#include "seal.h"
64361+#include "plugin/item/item.h"
64362+#include "plugin/node/node.h"
64363+#include "jnode.h"
64364+#include "znode.h"
64365+#include "super.h"
64366+
64367+static znode *seal_node(const seal_t * seal);
64368+static int seal_matches(const seal_t * seal, znode * node);
64369+
64370+/* initialise seal. This can be called several times on the same seal. @coord
64371+ and @key can be NULL. */
64372+void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
64373+		       const coord_t * coord /* coord @seal will be
64374+					      * attached to */ ,
64375+		       const reiser4_key * key UNUSED_ARG /* key @seal will be
64376+							   * attached to */ )
64377+{
64378+ assert("nikita-1886", seal != NULL);
64379+ memset(seal, 0, sizeof *seal);
64380+ if (coord != NULL) {
64381+ znode *node;
64382+
64383+ node = coord->node;
64384+ assert("nikita-1987", node != NULL);
64385+ spin_lock_znode(node);
64386+ seal->version = node->version;
64387+ assert("nikita-1988", seal->version != 0);
64388+ seal->block = *znode_get_block(node);
64389+#if REISER4_DEBUG
64390+ seal->coord1 = *coord;
64391+ if (key != NULL)
64392+ seal->key = *key;
64393+#endif
64394+ spin_unlock_znode(node);
64395+ }
64396+}
64397+
64398+/* finish with seal */
64399+void reiser4_seal_done(seal_t * seal /* seal to clear */ )
64400+{
64401+ assert("nikita-1887", seal != NULL);
64402+ seal->version = 0;
64403+}
64404+
64405+/* true if seal was initialised */
64406+int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
64407+{
64408+ assert("nikita-1890", seal != NULL);
64409+ return seal->version != 0;
64410+}
64411+
64412+#if REISER4_DEBUG
64413+/* helper function for reiser4_seal_validate(). It checks that item at @coord
64414+ * has expected key. This is to detect cases where node was modified but wasn't
64415+ * marked dirty. */
64416+static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
64417+ const reiser4_key * k /* expected key */ )
64418+{
64419+ reiser4_key ukey;
64420+
64421+ return (coord->between != AT_UNIT) ||
64422+ /* FIXME-VS: we only can compare keys for items whose units
64423+ represent exactly one key */
64424+ ((coord_is_existing_unit(coord))
64425+ && (item_is_extent(coord)
64426+ || keyeq(k, unit_key_by_coord(coord, &ukey))))
64427+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
64428+ && keyge(k, unit_key_by_coord(coord, &ukey)));
64429+}
64430+#endif
64431+
64432+/* this is used by reiser4_seal_validate. It accepts the return value of
64433+ * longterm_lock_znode and returns 1 if it can be interpreted as seal
64434+ * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
64435+ * reiser4_seal_validate returns -E_REPEAT and the caller will redo the tree
64436+ * search. We cannot do this in longterm_lock_znode(), because sometimes we
64437+ * want to distinguish between -EINVAL and -E_REPEAT. */
64438+static int should_repeat(int return_code)
64439+{
64440+ return return_code == -EINVAL;
64441+}
64442+
64443+/* (re-)validate seal.
64444+
64445+ Checks whether seal is pristine, and try to revalidate it if possible.
64446+
64447+ If seal was burned, or broken irreparably, return -E_REPEAT.
64448+
64449+   NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key
64450+   we are looking for is in the range of keys covered by the sealed node, but
64451+   the item wasn't found by the node ->lookup() method. An alternative is to
64452+   return -ENOENT in this case, but this would complicate the callers' logic.
64453+
64454+*/
64455+int reiser4_seal_validate(seal_t * seal /* seal to validate */,
64456+			  coord_t * coord /* coord to validate against */,
64457+			  const reiser4_key * key /* key to validate against */,
64458+			  lock_handle * lh /* resulting lock handle */,
64459+			  znode_lock_mode mode /* lock node */,
64460+			  znode_lock_request request /* locking priority */)
64461+{
64462+ znode *node;
64463+ int result;
64464+
64465+ assert("nikita-1889", seal != NULL);
64466+	assert("nikita-1881", reiser4_seal_is_set(seal));
64467+ assert("nikita-1882", key != NULL);
64468+ assert("nikita-1883", coord != NULL);
64469+ assert("nikita-1884", lh != NULL);
64470+ assert("nikita-1885", keyeq(&seal->key, key));
64471+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
64472+
64473+ /* obtain znode by block number */
64474+ node = seal_node(seal);
64475+ if (node != NULL) {
64476+ /* znode was in cache, lock it */
64477+ result = longterm_lock_znode(lh, node, mode, request);
64478+ zput(node);
64479+ if (result == 0) {
64480+ if (seal_matches(seal, node)) {
64481+ /* if seal version and znode version
64482+ coincide */
64483+ ON_DEBUG(coord_update_v(coord));
64484+ assert("nikita-1990",
64485+ node == seal->coord1.node);
64486+ assert("nikita-1898",
64487+ WITH_DATA_RET(coord->node, 1,
64488+ check_seal_match(coord,
64489+ key)));
64490+ } else
64491+ result = RETERR(-E_REPEAT);
64492+ }
64493+ if (result != 0) {
64494+ if (should_repeat(result))
64495+ result = RETERR(-E_REPEAT);
64496+ /* unlock node on failure */
64497+ done_lh(lh);
64498+ }
64499+ } else {
64500+ /* znode wasn't in cache */
64501+ result = RETERR(-E_REPEAT);
64502+ }
64503+ return result;
64504+}
64505+
64506+/* helpers functions */
64507+
64508+/* obtain reference to znode seal points to, if in cache */
64509+static znode *seal_node(const seal_t * seal /* seal to query */ )
64510+{
64511+ assert("nikita-1891", seal != NULL);
64512+ return zlook(current_tree, &seal->block);
64513+}
64514+
64515+/* true if @seal version and @node version coincide */
64516+static int seal_matches(const seal_t * seal /* seal to check */ ,
64517+ znode * node /* node to check */ )
64518+{
64519+ int result;
64520+
64521+ assert("nikita-1991", seal != NULL);
64522+ assert("nikita-1993", node != NULL);
64523+
64524+ spin_lock_znode(node);
64525+ result = (seal->version == node->version);
64526+ spin_unlock_znode(node);
64527+ return result;
64528+}
64529+
64530+/* Make Linus happy.
64531+ Local variables:
64532+ c-indentation-style: "K&R"
64533+ mode-name: "LC"
64534+ c-basic-offset: 8
64535+ tab-width: 8
64536+ fill-column: 120
64537+ scroll-step: 1
64538+ End:
64539+*/
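
An editorial sketch of the seal lifecycle implemented above (the lock request
constant and the fallback path are assumptions for illustration):

	seal_t seal;

	/* a successful lookup left @coord and @key valid: */
	reiser4_seal_init(&seal, &coord, &key);
	/* ... long-term locks are dropped, time passes ... */
	if (reiser4_seal_validate(&seal, &coord, &key, &lh,
				  ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI) == 0) {
		/* seal held: @coord and @lh are usable without traversal */
	} else {
		/* -E_REPEAT: seal burned or broken, redo the lookup */
	}
	done_lh(&lh);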
64540diff -urN linux-2.6.22.orig/fs/reiser4/seal.h linux-2.6.22/fs/reiser4/seal.h
64541--- linux-2.6.22.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
64542+++ linux-2.6.22/fs/reiser4/seal.h 2007-07-29 00:25:35.012731678 +0400
64543@@ -0,0 +1,49 @@
64544+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64545+
64546+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
64547+
64548+#ifndef __SEAL_H__
64549+#define __SEAL_H__
64550+
64551+#include "forward.h"
64552+#include "debug.h"
64553+#include "dformat.h"
64554+#include "key.h"
64555+#include "coord.h"
64556+
64557+/* for __u?? types */
64558+/*#include <linux/types.h>*/
64559+
64560+/* seal. See comment at the top of seal.c */
64561+typedef struct seal_s {
64562+ /* version of znode recorder at the time of seal creation */
64563+ __u64 version;
64564+ /* block number of znode attached to this seal */
64565+ reiser4_block_nr block;
64566+#if REISER4_DEBUG
64567+ /* coord this seal is attached to. For debugging. */
64568+ coord_t coord1;
64569+ /* key this seal is attached to. For debugging. */
64570+ reiser4_key key;
64571+#endif
64572+} seal_t;
64573+
64574+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
64575+extern void reiser4_seal_done(seal_t *);
64576+extern int reiser4_seal_is_set(const seal_t *);
64577+extern int reiser4_seal_validate(seal_t *, coord_t *,
64578+ const reiser4_key *, lock_handle *,
64579+ znode_lock_mode mode, znode_lock_request request);
64580+
64581+/* __SEAL_H__ */
64582+#endif
64583+
64584+/* Make Linus happy.
64585+ Local variables:
64586+ c-indentation-style: "K&R"
64587+ mode-name: "LC"
64588+ c-basic-offset: 8
64589+ tab-width: 8
64590+ fill-column: 120
64591+ End:
64592+*/
64593diff -urN linux-2.6.22.orig/fs/reiser4/search.c linux-2.6.22/fs/reiser4/search.c
64594--- linux-2.6.22.orig/fs/reiser4/search.c 1970-01-01 03:00:00.000000000 +0300
64595+++ linux-2.6.22/fs/reiser4/search.c 2007-07-29 00:25:35.016732714 +0400
64596@@ -0,0 +1,1611 @@
64597+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64598+ * reiser4/README */
64599+
64600+#include "forward.h"
64601+#include "debug.h"
64602+#include "dformat.h"
64603+#include "key.h"
64604+#include "coord.h"
64605+#include "seal.h"
64606+#include "plugin/item/item.h"
64607+#include "plugin/node/node.h"
64608+#include "plugin/plugin.h"
64609+#include "jnode.h"
64610+#include "znode.h"
64611+#include "block_alloc.h"
64612+#include "tree_walk.h"
64613+#include "tree.h"
64614+#include "reiser4.h"
64615+#include "super.h"
64616+#include "inode.h"
64617+
64618+#include <linux/slab.h>
64619+
64620+static const char *bias_name(lookup_bias bias);
64621+
64622+/* tree searching algorithm, intranode searching algorithms are in
64623+ plugin/node/ */
64624+
64625+/* tree lookup cache
64626+ *
64627+ * The coord-by-key cache consists of a small list of recently accessed nodes
64628+ * maintained according to the LRU discipline. Before doing a real top-to-down
64629+ * tree traversal this cache is scanned for nodes that can contain the
64630+ * requested key.
64631+ *
64632+ * The efficiency of coord cache depends heavily on locality of reference for
64633+ * tree accesses. Our user level simulations show reasonably good hit ratios
64634+ * for coord cache under most loads so far.
64635+ */
64636+
64637+/* Initialise coord cache slot */
64638+static void cbk_cache_init_slot(cbk_cache_slot *slot)
64639+{
64640+ assert("nikita-345", slot != NULL);
64641+
64642+ INIT_LIST_HEAD(&slot->lru);
64643+ slot->node = NULL;
64644+}
64645+
64646+/* Initialize coord cache */
64647+int cbk_cache_init(cbk_cache *cache /* cache to init */ )
64648+{
64649+ int i;
64650+
64651+ assert("nikita-346", cache != NULL);
64652+
64653+ cache->slot =
64654+	    kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
64655+		    reiser4_ctx_gfp_mask_get());
64656+ if (cache->slot == NULL)
64657+ return RETERR(-ENOMEM);
64658+
64659+ INIT_LIST_HEAD(&cache->lru);
64660+ for (i = 0; i < cache->nr_slots; ++i) {
64661+ cbk_cache_init_slot(cache->slot + i);
64662+ list_add_tail(&((cache->slot + i)->lru), &cache->lru);
64663+ }
64664+ rwlock_init(&cache->guard);
64665+ return 0;
64666+}
64667+
64668+/* free cbk cache data */
64669+void cbk_cache_done(cbk_cache * cache /* cache to release */ )
64670+{
64671+ assert("nikita-2493", cache != NULL);
64672+ if (cache->slot != NULL) {
64673+ kfree(cache->slot);
64674+ cache->slot = NULL;
64675+ }
64676+}
64677+
64678+/* macro to iterate over all cbk cache slots */
64679+#define for_all_slots(cache, slot) \
64680+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
64681+ &(cache)->lru != &(slot)->lru; \
64682+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
64683+
64684+#if REISER4_DEBUG
64685+/* this function assures that [cbk-cache-invariant] invariant holds */
64686+static int cbk_cache_invariant(const cbk_cache *cache)
64687+{
64688+ cbk_cache_slot *slot;
64689+ int result;
64690+ int unused;
64691+
64692+ if (cache->nr_slots == 0)
64693+ return 1;
64694+
64695+ assert("nikita-2469", cache != NULL);
64696+ unused = 0;
64697+ result = 1;
64698+ read_lock(&((cbk_cache *)cache)->guard);
64699+ for_all_slots(cache, slot) {
64700+ /* in LRU first go all `used' slots followed by `unused' */
64701+ if (unused && (slot->node != NULL))
64702+ result = 0;
64703+ if (slot->node == NULL)
64704+ unused = 1;
64705+ else {
64706+ cbk_cache_slot *scan;
64707+
64708+ /* all cached nodes are different */
64709+ scan = slot;
64710+ while (result) {
64711+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
64712+ if (&cache->lru == &scan->lru)
64713+ break;
64714+ if (slot->node == scan->node)
64715+ result = 0;
64716+ }
64717+ }
64718+ if (!result)
64719+ break;
64720+ }
64721+ read_unlock(&((cbk_cache *)cache)->guard);
64722+ return result;
64723+}
64724+
64725+#endif
64726+
64727+/* Remove references, if any, to @node from coord cache */
64728+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
64729+ reiser4_tree * tree /* tree to remove node from */ )
64730+{
64731+ cbk_cache_slot *slot;
64732+ cbk_cache *cache;
64733+ int i;
64734+
64735+ assert("nikita-350", node != NULL);
64736+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
64737+
64738+ cache = &tree->cbk_cache;
64739+ assert("nikita-2470", cbk_cache_invariant(cache));
64740+
64741+ write_lock(&(cache->guard));
64742+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
64743+ if (slot->node == node) {
64744+ list_move_tail(&slot->lru, &cache->lru);
64745+ slot->node = NULL;
64746+ break;
64747+ }
64748+ }
64749+ write_unlock(&(cache->guard));
64750+ assert("nikita-2471", cbk_cache_invariant(cache));
64751+}
64752+
64753+/* add to the cbk-cache in the "tree" information about "node". This
64754+ can actually be update of existing slot in a cache. */
64755+static void cbk_cache_add(const znode *node /* node to add to the cache */ )
64756+{
64757+ cbk_cache *cache;
64758+ cbk_cache_slot *slot;
64759+ int i;
64760+
64761+ assert("nikita-352", node != NULL);
64762+
64763+ cache = &znode_get_tree(node)->cbk_cache;
64764+ assert("nikita-2472", cbk_cache_invariant(cache));
64765+
64766+ if (cache->nr_slots == 0)
64767+ return;
64768+
64769+ write_lock(&(cache->guard));
64770+ /* find slot to update/add */
64771+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
64772+ /* oops, this node is already in a cache */
64773+ if (slot->node == node)
64774+ break;
64775+ }
64776+ /* if all slots are used, reuse least recently used one */
64777+ if (i == cache->nr_slots) {
64778+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
64779+ slot->node = (znode *) node;
64780+ }
64781+ list_move(&slot->lru, &cache->lru);
64782+ write_unlock(&(cache->guard));
64783+ assert("nikita-2473", cbk_cache_invariant(cache));
64784+}
64785+
64786+static int setup_delimiting_keys(cbk_handle * h);
64787+static lookup_result coord_by_handle(cbk_handle * handle);
64788+static lookup_result traverse_tree(cbk_handle * h);
64789+static int cbk_cache_search(cbk_handle * h);
64790+
64791+static level_lookup_result cbk_level_lookup(cbk_handle * h);
64792+static level_lookup_result cbk_node_lookup(cbk_handle * h);
64793+
64794+/* helper functions */
64795+
64796+static void update_stale_dk(reiser4_tree * tree, znode * node);
64797+
64798+/* release parent node during traversal */
64799+static void put_parent(cbk_handle * h);
64800+/* check consistency of fields */
64801+static int sanity_check(cbk_handle * h);
64802+/* release resources in handle */
64803+static void hput(cbk_handle * h);
64804+
64805+static level_lookup_result search_to_left(cbk_handle * h);
64806+
64807+/* pack numerous (numberous I should say) arguments of coord_by_key() into
64808+ * cbk_handle */
64809+static cbk_handle *cbk_pack(cbk_handle * handle,
64810+ reiser4_tree * tree,
64811+ const reiser4_key * key,
64812+ coord_t * coord,
64813+ lock_handle * active_lh,
64814+ lock_handle * parent_lh,
64815+ znode_lock_mode lock_mode,
64816+ lookup_bias bias,
64817+ tree_level lock_level,
64818+ tree_level stop_level,
64819+ __u32 flags, ra_info_t * info)
64820+{
64821+ memset(handle, 0, sizeof *handle);
64822+
64823+ handle->tree = tree;
64824+ handle->key = key;
64825+ handle->lock_mode = lock_mode;
64826+ handle->bias = bias;
64827+ handle->lock_level = lock_level;
64828+ handle->stop_level = stop_level;
64829+ handle->coord = coord;
64830+ /* set flags. See comment in tree.h:cbk_flags */
64831+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
64832+
64833+ handle->active_lh = active_lh;
64834+ handle->parent_lh = parent_lh;
64835+ handle->ra_info = info;
64836+ return handle;
64837+}
64838+
64839+/* main tree lookup procedure
64840+
64841+ Check coord cache. If key we are looking for is not found there, call cbk()
64842+ to do real tree traversal.
64843+
64844+ As we have extents on the twig level, @lock_level and @stop_level can
64845+ be different from LEAF_LEVEL and each other.
64846+
64847+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
64848+ long term locks) while calling this.
64849+*/
64850+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
64851+ * in. Usually this tree is
64852+ * part of file-system
64853+ * super-block */ ,
64854+ const reiser4_key * key /* key to look for */ ,
64855+ coord_t * coord /* where to store found
64856+ * position in a tree. Fields
64857+ * in "coord" are only valid if
64858+ * coord_by_key() returned
64859+ * "CBK_COORD_FOUND" */ ,
64860+ lock_handle * lh, /* resulting lock handle */
64861+ znode_lock_mode lock_mode /* type of lookup we
64862+ * want on node. Pass
64863+ * ZNODE_READ_LOCK here
64864+ * if you only want to
64865+ * read item found and
64866+ * ZNODE_WRITE_LOCK if
64867+ * you want to modify
64868+ * it */ ,
64869+ lookup_bias bias /* what to return if coord
64870+ * with exactly the @key is
64871+ * not in the tree */ ,
64872+ tree_level lock_level /* tree level where to start
64873+ * taking @lock type of
64874+ * locks */ ,
64875+ tree_level stop_level /* tree level to stop. Pass
64876+ * LEAF_LEVEL or TWIG_LEVEL
64877+ * here Item being looked
64878+ * for has to be between
64879+ * @lock_level and
64880+ * @stop_level, inclusive */ ,
64881+ __u32 flags /* search flags */ ,
64882+ ra_info_t *
64883+ info
64884+ /* information about desired tree traversal readahead */
64885+ )
64886+{
64887+ cbk_handle handle;
64888+ lock_handle parent_lh;
64889+ lookup_result result;
64890+
64891+ init_lh(lh);
64892+ init_lh(&parent_lh);
64893+
64894+	assert("nikita-3023", reiser4_schedulable());
64895+
64896+ assert("nikita-353", tree != NULL);
64897+ assert("nikita-354", key != NULL);
64898+ assert("nikita-355", coord != NULL);
64899+ assert("nikita-356", (bias == FIND_EXACT)
64900+ || (bias == FIND_MAX_NOT_MORE_THAN));
64901+ assert("nikita-357", stop_level >= LEAF_LEVEL);
64902+ /* no locks can be held during tree traversal */
64903+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
64904+
64905+ cbk_pack(&handle,
64906+ tree,
64907+ key,
64908+ coord,
64909+ lh,
64910+ &parent_lh,
64911+ lock_mode, bias, lock_level, stop_level, flags, info);
64912+
64913+ result = coord_by_handle(&handle);
64914+ assert("nikita-3247",
64915+ ergo(!IS_CBKERR(result), coord->node == lh->node));
64916+ return result;
64917+}
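/*
 * Editorial usage sketch (not part of the patch): a typical exact lookup at
 * the leaf level. The zero flags value and the declarations are assumptions
 * for illustration; note that coord_by_key() initializes @lh itself.
 *
 *	coord_t coord;
 *	lock_handle lh;
 *
 *	if (coord_by_key(current_tree, &key, &coord, &lh, ZNODE_READ_LOCK,
 *			 FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, 0,
 *			 NULL) == CBK_COORD_FOUND) {
 *		... the item is at @coord, its node is locked via @lh ...
 *	}
 *	done_lh(&lh);
 */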
64918+
64919+/* like coord_by_key(), but starts traversal from vroot of @object rather than
64920+ * from tree root. */
64921+lookup_result reiser4_object_lookup(struct inode * object,
64922+				    const reiser4_key * key,
64923+				    coord_t * coord,
64924+				    lock_handle * lh,
64925+				    znode_lock_mode lock_mode,
64926+				    lookup_bias bias,
64927+				    tree_level lock_level,
64928+				    tree_level stop_level, __u32 flags,
64929+				    ra_info_t * info)
64930+{
64931+ cbk_handle handle;
64932+ lock_handle parent_lh;
64933+ lookup_result result;
64934+
64935+ init_lh(lh);
64936+ init_lh(&parent_lh);
64937+
64938+	assert("nikita-3023", reiser4_schedulable());
64939+
64940+ assert("nikita-354", key != NULL);
64941+ assert("nikita-355", coord != NULL);
64942+ assert("nikita-356", (bias == FIND_EXACT)
64943+ || (bias == FIND_MAX_NOT_MORE_THAN));
64944+ assert("nikita-357", stop_level >= LEAF_LEVEL);
64945+ /* no locks can be held during tree search by key */
64946+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
64947+
64948+ cbk_pack(&handle,
64949+		 object != NULL ? reiser4_tree_by_inode(object) : current_tree,
64950+ key,
64951+ coord,
64952+ lh,
64953+ &parent_lh,
64954+ lock_mode, bias, lock_level, stop_level, flags, info);
64955+ handle.object = object;
64956+
64957+ result = coord_by_handle(&handle);
64958+ assert("nikita-3247",
64959+ ergo(!IS_CBKERR(result), coord->node == lh->node));
64960+ return result;
64961+}
64962+
64963+/* lookup by cbk_handle. Common part of coord_by_key() and
64964+   reiser4_object_lookup(). */
64965+static lookup_result coord_by_handle(cbk_handle * handle)
64966+{
64967+ /*
 64968+	 * first check cbk_cache (which is a look-aside cache for our tree)
 64969+	 * and if this fails, start traversal.
64970+ */
64971+ /* first check whether "key" is in cache of recent lookups. */
64972+ if (cbk_cache_search(handle) == 0)
64973+ return handle->result;
64974+ else
64975+ return traverse_tree(handle);
64976+}
64977+
64978+/* Execute actor for each item (or unit, depending on @through_units_p),
64979+ starting from @coord, right-ward, until either:
64980+
64981+ - end of the tree is reached
64982+ - unformatted node is met
64983+ - error occurred
64984+ - @actor returns 0 or less
64985+
64986+ Error code, or last actor return value is returned.
64987+
 64988+   This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move
 64989+   through a sequence of entries with identical keys and the like.
64990+*/
64991+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
64992+ coord_t * coord /* coord to start from */ ,
64993+ lock_handle * lh /* lock handle to start with and to
64994+ * update along the way */ ,
64995+ tree_iterate_actor_t actor /* function to call on each
64996+ * item/unit */ ,
64997+ void *arg /* argument to pass to @actor */ ,
64998+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
64999+ int through_units_p /* call @actor on each item or on
65000+ * each unit */ )
65001+{
65002+ int result;
65003+
65004+ assert("nikita-1143", tree != NULL);
65005+ assert("nikita-1145", coord != NULL);
65006+ assert("nikita-1146", lh != NULL);
65007+ assert("nikita-1147", actor != NULL);
65008+
65009+ result = zload(coord->node);
65010+ coord_clear_iplug(coord);
65011+ if (result != 0)
65012+ return result;
65013+ if (!coord_is_existing_unit(coord)) {
65014+ zrelse(coord->node);
65015+ return -ENOENT;
65016+ }
65017+ while ((result = actor(tree, coord, lh, arg)) > 0) {
65018+ /* move further */
65019+ if ((through_units_p && coord_next_unit(coord)) ||
65020+ (!through_units_p && coord_next_item(coord))) {
65021+ do {
65022+ lock_handle couple;
65023+
65024+ /* move to the next node */
65025+ init_lh(&couple);
65026+ result =
65027+ reiser4_get_right_neighbor(&couple,
65028+ coord->node,
65029+ (int)mode,
65030+ GN_CAN_USE_UPPER_LEVELS);
65031+ zrelse(coord->node);
65032+ if (result == 0) {
65033+
65034+ result = zload(couple.node);
65035+ if (result != 0) {
65036+ done_lh(&couple);
65037+ return result;
65038+ }
65039+
65040+ coord_init_first_unit(coord,
65041+ couple.node);
65042+ done_lh(lh);
65043+ move_lh(lh, &couple);
65044+ } else
65045+ return result;
65046+ } while (node_is_empty(coord->node));
65047+ }
65048+
65049+ assert("nikita-1149", coord_is_existing_unit(coord));
65050+ }
65051+ zrelse(coord->node);
65052+ return result;
65053+}
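
To make the actor protocol concrete, a minimal sketch of a tree_iterate_actor_t; the name visit_n_units and the counting scheme are illustrative assumptions, with the signature inferred from the call actor(tree, coord, lh, arg) above:

	/* Hypothetical actor: visit at most *(int *)arg units, then stop.
	 * Returning > 0 tells reiser4_iterate_tree() to keep scanning
	 * rightward; returning <= 0 (or an error) terminates the iteration. */
	static int visit_n_units(reiser4_tree *tree, coord_t *coord,
				 lock_handle *lh, void *arg)
	{
		int *budget = arg;

		/* ... inspect the unit at @coord here ... */
		return --*budget > 0;
	}

A caller would pass the address of a positive counter as @arg and a non-zero through_units_p to walk unit by unit.
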
65054+
65055+/* return locked uber znode for @tree */
65056+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65057+ znode_lock_request pri, lock_handle * lh)
65058+{
65059+ int result;
65060+
65061+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
65062+ return result;
65063+}
65064+
65065+/* true if @key is strictly within @node
65066+
 65067+   we are looking for a possibly non-unique key and the item is at the edge
 65068+   of @node. Maybe it is in the neighbor.
65069+*/
65070+static int znode_contains_key_strict(znode * node /* node to check key
65071+ * against */ ,
65072+ const reiser4_key *
65073+ key /* key to check */ ,
65074+ int isunique)
65075+{
65076+ int answer;
65077+
65078+ assert("nikita-1760", node != NULL);
65079+ assert("nikita-1722", key != NULL);
65080+
65081+ if (keyge(key, &node->rd_key))
65082+ return 0;
65083+
65084+ answer = keycmp(&node->ld_key, key);
65085+
65086+ if (isunique)
65087+ return answer != GREATER_THAN;
65088+ else
65089+ return answer == LESS_THAN;
65090+}
65091+
65092+/*
65093+ * Virtual Root (vroot) code.
65094+ *
 65095+ * For a given file system object (e.g., regular file or directory) let's
 65096+ * define its "virtual root" as the lowest node in the tree (that is, the
 65097+ * one furthest from the tree root) such that all body items of said
 65098+ * object are located in a tree rooted at this node.
65099+ *
 65100+ * Once the vroot of an object is found, all tree lookups for items within
 65101+ * the body of this object ("object lookups") can be started from its vroot
 65102+ * rather than from the real root. This has the following advantages:
65103+ *
65104+ * 1. amount of nodes traversed during lookup (and, hence, amount of
65105+ * key comparisons made) decreases, and
65106+ *
 65107+ *     2. contention on the tree root is decreased. The latter was actually
 65108+ *        the motivating reason behind vroot, because the spin lock of the
 65109+ *        root node, which is taken when acquiring a long-term lock on it,
 65110+ *        is the hottest lock in reiser4.
65111+ *
65112+ * How to find vroot.
65113+ *
65114+ * When vroot of object F is not yet determined, all object lookups start
65115+ * from the root of the tree. At each tree level during traversal we have
65116+ * a node N such that a key we are looking for (which is the key inside
65117+ * object's body) is located within N. In function handle_vroot() called
65118+ * from cbk_level_lookup() we check whether N is possible vroot for
65119+ * F. Check is trivial---if neither leftmost nor rightmost item of N
65120+ * belongs to F (and we already have helpful ->owns_item() method of
65121+ * object plugin for this), then N is possible vroot of F. This, of
65122+ * course, relies on the assumption that each object occupies contiguous
65123+ * range of keys in the tree.
65124+ *
65125+ * Thus, traversing tree downward and checking each node as we go, we can
65126+ * find lowest such node, which, by definition, is vroot.
65127+ *
65128+ * How to track vroot.
65129+ *
65130+ * Nohow. If actual vroot changes, next object lookup will just restart
65131+ * from the actual tree root, refreshing object's vroot along the way.
65132+ *
65133+ */
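
A sketch of the payoff, assuming callers go through the reiser4_object_lookup() declared above (the wrapper name body_lookup is hypothetical): once @inode has a cached vroot, the traversal skips all tree levels above it.

	/* Hypothetical helper: look up a body item of @inode; the traversal
	 * starts from the inode's vroot when one is known, and falls back
	 * to the real tree root otherwise. */
	static lookup_result body_lookup(struct inode *inode,
					 const reiser4_key *key,
					 coord_t *coord, lock_handle *lh)
	{
		return reiser4_object_lookup(inode, key, coord, lh,
					     ZNODE_READ_LOCK, FIND_EXACT,
					     LEAF_LEVEL, LEAF_LEVEL,
					     CBK_UNIQUE, NULL /* no ra_info */);
	}
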
65134+
65135+/*
65136+ * Check whether @node is possible vroot of @object.
65137+ */
65138+static void handle_vroot(struct inode *object, znode * node)
65139+{
65140+ file_plugin *fplug;
65141+ coord_t coord;
65142+
65143+ fplug = inode_file_plugin(object);
65144+ assert("nikita-3353", fplug != NULL);
65145+ assert("nikita-3354", fplug->owns_item != NULL);
65146+
65147+ if (unlikely(node_is_empty(node)))
65148+ return;
65149+
65150+ coord_init_first_unit(&coord, node);
65151+ /*
65152+ * if leftmost item of @node belongs to @object, we cannot be sure
65153+ * that @node is vroot of @object, because, some items of @object are
65154+ * probably in the sub-tree rooted at the left neighbor of @node.
65155+ */
65156+ if (fplug->owns_item(object, &coord))
65157+ return;
65158+ coord_init_last_unit(&coord, node);
65159+ /* mutatis mutandis for the rightmost item */
65160+ if (fplug->owns_item(object, &coord))
65161+ return;
65162+ /* otherwise, @node is possible vroot of @object */
65163+ inode_set_vroot(object, node);
65164+}
65165+
65166+/*
 65167+ * helper function used by traverse_tree() to start tree traversal not from the
65168+ * tree root, but from @h->object's vroot, if possible.
65169+ */
65170+static int prepare_object_lookup(cbk_handle * h)
65171+{
65172+ znode *vroot;
65173+ int result;
65174+
65175+ vroot = inode_get_vroot(h->object);
65176+ if (vroot == NULL) {
65177+ /*
65178+ * object doesn't have known vroot, start from real tree root.
65179+ */
65180+ return LOOKUP_CONT;
65181+ }
65182+
65183+ h->level = znode_get_level(vroot);
65184+ /* take a long-term lock on vroot */
65185+ h->result = longterm_lock_znode(h->active_lh, vroot,
65186+ cbk_lock_mode(h->level, h),
65187+ ZNODE_LOCK_LOPRI);
65188+ result = LOOKUP_REST;
65189+ if (h->result == 0) {
65190+ int isunique;
65191+ int inside;
65192+
65193+ isunique = h->flags & CBK_UNIQUE;
65194+ /* check that key is inside vroot */
65195+ read_lock_dk(h->tree);
65196+ inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65197+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65198+ read_unlock_dk(h->tree);
65199+ if (inside) {
65200+ h->result = zload(vroot);
65201+ if (h->result == 0) {
65202+ /* search for key in vroot. */
65203+ result = cbk_node_lookup(h);
65204+ zrelse(vroot); /*h->active_lh->node); */
65205+ if (h->active_lh->node != vroot) {
65206+ result = LOOKUP_REST;
65207+ } else if (result == LOOKUP_CONT) {
65208+ move_lh(h->parent_lh, h->active_lh);
65209+ h->flags &= ~CBK_DKSET;
65210+ }
65211+ }
65212+ }
 65213+	}
65214+
65215+ zput(vroot);
65216+
65217+ if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65218+ hput(h);
65219+ return result;
65220+}
65221+
65222+/* main function that handles common parts of tree traversal: starting
65223+ (fake znode handling), restarts, error handling, completion */
65224+static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65225+{
65226+ int done;
65227+ int iterations;
65228+ int vroot_used;
65229+
65230+ assert("nikita-365", h != NULL);
65231+ assert("nikita-366", h->tree != NULL);
65232+ assert("nikita-367", h->key != NULL);
65233+ assert("nikita-368", h->coord != NULL);
65234+ assert("nikita-369", (h->bias == FIND_EXACT)
65235+ || (h->bias == FIND_MAX_NOT_MORE_THAN));
65236+ assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65237+ assert("nikita-2949", !(h->flags & CBK_DKSET));
65238+ assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65239+
65240+ done = 0;
65241+ iterations = 0;
65242+ vroot_used = 0;
65243+
65244+ /* loop for restarts */
65245+ restart:
65246+
 65247+	assert("nikita-3024", reiser4_schedulable());
65248+
65249+ h->result = CBK_COORD_FOUND;
65250+ /* connect_znode() needs it */
65251+ h->ld_key = *reiser4_min_key();
65252+ h->rd_key = *reiser4_max_key();
65253+ h->flags |= CBK_DKSET;
65254+ h->error = NULL;
65255+
65256+ if (!vroot_used && h->object != NULL) {
65257+ vroot_used = 1;
65258+ done = prepare_object_lookup(h);
65259+ if (done == LOOKUP_REST) {
65260+ goto restart;
65261+ } else if (done == LOOKUP_DONE)
65262+ return h->result;
65263+ }
65264+ if (h->parent_lh->node == NULL) {
65265+ done =
65266+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
65267+ h->parent_lh);
65268+
65269+ assert("nikita-1637", done != -E_DEADLOCK);
65270+
65271+ h->block = h->tree->root_block;
65272+ h->level = h->tree->height;
65273+ h->coord->node = h->parent_lh->node;
65274+
65275+ if (done != 0)
65276+ return done;
65277+ }
65278+
65279+ /* loop descending a tree */
65280+ while (!done) {
65281+
65282+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
65283+ IS_POW(iterations))) {
65284+ warning("nikita-1481", "Too many iterations: %i",
65285+ iterations);
 65286+			reiser4_print_key("key", h->key);
65287+ ++iterations;
65288+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
65289+ h->error =
65290+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
65291+ h->result = RETERR(-EIO);
65292+ break;
65293+ }
65294+ switch (cbk_level_lookup(h)) {
65295+ case LOOKUP_CONT:
65296+ move_lh(h->parent_lh, h->active_lh);
65297+ continue;
65298+ default:
65299+ wrong_return_value("nikita-372", "cbk_level");
65300+ case LOOKUP_DONE:
65301+ done = 1;
65302+ break;
65303+ case LOOKUP_REST:
65304+ hput(h);
65305+ /* deadlock avoidance is normal case. */
65306+ if (h->result != -E_DEADLOCK)
65307+ ++iterations;
 65308+			reiser4_preempt_point();
65309+ goto restart;
65310+ }
65311+ }
65312+ /* that's all. The rest is error handling */
65313+ if (unlikely(h->error != NULL)) {
65314+ warning("nikita-373", "%s: level: %i, "
65315+ "lock_level: %i, stop_level: %i "
65316+ "lock_mode: %s, bias: %s",
65317+ h->error, h->level, h->lock_level, h->stop_level,
65318+ lock_mode_name(h->lock_mode), bias_name(h->bias));
65319+ reiser4_print_address("block", &h->block);
 65320+		reiser4_print_key("key", h->key);
65321+ print_coord_content("coord", h->coord);
65322+ }
65323+ /* `unlikely' error case */
65324+ if (unlikely(IS_CBKERR(h->result))) {
65325+ /* failure. do cleanup */
65326+ hput(h);
65327+ } else {
65328+ assert("nikita-1605", WITH_DATA_RET
65329+ (h->coord->node, 1,
65330+ ergo((h->result == CBK_COORD_FOUND) &&
65331+ (h->bias == FIND_EXACT) &&
65332+ (!node_is_empty(h->coord->node)),
65333+ coord_is_existing_item(h->coord))));
65334+ }
65335+ return h->result;
65336+}
65337+
65338+/* find delimiting keys of child
65339+
65340+ Determine left and right delimiting keys for child pointed to by
65341+ @parent_coord.
65342+
65343+*/
65344+static void find_child_delimiting_keys(znode * parent /* parent znode, passed
65345+ * locked */ ,
65346+ const coord_t * parent_coord /* coord where
65347+ * pointer to
65348+ * child is
65349+ * stored */ ,
65350+ reiser4_key * ld /* where to store left
65351+ * delimiting key */ ,
65352+ reiser4_key * rd /* where to store right
65353+ * delimiting key */ )
65354+{
65355+ coord_t neighbor;
65356+
65357+ assert("nikita-1484", parent != NULL);
65358+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
65359+
65360+ coord_dup(&neighbor, parent_coord);
65361+
65362+ if (neighbor.between == AT_UNIT)
65363+ /* imitate item ->lookup() behavior. */
65364+ neighbor.between = AFTER_UNIT;
65365+
65366+ if (coord_set_to_left(&neighbor) == 0)
65367+ unit_key_by_coord(&neighbor, ld);
65368+ else {
65369+ assert("nikita-14851", 0);
65370+ *ld = *znode_get_ld_key(parent);
65371+ }
65372+
65373+ coord_dup(&neighbor, parent_coord);
65374+ if (neighbor.between == AT_UNIT)
65375+ neighbor.between = AFTER_UNIT;
65376+ if (coord_set_to_right(&neighbor) == 0)
65377+ unit_key_by_coord(&neighbor, rd);
65378+ else
65379+ *rd = *znode_get_rd_key(parent);
65380+}
65381+
65382+/*
65383+ * setup delimiting keys for a child
65384+ *
65385+ * @parent parent node
65386+ *
65387+ * @coord location in @parent where pointer to @child is
65388+ *
65389+ * @child child node
65390+ */
65391+int
65392+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
65393+{
65394+ reiser4_tree *tree;
65395+
65396+ assert("nikita-2952",
65397+ znode_get_level(parent) == znode_get_level(coord->node));
65398+
65399+ /* fast check without taking dk lock. This is safe, because
65400+ * JNODE_DKSET is never cleared once set. */
65401+ if (!ZF_ISSET(child, JNODE_DKSET)) {
65402+ tree = znode_get_tree(parent);
65403+ write_lock_dk(tree);
65404+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
65405+ find_child_delimiting_keys(parent, coord,
65406+ &child->ld_key,
65407+ &child->rd_key);
65408+ ON_DEBUG(child->ld_key_version =
65409+ atomic_inc_return(&delim_key_version);
65410+ child->rd_key_version =
65411+ atomic_inc_return(&delim_key_version););
65412+ ZF_SET(child, JNODE_DKSET);
65413+ }
65414+ write_unlock_dk(tree);
65415+ return 1;
65416+ }
65417+ return 0;
65418+}
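
set_child_delimiting_keys() is an instance of double-checked initialization. Stripped to its skeleton (schematic only, not standalone code):

	if (!ZF_ISSET(child, JNODE_DKSET)) {		/* lock-free fast path */
		write_lock_dk(tree);
		if (!ZF_ISSET(child, JNODE_DKSET)) {	/* re-check under the lock */
			/* ... fill child->ld_key and child->rd_key ... */
			ZF_SET(child, JNODE_DKSET);	/* publish last */
		}
		write_unlock_dk(tree);
	}

The unlocked first check is safe only because JNODE_DKSET transitions one way: once set, it is never cleared.
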
65419+
 65420+/* Perform tree lookup at one level. This is called from traverse_tree(),
 65421+   the function that drives lookup through the tree and calls
 65422+   cbk_node_lookup() to perform lookup within one node.
 65423+
 65424+   See comments in the code.
65425+*/
65426+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
65427+{
65428+ int ret;
65429+ int setdk;
65430+ int ldkeyset = 0;
65431+ reiser4_key ldkey;
65432+ reiser4_key key;
65433+ znode *active;
65434+
 65435+	assert("nikita-3025", reiser4_schedulable());
65436+
65437+ /* acquire reference to @active node */
65438+ active =
65439+ zget(h->tree, &h->block, h->parent_lh->node, h->level,
65440+ reiser4_ctx_gfp_mask_get());
65441+
65442+ if (IS_ERR(active)) {
65443+ h->result = PTR_ERR(active);
65444+ return LOOKUP_DONE;
65445+ }
65446+
65447+ /* lock @active */
65448+ h->result = longterm_lock_znode(h->active_lh,
65449+ active,
65450+ cbk_lock_mode(h->level, h),
65451+ ZNODE_LOCK_LOPRI);
65452+ /* longterm_lock_znode() acquires additional reference to znode (which
65453+ will be later released by longterm_unlock_znode()). Release
65454+ reference acquired by zget().
65455+ */
65456+ zput(active);
65457+ if (unlikely(h->result != 0))
65458+ goto fail_or_restart;
65459+
65460+ setdk = 0;
65461+ /* if @active is accessed for the first time, setup delimiting keys on
65462+ it. Delimiting keys are taken from the parent node. See
65463+ setup_delimiting_keys() for details.
65464+ */
65465+ if (h->flags & CBK_DKSET) {
65466+ setdk = setup_delimiting_keys(h);
65467+ h->flags &= ~CBK_DKSET;
65468+ } else {
65469+ znode *parent;
65470+
65471+ parent = h->parent_lh->node;
65472+ h->result = zload(parent);
65473+ if (unlikely(h->result != 0))
65474+ goto fail_or_restart;
65475+
65476+ if (!ZF_ISSET(active, JNODE_DKSET))
65477+ setdk = set_child_delimiting_keys(parent,
65478+ h->coord, active);
65479+ else {
65480+ read_lock_dk(h->tree);
65481+ find_child_delimiting_keys(parent, h->coord, &ldkey,
65482+ &key);
65483+ read_unlock_dk(h->tree);
65484+ ldkeyset = 1;
65485+ }
65486+ zrelse(parent);
65487+ }
65488+
 65489+	/* this is an ugly kludge. Reminder: it is necessary, because
65490+ ->lookup() method returns coord with ->between field probably set
65491+ to something different from AT_UNIT.
65492+ */
65493+ h->coord->between = AT_UNIT;
65494+
65495+ if (znode_just_created(active) && (h->coord->node != NULL)) {
65496+ write_lock_tree(h->tree);
65497+ /* if we are going to load znode right now, setup
65498+ ->in_parent: coord where pointer to this node is stored in
65499+ parent.
65500+ */
65501+ coord_to_parent_coord(h->coord, &active->in_parent);
65502+ write_unlock_tree(h->tree);
65503+ }
65504+
65505+ /* check connectedness without holding tree lock---false negatives
65506+ * will be re-checked by connect_znode(), and false positives are
65507+ * impossible---@active cannot suddenly turn into unconnected
65508+ * state. */
65509+ if (!znode_is_connected(active)) {
65510+ h->result = connect_znode(h->coord, active);
65511+ if (unlikely(h->result != 0)) {
65512+ put_parent(h);
65513+ goto fail_or_restart;
65514+ }
65515+ }
65516+
65517+ jload_prefetch(ZJNODE(active));
65518+
65519+ if (setdk)
65520+ update_stale_dk(h->tree, active);
65521+
65522+ /* put_parent() cannot be called earlier, because connect_znode()
65523+ assumes parent node is referenced; */
65524+ put_parent(h);
65525+
65526+ if ((!znode_contains_key_lock(active, h->key) &&
65527+ (h->flags & CBK_TRUST_DK))
65528+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
65529+ /* 1. key was moved out of this node while this thread was
65530+ waiting for the lock. Restart. More elaborate solution is
65531+ to determine where key moved (to the left, or to the right)
65532+ and try to follow it through sibling pointers.
65533+
65534+ 2. or, node itself is going to be removed from the
65535+ tree. Release lock and restart.
65536+ */
65537+ h->result = -E_REPEAT;
65538+ }
65539+ if (h->result == -E_REPEAT)
65540+ return LOOKUP_REST;
65541+
65542+ h->result = zload_ra(active, h->ra_info);
65543+ if (h->result) {
65544+ return LOOKUP_DONE;
65545+ }
65546+
65547+ /* sanity checks */
65548+ if (sanity_check(h)) {
65549+ zrelse(active);
65550+ return LOOKUP_DONE;
65551+ }
65552+
65553+ /* check that key of leftmost item in the @active is the same as in
65554+ * its parent */
65555+ if (ldkeyset && !node_is_empty(active) &&
65556+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
65557+ warning("vs-3533", "Keys are inconsistent. Fsck?");
65558+ reiser4_print_key("inparent", &ldkey);
65559+ reiser4_print_key("inchild", &key);
65560+ h->result = RETERR(-EIO);
65561+ zrelse(active);
65562+ return LOOKUP_DONE;
65563+ }
65564+
65565+ if (h->object != NULL)
65566+ handle_vroot(h->object, active);
65567+
65568+ ret = cbk_node_lookup(h);
65569+
65570+ /* h->active_lh->node might change, but active is yet to be zrelsed */
65571+ zrelse(active);
65572+
65573+ return ret;
65574+
65575+ fail_or_restart:
65576+ if (h->result == -E_DEADLOCK)
65577+ return LOOKUP_REST;
65578+ return LOOKUP_DONE;
65579+}
65580+
65581+#if REISER4_DEBUG
65582+/* check left and right delimiting keys of a znode */
65583+void check_dkeys(znode * node)
65584+{
65585+ znode *left;
65586+ znode *right;
65587+
65588+ read_lock_tree(current_tree);
65589+ read_lock_dk(current_tree);
65590+
65591+ assert("vs-1710", znode_is_any_locked(node));
65592+ assert("vs-1197",
65593+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
65594+
65595+ left = node->left;
65596+ right = node->right;
65597+
65598+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65599+ && left != NULL && ZF_ISSET(left, JNODE_DKSET))
 65600+		/* check left neighbor. Note that the left neighbor is not
 65601+		   locked, so its delimiting keys may be stale */
65602+ assert("vs-1198",
65603+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
65604+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
65605+
65606+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
65607+ && right != NULL && ZF_ISSET(right, JNODE_DKSET))
 65608+		/* check right neighbor. Note that the right neighbor is not
 65609+		   locked, so its delimiting keys may be stale */
65610+ assert("vs-1199",
65611+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
65612+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
65613+
65614+ read_unlock_dk(current_tree);
65615+ read_unlock_tree(current_tree);
65616+}
65617+#endif
65618+
65619+/* true if @key is left delimiting key of @node */
65620+static int key_is_ld(znode * node, const reiser4_key * key)
65621+{
65622+ int ld;
65623+
65624+ assert("nikita-1716", node != NULL);
65625+ assert("nikita-1758", key != NULL);
65626+
65627+ read_lock_dk(znode_get_tree(node));
65628+ assert("nikita-1759", znode_contains_key(node, key));
65629+ ld = keyeq(znode_get_ld_key(node), key);
65630+ read_unlock_dk(znode_get_tree(node));
65631+ return ld;
65632+}
65633+
65634+/* Process one node during tree traversal.
65635+
65636+ This is called by cbk_level_lookup(). */
65637+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
65638+{
65639+ /* node plugin of @active */
65640+ node_plugin *nplug;
65641+ /* item plugin of item that was found */
65642+ item_plugin *iplug;
65643+ /* search bias */
65644+ lookup_bias node_bias;
65645+ /* node we are operating upon */
65646+ znode *active;
65647+ /* tree we are searching in */
65648+ reiser4_tree *tree;
65649+ /* result */
65650+ int result;
65651+
65652+ assert("nikita-379", h != NULL);
65653+
65654+ active = h->active_lh->node;
65655+ tree = h->tree;
65656+
65657+ nplug = active->nplug;
65658+ assert("nikita-380", nplug != NULL);
65659+
65660+ ON_DEBUG(check_dkeys(active));
65661+
65662+ /* return item from "active" node with maximal key not greater than
65663+ "key" */
65664+ node_bias = h->bias;
65665+ result = nplug->lookup(active, h->key, node_bias, h->coord);
65666+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
65667+ /* error occurred */
65668+ h->result = result;
65669+ return LOOKUP_DONE;
65670+ }
65671+ if (h->level == h->stop_level) {
65672+ /* welcome to the stop level */
65673+ assert("nikita-381", h->coord->node == active);
65674+ if (result == NS_FOUND) {
65675+ /* success of tree lookup */
65676+ if (!(h->flags & CBK_UNIQUE)
65677+ && key_is_ld(active, h->key)) {
65678+ return search_to_left(h);
65679+ } else
65680+ h->result = CBK_COORD_FOUND;
65681+ } else {
65682+ h->result = CBK_COORD_NOTFOUND;
65683+ }
65684+ if (!(h->flags & CBK_IN_CACHE))
65685+ cbk_cache_add(active);
65686+ return LOOKUP_DONE;
65687+ }
65688+
65689+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
65690+ h->error = "not found on internal node";
65691+ h->result = result;
65692+ return LOOKUP_DONE;
65693+ }
65694+
65695+ assert("vs-361", h->level > h->stop_level);
65696+
65697+ if (handle_eottl(h, &result)) {
65698+ assert("vs-1674", (result == LOOKUP_DONE ||
65699+ result == LOOKUP_REST));
65700+ return result;
65701+ }
65702+
65703+ /* go down to next level */
65704+ check_me("vs-12", zload(h->coord->node) == 0);
65705+ assert("nikita-2116", item_is_internal(h->coord));
65706+ iplug = item_plugin_by_coord(h->coord);
65707+ iplug->s.internal.down_link(h->coord, h->key, &h->block);
65708+ zrelse(h->coord->node);
65709+ --h->level;
65710+ return LOOKUP_CONT; /* continue */
65711+}
65712+
65713+/* scan cbk_cache slots looking for a match for @h */
65714+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
65715+{
65716+ level_lookup_result llr;
65717+ znode *node;
65718+ reiser4_tree *tree;
65719+ cbk_cache_slot *slot;
65720+ cbk_cache *cache;
65721+ tree_level level;
65722+ int isunique;
65723+ const reiser4_key *key;
65724+ int result;
65725+
65726+ assert("nikita-1317", h != NULL);
65727+ assert("nikita-1315", h->tree != NULL);
65728+ assert("nikita-1316", h->key != NULL);
65729+
65730+ tree = h->tree;
65731+ cache = &tree->cbk_cache;
65732+ if (cache->nr_slots == 0)
65733+ /* size of cbk cache was set to 0 by mount time option. */
65734+ return RETERR(-ENOENT);
65735+
65736+ assert("nikita-2474", cbk_cache_invariant(cache));
65737+ node = NULL; /* to keep gcc happy */
65738+ level = h->level;
65739+ key = h->key;
65740+ isunique = h->flags & CBK_UNIQUE;
65741+ result = RETERR(-ENOENT);
65742+
65743+ /*
 65744+	 * this is a time-critical function and dragons have, hence, been
 65745+	 * settled here.
65746+ *
65747+ * Loop below scans cbk cache slots trying to find matching node with
65748+ * suitable range of delimiting keys and located at the h->level.
65749+ *
65750+ * Scan is done under cbk cache spin lock that protects slot->node
65751+ * pointers. If suitable node is found we want to pin it in
65752+ * memory. But slot->node can point to the node with x_count 0
65753+ * (unreferenced). Such node can be recycled at any moment, or can
65754+ * already be in the process of being recycled (within jput()).
65755+ *
65756+ * As we found node in the cbk cache, it means that jput() hasn't yet
65757+ * called cbk_cache_invalidate().
65758+ *
65759+ * We acquire reference to the node without holding tree lock, and
65760+ * later, check node's RIP bit. This avoids races with jput().
65761+ */
65762+
65763+ rcu_read_lock();
65764+ read_lock(&((cbk_cache *)cache)->guard);
65765+
65766+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
65767+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
65768+ BUG_ON(&slot->lru != &cache->lru);/*????*/
65769+ while (1) {
65770+
65771+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
65772+
65773+ if (&cache->lru != &slot->lru)
65774+ node = slot->node;
65775+ else
65776+ node = NULL;
65777+
65778+ if (unlikely(node == NULL))
65779+ break;
65780+
65781+ /*
65782+ * this is (hopefully) the only place in the code where we are
65783+ * working with delimiting keys without holding dk lock. This
65784+ * is fine here, because this is only "guess" anyway---keys
65785+ * are rechecked under dk lock below.
65786+ */
65787+ if (znode_get_level(node) == level &&
 65788+		    /* reiser4_min_key < key < reiser4_max_key */
65789+ znode_contains_key_strict(node, key, isunique)) {
65790+ zref(node);
65791+ result = 0;
65792+ spin_lock_prefetch(&tree->tree_lock);
65793+ break;
65794+ }
65795+ }
65796+ read_unlock(&((cbk_cache *)cache)->guard);
65797+
65798+ assert("nikita-2475", cbk_cache_invariant(cache));
65799+
65800+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
65801+ result = -ENOENT;
65802+
65803+ rcu_read_unlock();
65804+
65805+ if (result != 0) {
65806+ h->result = CBK_COORD_NOTFOUND;
65807+ return RETERR(-ENOENT);
65808+ }
65809+
65810+ result =
65811+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
65812+ ZNODE_LOCK_LOPRI);
65813+ zput(node);
65814+ if (result != 0)
65815+ return result;
65816+ result = zload(node);
65817+ if (result != 0)
65818+ return result;
65819+
65820+ /* recheck keys */
65821+ read_lock_dk(tree);
65822+ result = (znode_contains_key_strict(node, key, isunique) &&
65823+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
65824+ read_unlock_dk(tree);
65825+ if (result) {
65826+ /* do lookup inside node */
65827+ llr = cbk_node_lookup(h);
65828+ /* if cbk_node_lookup() wandered to another node (due to eottl
65829+ or non-unique keys), adjust @node */
65830+ /*node = h->active_lh->node; */
65831+
65832+ if (llr != LOOKUP_DONE) {
65833+ /* restart or continue on the next level */
65834+ result = RETERR(-ENOENT);
65835+ } else if (IS_CBKERR(h->result))
65836+ /* io or oom */
65837+ result = RETERR(-ENOENT);
65838+ else {
65839+ /* good. Either item found or definitely not found. */
65840+ result = 0;
65841+
65842+ write_lock(&(cache->guard));
65843+ if (slot->node == h->active_lh->node /*node */ ) {
65844+ /* if this node is still in cbk cache---move
65845+ its slot to the head of the LRU list. */
65846+ list_move(&slot->lru, &cache->lru);
65847+ }
65848+ write_unlock(&(cache->guard));
65849+ }
65850+ } else {
 65851+		/* race. While this thread was waiting for the lock, the node
 65852+		   was rebalanced and the item we are looking for was shifted
 65853+		   out of it (if it ever was here).
 65854+
 65855+		   Continuing scanning is almost hopeless: the node the key
 65856+		   range was moved to is almost certainly at the beginning of
 65857+		   the LRU list at this time, because it's hot, but restarting
 65858+		   scanning from the very beginning is complex. Just return,
65859+ so that cbk() will be performed. This is not that
65860+ important, because such races should be rare. Are they?
65861+ */
65862+ result = RETERR(-ENOENT); /* -ERAUGHT */
65863+ }
65864+ zrelse(node);
65865+ assert("nikita-2476", cbk_cache_invariant(cache));
65866+ return result;
65867+}
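
The locking choreography above reduces to a "guess under RCU, verify after referencing" pattern; a schematic condensation (not standalone code), using only the calls the function itself makes:

	rcu_read_lock();
	read_lock(&cache->guard);
	/* scan slots; on a candidate hit, pin the node before dropping
	 * the guard: */
	zref(node);
	read_unlock(&cache->guard);
	if (ZF_ISSET(node, JNODE_RIP))	/* node is being recycled: miss */
		result = RETERR(-ENOENT);
	rcu_read_unlock();
	/* only then take the long-term lock and re-check the delimiting
	 * keys under the dk lock, as done above */
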
65868+
65869+/* look for item with given key in the coord cache
65870+
65871+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
 65872+   which is a small LRU list of znodes accessed lately. For each znode in
 65873+   this list, it checks whether the key we are looking for fits into the key
65874+ range covered by this node. If so, and in addition, node lies at allowed
65875+ level (this is to handle extents on a twig level), node is locked, and
65876+ lookup inside it is performed.
65877+
65878+ we need a measurement of the cost of this cache search compared to the cost
65879+ of coord_by_key.
65880+
65881+*/
65882+static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
65883+{
65884+ int result = 0;
65885+ tree_level level;
65886+
65887+ /* add CBK_IN_CACHE to the handle flags. This means that
65888+ * cbk_node_lookup() assumes that cbk_cache is scanned and would add
65889+ * found node to the cache. */
65890+ h->flags |= CBK_IN_CACHE;
65891+ for (level = h->stop_level; level <= h->lock_level; ++level) {
65892+ h->level = level;
65893+ result = cbk_cache_scan_slots(h);
65894+ if (result != 0) {
65895+ done_lh(h->active_lh);
65896+ done_lh(h->parent_lh);
65897+ } else {
65898+ assert("nikita-1319", !IS_CBKERR(h->result));
65899+ break;
65900+ }
65901+ }
65902+ h->flags &= ~CBK_IN_CACHE;
65903+ return result;
65904+}
65905+
 65906+/* type of lock we want to obtain during tree traversal. On the stop level
 65907+   we want the type of lock the user asked for; on upper levels, a read lock. */
65908+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
65909+{
65910+ assert("nikita-382", h != NULL);
65911+
65912+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
65913+}
65914+
65915+/* update outdated delimiting keys */
65916+static void stale_dk(reiser4_tree * tree, znode * node)
65917+{
65918+ znode *right;
65919+
65920+ read_lock_tree(tree);
65921+ write_lock_dk(tree);
65922+ right = node->right;
65923+
65924+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
65925+ right && ZF_ISSET(right, JNODE_DKSET) &&
65926+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
65927+ znode_set_rd_key(node, znode_get_ld_key(right));
65928+
65929+ write_unlock_dk(tree);
65930+ read_unlock_tree(tree);
65931+}
65932+
65933+/* check for possibly outdated delimiting keys, and update them if
65934+ * necessary. */
65935+static void update_stale_dk(reiser4_tree * tree, znode * node)
65936+{
65937+ znode *right;
65938+ reiser4_key rd;
65939+
65940+ read_lock_tree(tree);
65941+ read_lock_dk(tree);
65942+ rd = *znode_get_rd_key(node);
65943+ right = node->right;
65944+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
65945+ right && ZF_ISSET(right, JNODE_DKSET) &&
65946+ !keyeq(&rd, znode_get_ld_key(right)))) {
65947+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
65948+ read_unlock_dk(tree);
65949+ read_unlock_tree(tree);
65950+ stale_dk(tree, node);
65951+ return;
65952+ }
65953+ read_unlock_dk(tree);
65954+ read_unlock_tree(tree);
65955+}
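
update_stale_dk() and stale_dk() form an optimistic read-then-escalate pair; condensed from the two functions above (schematic only):

	read_lock_dk(tree);
	stale = right != NULL && ZF_ISSET(right, JNODE_DKSET) &&
		!keyeq(znode_get_rd_key(node), znode_get_ld_key(right));
	read_unlock_dk(tree);
	if (stale)
		stale_dk(tree, node);	/* retakes the dk lock for writing */
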
65956+
65957+/*
 65958+ * handling searches for a non-unique key.
65959+ *
65960+ * Suppose that we are looking for an item with possibly non-unique key 100.
65961+ *
65962+ * Root node contains two pointers: one to a node with left delimiting key 0,
 65963+ * and another to a node with left delimiting key 100. The item we are
 65964+ * interested in may well be in the sub-tree rooted at the first pointer.
65965+ *
65966+ * To handle this search_to_left() is called when search reaches stop
 65967+ * level. This function checks whether it is _possible_ that the item we
 65968+ * are looking for is in the left neighbor (this can be done by comparing
 65969+ * delimiting keys) and, if so, tries to lock the left neighbor (a low
 65970+ * priority lock, so it can deadlock; tree traversal is just restarted if
 65971+ * it does) and then checks whether the left neighbor actually contains items with our key.
65972+ *
 65973+ * Note that this is done on the stop level only. It is possible to try such
 65974+ * a left-check on each level, but as duplicate keys are supposed to be rare
 65975+ * (it is very unlikely that more than one node is completely filled with
 65976+ * items with duplicate keys), it is cheaper to scan to the left on the stop level once.
65977+ *
65978+ */
65979+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
65980+{
65981+ level_lookup_result result;
65982+ coord_t *coord;
65983+ znode *node;
65984+ znode *neighbor;
65985+
65986+ lock_handle lh;
65987+
65988+ assert("nikita-1761", h != NULL);
65989+ assert("nikita-1762", h->level == h->stop_level);
65990+
65991+ init_lh(&lh);
65992+ coord = h->coord;
65993+ node = h->active_lh->node;
65994+ assert("nikita-1763", coord_is_leftmost_unit(coord));
65995+
65996+ h->result =
65997+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
65998+ GN_CAN_USE_UPPER_LEVELS);
65999+ neighbor = NULL;
66000+ switch (h->result) {
66001+ case -E_DEADLOCK:
66002+ result = LOOKUP_REST;
66003+ break;
66004+ case 0:{
66005+ node_plugin *nplug;
66006+ coord_t crd;
66007+ lookup_bias bias;
66008+
66009+ neighbor = lh.node;
66010+ h->result = zload(neighbor);
66011+ if (h->result != 0) {
66012+ result = LOOKUP_DONE;
66013+ break;
66014+ }
66015+
66016+ nplug = neighbor->nplug;
66017+
66018+ coord_init_zero(&crd);
66019+ bias = h->bias;
66020+ h->bias = FIND_EXACT;
66021+ h->result =
66022+ nplug->lookup(neighbor, h->key, h->bias, &crd);
66023+ h->bias = bias;
66024+
66025+ if (h->result == NS_NOT_FOUND) {
66026+ case -E_NO_NEIGHBOR:
66027+ h->result = CBK_COORD_FOUND;
66028+ if (!(h->flags & CBK_IN_CACHE))
66029+ cbk_cache_add(node);
66030+ default: /* some other error */
66031+ result = LOOKUP_DONE;
66032+ } else if (h->result == NS_FOUND) {
66033+ read_lock_dk(znode_get_tree(neighbor));
66034+ h->rd_key = *znode_get_ld_key(node);
66035+ leftmost_key_in_node(neighbor, &h->ld_key);
66036+ read_unlock_dk(znode_get_tree(neighbor));
66037+ h->flags |= CBK_DKSET;
66038+
66039+ h->block = *znode_get_block(neighbor);
66040+ /* clear coord -> node so that cbk_level_lookup()
66041+ wouldn't overwrite parent hint in neighbor.
66042+
66043+ Parent hint was set up by
66044+ reiser4_get_left_neighbor()
66045+ */
66046+ /* FIXME: why do we have to spinlock here? */
66047+ write_lock_tree(znode_get_tree(neighbor));
66048+ h->coord->node = NULL;
66049+ write_unlock_tree(znode_get_tree(neighbor));
66050+ result = LOOKUP_CONT;
66051+ } else {
66052+ result = LOOKUP_DONE;
66053+ }
66054+ if (neighbor != NULL)
66055+ zrelse(neighbor);
66056+ }
66057+ }
66058+ done_lh(&lh);
66059+ return result;
66060+}
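
To make the scenario concrete (keys invented for this example): suppose the root holds two pointers with left delimiting keys 0 and 100, and key 100 is non-unique:

	root:    [ 0 ]              [ 100 ]
	           |                   |
	    N1: keys in [0,100]   N2: keys in [100,...)

Items with key 100 may straddle the boundary: some sit at the right edge of N1, the rest at the left edge of N2. A plain descent through the pointer with delimiting key 100 always lands in N2, so only the left-neighbor probe performed by search_to_left() can reach the copies left in N1.
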
66061+
66062+/* debugging aid: return symbolic name of search bias */
66063+static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66064+{
66065+ if (bias == FIND_EXACT)
66066+ return "exact";
66067+ else if (bias == FIND_MAX_NOT_MORE_THAN)
66068+ return "left-slant";
66069+/* else if( bias == RIGHT_SLANT_BIAS ) */
66070+/* return "right-bias"; */
66071+ else {
66072+ static char buf[30];
66073+
66074+ sprintf(buf, "unknown: %i", bias);
66075+ return buf;
66076+ }
66077+}
66078+
66079+#if REISER4_DEBUG
66080+/* debugging aid: print human readable information about @p */
66081+void print_coord_content(const char *prefix /* prefix to print */ ,
66082+ coord_t * p /* coord to print */ )
66083+{
66084+ reiser4_key key;
66085+
66086+ if (p == NULL) {
66087+ printk("%s: null\n", prefix);
66088+ return;
66089+ }
66090+ if ((p->node != NULL) && znode_is_loaded(p->node)
66091+ && coord_is_existing_item(p))
66092+ printk("%s: data: %p, length: %i\n", prefix,
66093+ item_body_by_coord(p), item_length_by_coord(p));
66094+ if (znode_is_loaded(p->node)) {
66095+ item_key_by_coord(p, &key);
 66096+		reiser4_print_key(prefix, &key);
66097+ }
66098+}
66099+
66100+/* debugging aid: print human readable information about @block */
66101+void reiser4_print_address(const char *prefix /* prefix to print */ ,
66102+ const reiser4_block_nr * block /* block number to print */ )
66103+{
66104+ printk("%s: %s\n", prefix, sprint_address(block));
66105+}
66106+#endif
66107+
66108+/* return string containing human readable representation of @block */
66109+char *sprint_address(const reiser4_block_nr *
66110+ block /* block number to print */ )
66111+{
66112+ static char address[30];
66113+
66114+ if (block == NULL)
66115+ sprintf(address, "null");
 66116+	else if (reiser4_blocknr_is_fake(block))
66117+ sprintf(address, "%llx", (unsigned long long)(*block));
66118+ else
66119+ sprintf(address, "%llu", (unsigned long long)(*block));
66120+ return address;
66121+}
66122+
66123+/* release parent node during traversal */
66124+static void put_parent(cbk_handle * h /* search handle */ )
66125+{
66126+ assert("nikita-383", h != NULL);
66127+ if (h->parent_lh->node != NULL) {
66128+ longterm_unlock_znode(h->parent_lh);
66129+ }
66130+}
66131+
66132+/* helper function used by coord_by_key(): release reference to parent znode
66133+ stored in handle before processing its child. */
66134+static void hput(cbk_handle * h /* search handle */ )
66135+{
66136+ assert("nikita-385", h != NULL);
66137+ done_lh(h->parent_lh);
66138+ done_lh(h->active_lh);
66139+}
66140+
66141+/* Helper function used by cbk(): update delimiting keys of child node (stored
66142+ in h->active_lh->node) using key taken from parent on the parent level. */
66143+static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66144+{
66145+ znode *active;
66146+ reiser4_tree *tree;
66147+
66148+ assert("nikita-1088", h != NULL);
66149+
66150+ active = h->active_lh->node;
66151+
66152+ /* fast check without taking dk lock. This is safe, because
66153+ * JNODE_DKSET is never cleared once set. */
66154+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66155+ tree = znode_get_tree(active);
66156+ write_lock_dk(tree);
66157+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66158+ znode_set_ld_key(active, &h->ld_key);
66159+ znode_set_rd_key(active, &h->rd_key);
66160+ ZF_SET(active, JNODE_DKSET);
66161+ }
66162+ write_unlock_dk(tree);
66163+ return 1;
66164+ }
66165+ return 0;
66166+}
66167+
66168+/* true if @block makes sense for the @tree. Used to detect corrupted node
66169+ * pointers */
66170+static int
66171+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66172+ reiser4_tree * tree /* tree to check against */ )
66173+{
66174+ assert("nikita-757", block != NULL);
66175+ assert("nikita-758", tree != NULL);
66176+
66177+ /* check to see if it exceeds the size of the device. */
66178+ return reiser4_blocknr_is_sane_for(tree->super, block);
66179+}
66180+
66181+/* check consistency of fields */
66182+static int sanity_check(cbk_handle * h /* search handle */ )
66183+{
66184+ assert("nikita-384", h != NULL);
66185+
66186+ if (h->level < h->stop_level) {
66187+ h->error = "Buried under leaves";
66188+ h->result = RETERR(-EIO);
66189+ return LOOKUP_DONE;
66190+ } else if (!block_nr_is_correct(&h->block, h->tree)) {
66191+ h->error = "bad block number";
66192+ h->result = RETERR(-EIO);
66193+ return LOOKUP_DONE;
66194+ } else
66195+ return 0;
66196+}
66197+
66198+/* Make Linus happy.
66199+ Local variables:
66200+ c-indentation-style: "K&R"
66201+ mode-name: "LC"
66202+ c-basic-offset: 8
66203+ tab-width: 8
66204+ fill-column: 120
66205+ scroll-step: 1
66206+ End:
66207+*/
66208diff -urN linux-2.6.22.orig/fs/reiser4/status_flags.c linux-2.6.22/fs/reiser4/status_flags.c
66209--- linux-2.6.22.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
66210+++ linux-2.6.22/fs/reiser4/status_flags.c 2007-07-29 00:25:35.016732714 +0400
66211@@ -0,0 +1,175 @@
66212+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66213+ * reiser4/README */
66214+
66215+/* Functions that deal with reiser4 status block, query status and update it, if needed */
66216+
66217+#include <linux/bio.h>
66218+#include <linux/highmem.h>
66219+#include <linux/fs.h>
66220+#include <linux/blkdev.h>
66221+#include "debug.h"
66222+#include "dformat.h"
66223+#include "status_flags.h"
66224+#include "super.h"
66225+
 66226+/* This is our end I/O handler that marks the page uptodate if the I/O was
 66227+   successful. It also unconditionally unlocks the page, so we can see that
 66228+   the I/O is done. We do not free the bio, because we hope to reuse it. */
66229+static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66230+ int err)
66231+{
66232+ if (bio->bi_size)
66233+ return 1;
66234+
66235+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66236+ SetPageUptodate(bio->bi_io_vec->bv_page);
66237+ } else {
66238+ ClearPageUptodate(bio->bi_io_vec->bv_page);
66239+ SetPageError(bio->bi_io_vec->bv_page);
66240+ }
66241+ unlock_page(bio->bi_io_vec->bv_page);
66242+ return 0;
66243+}
66244+
 66245+/* Initialise the status block code. This is expected to be called from the
 66246+   disk format code. The block parameter is where the status block lives. */
66247+int reiser4_status_init(reiser4_block_nr block)
66248+{
66249+ struct super_block *sb = reiser4_get_current_sb();
66250+ struct reiser4_status *statuspage;
66251+ struct bio *bio;
66252+ struct page *page;
66253+
66254+ get_super_private(sb)->status_page = NULL;
66255+ get_super_private(sb)->status_bio = NULL;
66256+
 66257+	page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
66258+ if (!page)
66259+ return -ENOMEM;
66260+
 66261+	bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
66262+ if (bio != NULL) {
66263+ bio->bi_sector = block * (sb->s_blocksize >> 9);
66264+ bio->bi_bdev = sb->s_bdev;
66265+ bio->bi_io_vec[0].bv_page = page;
66266+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66267+ bio->bi_io_vec[0].bv_offset = 0;
66268+ bio->bi_vcnt = 1;
66269+ bio->bi_size = sb->s_blocksize;
66270+ bio->bi_end_io = reiser4_status_endio;
66271+ } else {
66272+ __free_pages(page, 0);
66273+ return -ENOMEM;
66274+ }
66275+ lock_page(page);
66276+ submit_bio(READ, bio);
 66277+	blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66278+ wait_on_page_locked(page);
66279+ if (!PageUptodate(page)) {
66280+ warning("green-2007",
66281+ "I/O error while tried to read status page\n");
66282+ return -EIO;
66283+ }
66284+
66285+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
66286+ if (memcmp
66287+ (statuspage->magic, REISER4_STATUS_MAGIC,
66288+ sizeof(REISER4_STATUS_MAGIC))) {
66289+ /* Magic does not match. */
66290+ kunmap_atomic((char *)statuspage, KM_USER0);
66291+ warning("green-2008", "Wrong magic in status block\n");
66292+ __free_pages(page, 0);
66293+ bio_put(bio);
66294+ return -EINVAL;
66295+ }
66296+ kunmap_atomic((char *)statuspage, KM_USER0);
66297+
66298+ get_super_private(sb)->status_page = page;
66299+ get_super_private(sb)->status_bio = bio;
66300+ return 0;
66301+}
66302+
 66303+/* Query the status of the fs. Returns whether the FS can be safely mounted.
 66304+   Also, if the "status" and "extended" parameters are given, it will fill
 66305+   them with the actual status fields read from disk. */
66306+int reiser4_status_query(u64 * status, u64 * extended)
66307+{
66308+ struct super_block *sb = reiser4_get_current_sb();
66309+ struct reiser4_status *statuspage;
66310+ int retval;
66311+
66312+ if (!get_super_private(sb)->status_page) { // No status page?
66313+ return REISER4_STATUS_MOUNT_UNKNOWN;
66314+ }
66315+ statuspage = (struct reiser4_status *)
66316+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66317+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
66318+ case REISER4_STATUS_OK:
66319+ retval = REISER4_STATUS_MOUNT_OK;
66320+ break;
66321+ case REISER4_STATUS_CORRUPTED:
66322+ retval = REISER4_STATUS_MOUNT_WARN;
66323+ break;
66324+ case REISER4_STATUS_DAMAGED:
66325+ case REISER4_STATUS_DESTROYED:
66326+ case REISER4_STATUS_IOERROR:
66327+ retval = REISER4_STATUS_MOUNT_RO;
66328+ break;
66329+ default:
66330+ retval = REISER4_STATUS_MOUNT_UNKNOWN;
66331+ break;
66332+ }
66333+
66334+ if (status)
66335+ *status = le64_to_cpu(get_unaligned(&statuspage->status));
66336+ if (extended)
66337+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
66338+
66339+ kunmap_atomic((char *)statuspage, KM_USER0);
66340+ return retval;
66341+}
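
A hedged sketch of a mount-time caller; the REISER4_STATUS_MOUNT_* values are the ones defined in status_flags.h below, while the surrounding logic is an assumption:

	u64 status, extended;

	switch (reiser4_status_query(&status, &extended)) {
	case REISER4_STATUS_MOUNT_OK:
		break;			/* mount normally */
	case REISER4_STATUS_MOUNT_WARN:
		/* fs marked corrupted: warn and suggest fsck.reiser4 */
		break;
	case REISER4_STATUS_MOUNT_RO:
		/* damaged/destroyed/io-error states: force a read-only mount */
		break;
	default:
		/* REISER4_STATUS_MOUNT_UNKNOWN: no readable status page */
		break;
	}
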
66342+
66343+/* This function should be called when something bad happens (e.g. from reiser4_panic).
66344+ It fills the status structure and tries to push it to disk. */
66345+int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
66346+{
66347+ struct super_block *sb = reiser4_get_current_sb();
66348+ struct reiser4_status *statuspage;
66349+ struct bio *bio = get_super_private(sb)->status_bio;
66350+
66351+ if (!get_super_private(sb)->status_page) { // No status page?
66352+ return -1;
66353+ }
66354+ statuspage = (struct reiser4_status *)
66355+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66356+
66357+ put_unaligned(cpu_to_le64(status), &statuspage->status);
66358+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
66359+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
66360+
66361+ kunmap_atomic((char *)statuspage, KM_USER0);
66362+ bio->bi_bdev = sb->s_bdev;
66363+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
66364+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66365+ bio->bi_io_vec[0].bv_offset = 0;
66366+ bio->bi_vcnt = 1;
66367+ bio->bi_size = sb->s_blocksize;
66368+ bio->bi_end_io = reiser4_status_endio;
66369+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
66370+ /* We can block now, but we have no other choice anyway */
66371+ submit_bio(WRITE, bio);
 66372+	blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
66373+ return 0; // We do not wait for io to finish.
66374+}
66375+
 66376+/* Frees the status page and the bio structure. Should be called by the disk format code at umount time */
66377+int reiser4_status_finish(void)
66378+{
66379+ struct super_block *sb = reiser4_get_current_sb();
66380+
66381+ __free_pages(get_super_private(sb)->status_page, 0);
66382+ get_super_private(sb)->status_page = NULL;
66383+ bio_put(get_super_private(sb)->status_bio);
66384+ get_super_private(sb)->status_bio = NULL;
66385+ return 0;
66386+}
66387diff -urN linux-2.6.22.orig/fs/reiser4/status_flags.h linux-2.6.22/fs/reiser4/status_flags.h
66388--- linux-2.6.22.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
66389+++ linux-2.6.22/fs/reiser4/status_flags.h 2007-07-29 00:25:35.016732714 +0400
66390@@ -0,0 +1,43 @@
66391+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66392+ * reiser4/README */
66393+
66394+/* Here we declare structures and flags that store reiser4 status on disk.
 66395+   The status helps us to find out whether the filesystem is valid or whether
 66396+   it contains some critical, or not so critical, errors */
66397+
66398+#if !defined( __REISER4_STATUS_FLAGS_H__ )
66399+#define __REISER4_STATUS_FLAGS_H__
66400+
66401+#include "dformat.h"
66402+/* These are major status flags */
66403+#define REISER4_STATUS_OK 0
66404+#define REISER4_STATUS_CORRUPTED 0x1
66405+#define REISER4_STATUS_DAMAGED 0x2
66406+#define REISER4_STATUS_DESTROYED 0x4
66407+#define REISER4_STATUS_IOERROR 0x8
66408+
66409+/* Return values for reiser4_status_query() */
66410+#define REISER4_STATUS_MOUNT_OK 0
66411+#define REISER4_STATUS_MOUNT_WARN 1
66412+#define REISER4_STATUS_MOUNT_RO 2
66413+#define REISER4_STATUS_MOUNT_UNKNOWN -1
66414+
66415+#define REISER4_TEXTERROR_LEN 256
66416+
66417+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
66418+/* We probably need to keep its size under sector size which is 512 bytes */
66419+struct reiser4_status {
66420+ char magic[16];
66421+ d64 status; /* Current FS state */
66422+ d64 extended_status; /* Any additional info that might have sense in addition to "status". E.g.
66423+ last sector where io error happened if status is "io error encountered" */
 66424+	d64 stacktrace[10]; /* Last ten function calls made (addresses) */
66425+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
66426+};
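
Checking the arithmetic behind the sector-size comment, assuming d64 is an 8-byte on-disk type as the name suggests: 16 (magic) + 8 (status) + 8 (extended_status) + 80 (stacktrace) + 256 (texterror) = 368 bytes, comfortably under 512. A compile-time guard, hypothetical and not part of this patch, could pin that down:

	BUILD_BUG_ON(sizeof(struct reiser4_status) > 512);
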
66427+
66428+int reiser4_status_init(reiser4_block_nr block);
66429+int reiser4_status_query(u64 * status, u64 * extended);
66430+int reiser4_status_write(u64 status, u64 extended_status, char *message);
66431+int reiser4_status_finish(void);
66432+
66433+#endif
66434diff -urN linux-2.6.22.orig/fs/reiser4/super.c linux-2.6.22/fs/reiser4/super.c
66435--- linux-2.6.22.orig/fs/reiser4/super.c 1970-01-01 03:00:00.000000000 +0300
66436+++ linux-2.6.22/fs/reiser4/super.c 2007-07-29 00:25:35.020733749 +0400
66437@@ -0,0 +1,316 @@
66438+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66439+ * reiser4/README */
66440+
66441+/* Super-block manipulations. */
66442+
66443+#include "debug.h"
66444+#include "dformat.h"
66445+#include "key.h"
66446+#include "plugin/security/perm.h"
66447+#include "plugin/space/space_allocator.h"
66448+#include "plugin/plugin.h"
66449+#include "tree.h"
66450+#include "vfs_ops.h"
66451+#include "super.h"
66452+#include "reiser4.h"
66453+
66454+#include <linux/types.h> /* for __u?? */
66455+#include <linux/fs.h> /* for struct super_block */
66456+
66457+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
66458+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
66459+static __u64 reserved_for_root(const struct super_block *super);
66460+
66461+/* Return reiser4-specific part of super block */
66462+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
66463+ * queried */ )
66464+{
66465+ return (reiser4_super_info_data *) super->s_fs_info;
66466+}
66467+
66468+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
 66469+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
66470+{
66471+ assert("nikita-448", super != NULL);
66472+ assert("nikita-449", is_reiser4_super(super));
66473+ return (long)REISER4_SUPER_MAGIC;
66474+}
66475+
66476+/* functions to read/modify fields of reiser4_super_info_data */
66477+
66478+/* get number of blocks in file system */
66479+__u64 reiser4_block_count(const struct super_block *super /* super block
66480+ queried */ )
66481+{
66482+ assert("vs-494", super != NULL);
66483+ assert("vs-495", is_reiser4_super(super));
66484+ return get_super_private(super)->block_count;
66485+}
66486+
 66487+#if REISER4_DEBUG
66488+/*
66489+ * number of blocks in the current file system
66490+ */
66491+__u64 reiser4_current_block_count(void)
66492+{
66493+ return get_current_super_private()->block_count;
66494+}
 66495+#endif	/* REISER4_DEBUG */
66496+
 66497+/* set number of blocks in filesystem */
66498+void reiser4_set_block_count(const struct super_block *super, __u64 nr)
66499+{
66500+ assert("vs-501", super != NULL);
66501+ assert("vs-502", is_reiser4_super(super));
66502+ get_super_private(super)->block_count = nr;
66503+ /*
 66504+	 * For the proper calculation of the reserved space counter (5% of the
 66505+	 * device block counter) we need a 64-bit division, which is missing
 66506+	 * in Linux on the i386 platform. Because we do not need a precise
 66507+	 * calculation here, we can replace the div64 operation by this
 66508+	 * combination of multiplication and shift: 51. / (2^10) == .0498 .
66509+ * FIXME: this is a bug. It comes up only for very small filesystems
66510+ * which probably are never used. Nevertheless, it is a bug. Number of
66511+ * reserved blocks must be not less than maximal number of blocks which
66512+ * get grabbed with BA_RESERVED.
66513+ */
66514+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
66515+}
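
The approximation is easy to verify: 51 / 1024 = 0.0498..., slightly under the intended 5%. For example, on a hypothetical 1,000,000-block device:

	(1000000 * 51) >> 10  ==  49804 blocks reserved
	1000000 * 5 / 100     ==  50000 blocks (exact 5%)
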
66516+
66517+/* amount of blocks used (allocated for data) in file system */
66518+__u64 reiser4_data_blocks(const struct super_block *super /* super block
66519+ queried */ )
66520+{
66521+ assert("nikita-452", super != NULL);
66522+ assert("nikita-453", is_reiser4_super(super));
66523+ return get_super_private(super)->blocks_used;
66524+}
66525+
 66526+/* set number of blocks used in filesystem */
66527+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
66528+{
66529+ assert("vs-503", super != NULL);
66530+ assert("vs-504", is_reiser4_super(super));
66531+ get_super_private(super)->blocks_used = nr;
66532+}
66533+
66534+/* amount of free blocks in file system */
66535+__u64 reiser4_free_blocks(const struct super_block *super /* super block
66536+ queried */ )
66537+{
66538+ assert("nikita-454", super != NULL);
66539+ assert("nikita-455", is_reiser4_super(super));
66540+ return get_super_private(super)->blocks_free;
66541+}
66542+
66543+/* set number of blocks free in filesystem */
66544+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
66545+{
66546+ assert("vs-505", super != NULL);
66547+ assert("vs-506", is_reiser4_super(super));
66548+ get_super_private(super)->blocks_free = nr;
66549+}
66550+
66551+/* get mkfs unique identifier */
66552+__u32 reiser4_mkfs_id(const struct super_block *super /* super block
66553+ queried */ )
66554+{
66555+ assert("vpf-221", super != NULL);
66556+ assert("vpf-222", is_reiser4_super(super));
66557+ return get_super_private(super)->mkfs_id;
66558+}
66559+
 66560+/* amount of free committed blocks in file system */
66561+__u64 reiser4_free_committed_blocks(const struct super_block *super)
66562+{
66563+ assert("vs-497", super != NULL);
66564+ assert("vs-498", is_reiser4_super(super));
66565+ return get_super_private(super)->blocks_free_committed;
66566+}
66567+
66568+/* amount of blocks in the file system reserved for @uid and @gid */
66569+long reiser4_reserved_blocks(const struct super_block *super /* super block
66570+ queried */ ,
66571+ uid_t uid /* user id */ ,
66572+ gid_t gid /* group id */ )
66573+{
66574+ long reserved;
66575+
66576+ assert("nikita-456", super != NULL);
66577+ assert("nikita-457", is_reiser4_super(super));
66578+
66579+ reserved = 0;
66580+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
66581+ reserved += reserved_for_gid(super, gid);
66582+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
66583+ reserved += reserved_for_uid(super, uid);
66584+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
66585+ reserved += reserved_for_root(super);
66586+ return reserved;
66587+}
66588+
66589+/* get/set value of/to grabbed blocks counter */
66590+__u64 reiser4_grabbed_blocks(const struct super_block * super)
66591+{
66592+ assert("zam-512", super != NULL);
66593+ assert("zam-513", is_reiser4_super(super));
66594+
66595+ return get_super_private(super)->blocks_grabbed;
66596+}
66597+
 66598+__u64 reiser4_flush_reserved(const struct super_block * super)
66599+{
66600+ assert("vpf-285", super != NULL);
66601+ assert("vpf-286", is_reiser4_super(super));
66602+
66603+ return get_super_private(super)->blocks_flush_reserved;
66604+}
66605+
66606+/* get/set value of/to counter of fake allocated formatted blocks */
66607+__u64 reiser4_fake_allocated(const struct super_block * super)
66608+{
66609+ assert("zam-516", super != NULL);
66610+ assert("zam-517", is_reiser4_super(super));
66611+
66612+ return get_super_private(super)->blocks_fake_allocated;
66613+}
66614+
66615+/* get/set value of/to counter of fake allocated unformatted blocks */
66616+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
66617+{
66618+ assert("zam-516", super != NULL);
66619+ assert("zam-517", is_reiser4_super(super));
66620+
66621+ return get_super_private(super)->blocks_fake_allocated_unformatted;
66622+}
66623+
66624+/* get/set value of/to counter of clustered blocks */
66625+__u64 reiser4_clustered_blocks(const struct super_block * super)
66626+{
66627+ assert("edward-601", super != NULL);
66628+ assert("edward-602", is_reiser4_super(super));
66629+
66630+ return get_super_private(super)->blocks_clustered;
66631+}
66632+
66633+/* space allocator used by this file system */
66634+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
66635+ *super)
66636+{
66637+ assert("nikita-1965", super != NULL);
66638+ assert("nikita-1966", is_reiser4_super(super));
66639+ return &get_super_private(super)->space_allocator;
66640+}
66641+
66642+/* return fake inode used to bind formatted nodes in the page cache */
66643+struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
66644+ queried */ )
66645+{
66646+ assert("nikita-1757", super != NULL);
66647+ return get_super_private(super)->fake;
66648+}
66649+
66650+/* return fake inode used to bind copied on capture nodes in the page cache */
66651+struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
66652+ queried */ )
66653+{
66654+ assert("nikita-1757", super != NULL);
66655+ return get_super_private(super)->cc;
66656+}
66657+
66658+/* return fake inode used to bind bitmaps and journal heads */
66659+struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
66660+{
66661+ assert("nikita-17571", super != NULL);
66662+ return get_super_private(super)->bitmap;
66663+}
66664+
66665+/* tree used by this file system */
66666+reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
66667+ * queried */ )
66668+{
66669+ assert("nikita-460", super != NULL);
66670+ assert("nikita-461", is_reiser4_super(super));
66671+ return &get_super_private(super)->tree;
66672+}
66673+
66674+/* Check that @super is (looks like) reiser4 super block. This is mainly for
66675+ use in assertions. */
66676+int is_reiser4_super(const struct super_block *super /* super block
66677+ * queried */ )
66678+{
66679+ return
66680+ super != NULL &&
66681+ get_super_private(super) != NULL &&
66682+ super->s_op == &(get_super_private(super)->ops.super);
66683+}
66684+
66685+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
66686+{
66687+ return test_bit((int)f, &get_super_private(super)->fs_flags);
66688+}
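
/*
 * Illustrative sketch, not part of the original patch: how the flag test
 * above is typically consumed. REISER4_NO_WRITE_BARRIER is one of the
 * reiser4_fs_flag values declared in super.h further down; the helper
 * name is hypothetical.
 */
static inline int example_barriers_enabled(const struct super_block *super)
{
	/* write barriers are used unless the mount disabled them */
	return !reiser4_is_set(super, REISER4_NO_WRITE_BARRIER);
}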
66689+
66690+/* amount of blocks reserved for given group in file system */
66691+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
66692+ * block
66693+ * queried */ ,
66694+ gid_t gid UNUSED_ARG /* group id */ )
66695+{
66696+ return 0;
66697+}
66698+
66699+/* amount of blocks reserved for given user in file system */
66700+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
66701+ block
66702+ queried */ ,
66703+ uid_t uid UNUSED_ARG /* user id */ )
66704+{
66705+ return 0;
66706+}
66707+
66708+/* amount of blocks reserved for super user in file system */
66709+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
66710+ block
66711+ queried */ )
66712+{
66713+ return 0;
66714+}
66715+
66716+/*
66717+ * true if block number @blk makes sense for the file system at @super.
66718+ */
66719+int
66720+reiser4_blocknr_is_sane_for(const struct super_block *super,
66721+ const reiser4_block_nr * blk)
66722+{
66723+ reiser4_super_info_data *sbinfo;
66724+
66725+ assert("nikita-2957", super != NULL);
66726+ assert("nikita-2958", blk != NULL);
66727+
66728+ if (reiser4_blocknr_is_fake(blk))
66729+ return 1;
66730+
66731+ sbinfo = get_super_private(super);
66732+ return *blk < sbinfo->block_count;
66733+}
66734+
66735+#if REISER4_DEBUG
66736+/*
66737+ * true, if block number @blk makes sense for the current file system
66738+ */
66739+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
66740+{
66741+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
66742+}
66743+#endif /* REISER4_DEBUG */
66744+
66745+/* Make Linus happy.
66746+ Local variables:
66747+ c-indentation-style: "K&R"
66748+ mode-name: "LC"
66749+ c-basic-offset: 8
66750+ tab-width: 8
66751+ fill-column: 120
66752+ End:
66753+*/
66754diff -urN linux-2.6.22.orig/fs/reiser4/super.h linux-2.6.22/fs/reiser4/super.h
66755--- linux-2.6.22.orig/fs/reiser4/super.h 1970-01-01 03:00:00.000000000 +0300
66756+++ linux-2.6.22/fs/reiser4/super.h 2007-07-29 00:25:35.020733749 +0400
66757@@ -0,0 +1,464 @@
66758+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
66759+ * reiser4/README */
66760+
66761+/* Super-block functions. See super.c for details. */
66762+
66763+#if !defined( __REISER4_SUPER_H__ )
66764+#define __REISER4_SUPER_H__
66765+
66766+#include "tree.h"
66767+#include "entd.h"
66768+#include "wander.h"
66769+#include "fsdata.h"
66770+#include "plugin/object.h"
66771+#include "plugin/space/space_allocator.h"
66772+
66773+/*
66774+ * Flush algorithms parameters.
66775+ */
66776+struct flush_params {
66777+ unsigned relocate_threshold;
66778+ unsigned relocate_distance;
66779+ unsigned written_threshold;
66780+ unsigned scan_maxnodes;
66781+};
66782+
66783+typedef enum {
66784+ /*
66785+ * True if this file system doesn't support hard-links (multiple names)
66786+ * for directories: this is default UNIX behavior.
66787+ *
66788+ * If hard-links on directories are not allowed, the file system is an
66789+ * Acyclic Directed Graph (modulo dot, and dotdot, of course).
66790+ *
66791+ * This is used by reiser4_link().
66792+ */
66793+ REISER4_ADG = 0,
66794+ /*
66795+ * set if all nodes in internal tree have the same node layout plugin.
66796+ * If so, znode_guess_plugin() will return tree->node_plugin instead
66797+ * of guessing the plugin by the plugin id stored in the node.
66798+ */
66799+ REISER4_ONE_NODE_PLUGIN = 1,
66800+ /* if set, bsd gid assignment is supported. */
66801+ REISER4_BSD_GID = 2,
66802+ /* [mac]_time are 32 bit in inode */
66803+ REISER4_32_BIT_TIMES = 3,
66804+ /* if set, don't load all bitmap blocks at mount time */
66805+ REISER4_DONT_LOAD_BITMAP = 5,
66806+ /* enforce atomicity during write(2) */
66807+ REISER4_ATOMIC_WRITE = 6,
66808+ /* don't use write barriers in the log writer code. */
66809+ REISER4_NO_WRITE_BARRIER = 7
66810+} reiser4_fs_flag;
66811+
66812+/*
66813+ * VFS related operation vectors.
66814+ */
66815+struct object_ops {
66816+ struct super_operations super;
66817+ struct dentry_operations dentry;
66818+ struct export_operations export;
66819+};
66820+
66821+/* reiser4-specific part of super block
66822+
66823+ Locking
66824+
66825+ Fields immutable after mount:
66826+
66827+ ->oid*
66828+ ->space*
66829+ ->default_[ug]id
66830+ ->mkfs_id
66831+ ->trace_flags
66832+ ->debug_flags
66833+ ->fs_flags
66834+ ->df_plug
66835+ ->optimal_io_size
66836+ ->plug
66837+ ->flush
66838+ ->u (bad name)
66839+ ->txnmgr
66840+ ->ra_params
66841+ ->fsuid
66842+ ->journal_header
66843+ ->journal_footer
66844+
66845+ Fields protected by ->lnode_guard
66846+
66847+ ->lnode_htable
66848+
66849+ Fields protected by per-super block spin lock
66850+
66851+ ->block_count
66852+ ->blocks_used
66853+ ->blocks_free
66854+ ->blocks_free_committed
66855+ ->blocks_grabbed
66856+ ->blocks_fake_allocated_unformatted
66857+ ->blocks_fake_allocated
66858+ ->blocks_flush_reserved
66859+ ->eflushed
66860+ ->blocknr_hint_default
66861+
66862+ After journal replaying during mount,
66863+
66864+ ->last_committed_tx
66865+
66866+ is protected by ->tmgr.commit_mutex
66867+
66868+ Invariants involving this data-type:
66869+
66870+ [sb-block-counts]
66871+ [sb-grabbed]
66872+ [sb-fake-allocated]
66873+*/
66874+struct reiser4_super_info_data {
66875+ /*
66876+ * guard spinlock which protects reiser4 super block fields (currently
66877+ * blocks_free, blocks_free_committed)
66878+ */
66879+ spinlock_t guard;
66880+
66881+ /* next oid that will be returned by oid_allocate() */
66882+ oid_t next_to_use;
66883+ /* total number of used oids */
66884+ oid_t oids_in_use;
66885+
66886+ /* space manager plugin */
66887+ reiser4_space_allocator space_allocator;
66888+
66889+ /* reiser4 internal tree */
66890+ reiser4_tree tree;
66891+
66892+ /*
66893+ * default user id used for light-weight files without their own
66894+ * stat-data.
66895+ */
66896+ uid_t default_uid;
66897+
66898+ /*
66899+ * default group id used for light-weight files without their own
66900+ * stat-data.
66901+ */
66902+ gid_t default_gid;
66903+
66904+ /* mkfs identifier generated at mkfs time. */
66905+ __u32 mkfs_id;
66906+ /* amount of blocks in a file system */
66907+ __u64 block_count;
66908+
66909+ /* inviolable reserve */
66910+ __u64 blocks_reserved;
66911+
66912+ /* amount of blocks used by file system data and meta-data. */
66913+ __u64 blocks_used;
66914+
66915+ /*
66916+ * amount of free blocks. This is "working" free blocks counter. It is
66917+ * like "working" bitmap, please see block_alloc.c for description.
66918+ */
66919+ __u64 blocks_free;
66920+
66921+ /*
66922+ * free block count for fs committed state. This is "commit" version of
66923+ * free block counter.
66924+ */
66925+ __u64 blocks_free_committed;
66926+
66927+ /*
66928+ * number of blocks reserved for further allocation, for all
66929+ * threads.
66930+ */
66931+ __u64 blocks_grabbed;
66932+
66933+ /* number of fake allocated unformatted blocks in tree. */
66934+ __u64 blocks_fake_allocated_unformatted;
66935+
66936+ /* number of fake allocated formatted blocks in tree. */
66937+ __u64 blocks_fake_allocated;
66938+
66939+ /* number of blocks reserved for flush operations. */
66940+ __u64 blocks_flush_reserved;
66941+
66942+ /* number of blocks reserved for cluster operations. */
66943+ __u64 blocks_clustered;
66944+
66945+ /* unique file-system identifier */
66946+ __u32 fsuid;
66947+
66948+ /* On-disk format version. If it does not equal the disk_format
66949+ plugin version, some format updates (e.g. enlarging the plugin
66950+ set, etc.) may take place on mount. */
66951+ int version;
66952+
66953+ /* file-system wide flags. See reiser4_fs_flag enum */
66954+ unsigned long fs_flags;
66955+
66956+ /* transaction manager */
66957+ txn_mgr tmgr;
66958+
66959+ /* ent thread */
66960+ entd_context entd;
66961+
66962+ /* fake inode used to bind formatted nodes */
66963+ struct inode *fake;
66964+ /* inode used to bind bitmaps (and journal heads) */
66965+ struct inode *bitmap;
66966+ /* inode used to bind copied on capture nodes */
66967+ struct inode *cc;
66968+
66969+ /* disk layout plugin */
66970+ disk_format_plugin *df_plug;
66971+
66972+ /* disk layout specific part of reiser4 super info data */
66973+ union {
66974+ format40_super_info format40;
66975+ } u;
66976+
66977+ /* value we return in st_blksize on stat(2) */
66978+ unsigned long optimal_io_size;
66979+
66980+ /* parameters for the flush algorithm */
66981+ struct flush_params flush;
66982+
66983+ /* pointers to jnodes for journal header and footer */
66984+ jnode *journal_header;
66985+ jnode *journal_footer;
66986+
66987+ journal_location jloc;
66988+
66989+ /* head block number of last committed transaction */
66990+ __u64 last_committed_tx;
66991+
66992+ /*
66993+ * we remember last written location for using as a hint for new block
66994+ * allocation
66995+ */
66996+ __u64 blocknr_hint_default;
66997+
66998+ /* committed number of files (oid allocator state variable ) */
66999+ __u64 nr_files_committed;
67000+
67001+ struct formatted_ra_params ra_params;
67002+
67003+ /*
67004+ * A mutex serializing the cut_tree operation when the file system is
67005+ * out of free space: only one cut_tree thread at a time is allowed to
67006+ * grab space from the reserved area (5% of disk space)
67007+ */
67008+ struct mutex delete_mutex;
67009+ /* task owning ->delete_mutex */
67010+ struct task_struct *delete_mutex_owner;
67011+
67012+ /* Diskmap's blocknumber */
67013+ __u64 diskmap_block;
67014+
67015+ /* What to do in case of error */
67016+ int onerror;
67017+
67018+ /* operations for objects on this file system */
71430cf6 67019+ struct object_ops ops;
67020+
67021+ /*
67022+ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67023+ * more details
67024+ */
71430cf6 67025+ struct d_cursor_info d_info;
67026+
67027+#ifdef CONFIG_REISER4_BADBLOCKS
67028+ /* Alternative master superblock offset (in bytes) */
67029+ unsigned long altsuper;
67030+#endif
67031+ struct repacker *repacker;
67032+ struct page *status_page;
67033+ struct bio *status_bio;
67034+
67035+#if REISER4_DEBUG
67036+ /*
67037+ * minimum used blocks value (includes super blocks, bitmap blocks and
67038+ * other fs reserved areas), depends on fs format and fs size.
67039+ */
67040+ __u64 min_blocks_used;
67041+
67042+ /*
67043+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67044+ * are kept on a list anchored at sbinfo->all_jnodes. This list is
67045+ * protected by sbinfo->all_guard spin lock. This lock should be taken
67046+ * with _irq modifier, because it is also modified from interrupt
67047+ * contexts (by RCU).
67048+ */
67049+ spinlock_t all_guard;
67050+ /* list of all jnodes */
67051+ struct list_head all_jnodes;
67052+#endif
67053+ struct dentry *debugfs_root;
67054+};
67055+
67056+extern reiser4_super_info_data *get_super_private_nocheck(const struct
67057+ super_block *super);
67058+
67059+/* Return reiser4-specific part of super block */
67060+static inline reiser4_super_info_data *get_super_private(const struct
67061+ super_block *super)
67062+{
67063+ assert("nikita-447", super != NULL);
67064+
67065+ return (reiser4_super_info_data *) super->s_fs_info;
67066+}
67067+
67068+/* get ent context for the @super */
67069+static inline entd_context *get_entd_context(struct super_block *super)
67070+{
67071+ return &get_super_private(super)->entd;
67072+}
67073+
67074+/* "Current" super-block: main super block used during current system
67075+ call. Reference to this super block is stored in reiser4_context. */
67076+static inline struct super_block *reiser4_get_current_sb(void)
67077+{
67078+ return get_current_context()->super;
67079+}
67080+
67081+/* Reiser4-specific part of "current" super-block: main super block used
67082+ during current system call. Reference to this super block is stored in
67083+ reiser4_context. */
67084+static inline reiser4_super_info_data *get_current_super_private(void)
67085+{
67086+ return get_super_private(reiser4_get_current_sb());
67087+}
67088+
67089+static inline struct formatted_ra_params *get_current_super_ra_params(void)
67090+{
67091+ return &(get_current_super_private()->ra_params);
67092+}
67093+
67094+/*
67095+ * true, if file system on @super is read-only
67096+ */
67097+static inline int rofs_super(struct super_block *super)
67098+{
67099+ return super->s_flags & MS_RDONLY;
67100+}
67101+
67102+/*
67103+ * true, if @tree represents read-only file system
67104+ */
67105+static inline int rofs_tree(reiser4_tree * tree)
67106+{
67107+ return rofs_super(tree->super);
67108+}
67109+
67110+/*
67111+ * true, if file system where @inode lives on, is read-only
67112+ */
67113+static inline int rofs_inode(struct inode *inode)
67114+{
67115+ return rofs_super(inode->i_sb);
67116+}
67117+
67118+/*
67119+ * true, if file system where @node lives on, is read-only
67120+ */
67121+static inline int rofs_jnode(jnode * node)
67122+{
67123+ return rofs_tree(jnode_get_tree(node));
67124+}
67125+
67126+extern __u64 reiser4_current_block_count(void);
67127+
67128+extern void build_object_ops(struct super_block *super, struct object_ops * ops);
67129+
67130+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67131+
67132+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67133+{
67134+ spin_lock(&(sbinfo->guard));
67135+}
67136+
67137+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67138+{
67139+ assert_spin_locked(&(sbinfo->guard));
67140+ spin_unlock(&(sbinfo->guard));
67141+}
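
/*
 * Minimal usage sketch, not from the original patch: the locking comment
 * above reiser4_super_info_data says counters such as ->blocks_free are
 * protected by the per-super spin lock, so an update would be bracketed
 * like this (hypothetical helper name):
 */
static inline void example_return_free_blocks(reiser4_super_info_data *sbinfo,
					      __u64 nr)
{
	spin_lock_reiser4_super(sbinfo);
	sbinfo->blocks_free += nr;
	spin_unlock_reiser4_super(sbinfo);
}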
67142+
67143+extern __u64 reiser4_flush_reserved(const struct super_block *);
67144+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67145+extern long reiser4_statfs_type(const struct super_block *super);
67146+extern __u64 reiser4_block_count(const struct super_block *super);
67147+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67148+extern __u64 reiser4_data_blocks(const struct super_block *super);
67149+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67150+extern __u64 reiser4_free_blocks(const struct super_block *super);
67151+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67152+extern __u32 reiser4_mkfs_id(const struct super_block *super);
67153+
67154+extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67155+
67156+extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67157+extern __u64 reiser4_fake_allocated(const struct super_block *);
67158+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67159+extern __u64 reiser4_clustered_blocks(const struct super_block *);
67160+
67161+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67162+ gid_t gid);
67163+
67164+extern reiser4_space_allocator *
67165+reiser4_get_space_allocator(const struct super_block *super);
67166+extern reiser4_oid_allocator *
67167+reiser4_get_oid_allocator(const struct super_block *super);
67168+extern struct inode *reiser4_get_super_fake(const struct super_block *super);
67169+extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
67170+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
67171+extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
67172+extern int is_reiser4_super(const struct super_block *super);
67173+
67174+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67175+extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67176+ const reiser4_block_nr * blk);
67177+extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67178+extern int reiser4_done_super(struct super_block *s);
67179+
67180+/* step of fill super */
67181+extern int reiser4_init_fs_info(struct super_block *);
67182+extern void reiser4_done_fs_info(struct super_block *);
67183+extern int reiser4_init_super_data(struct super_block *, char *opt_string);
67184+extern int reiser4_init_read_super(struct super_block *, int silent);
67185+extern int reiser4_init_root_inode(struct super_block *);
67186+extern reiser4_plugin *get_default_plugin(pset_member memb);
67187+
67188+/* Maximal possible object id. */
67189+#define ABSOLUTE_MAX_OID ((oid_t)~0)
67190+
67191+#define OIDS_RESERVED ( 1 << 16 )
67192+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67193+oid_t oid_allocate(struct super_block *);
67194+int oid_release(struct super_block *, oid_t);
67195+oid_t oid_next(const struct super_block *);
67196+void oid_count_allocated(void);
67197+void oid_count_released(void);
67198+long oids_used(const struct super_block *);
67199+
67200+#if REISER4_DEBUG
67201+void print_fs_info(const char *prefix, const struct super_block *);
67202+#endif
67203+
67204+extern void destroy_reiser4_cache(struct kmem_cache **);
67205+
67206+extern struct super_operations reiser4_super_operations;
67207+extern struct export_operations reiser4_export_operations;
67208+extern struct dentry_operations reiser4_dentry_operations;
67209+
67210+/* __REISER4_SUPER_H__ */
67211+#endif
67212+
67213+/*
67214+ * Local variables:
67215+ * c-indentation-style: "K&R"
67216+ * mode-name: "LC"
67217+ * c-basic-offset: 8
67218+ * tab-width: 8
67219+ * fill-column: 120
67220+ * End:
67221+ */
67222diff -urN linux-2.6.22.orig/fs/reiser4/super_ops.c linux-2.6.22/fs/reiser4/super_ops.c
67223--- linux-2.6.22.orig/fs/reiser4/super_ops.c 1970-01-01 03:00:00.000000000 +0300
67224+++ linux-2.6.22/fs/reiser4/super_ops.c 2007-07-29 00:25:35.020733749 +0400
67225@@ -0,0 +1,725 @@
67226+/* Copyright 2005 by Hans Reiser, licensing governed by
67227+ * reiser4/README */
67228+
67229+#include "inode.h"
67230+#include "page_cache.h"
67231+#include "ktxnmgrd.h"
67232+#include "flush.h"
67233+#include "safe_link.h"
67234+
67235+#include <linux/vfs.h>
67236+#include <linux/writeback.h>
67237+#include <linux/mount.h>
67238+#include <linux/seq_file.h>
67239+#include <linux/debugfs.h>
67240+
67241+/* slab cache for inodes */
67242+static struct kmem_cache *inode_cache;
67243+
67244+static struct dentry *reiser4_debugfs_root = NULL;
67245+
67246+/**
67247+ * init_once - constructor for reiser4 inodes
67248+ * @obj: inode to be initialized
67249+ * @cache: cache @obj belongs to
67250+ * @flags: SLAB flags
67251+ *
67252+ * Initialization function to be called when a new page is allocated by the reiser4
67253+ * inode cache. It is set on inode cache creation.
67254+ */
67255+static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
67256+{
67257+ struct reiser4_inode_object *info;
67258+
67259+ info = obj;
67260+
67261+ /* initialize vfs inode */
67262+ inode_init_once(&info->vfs_inode);
67263+
67264+ /*
67265+ * initialize reiser4 specific part of inode.
67266+ * NOTE-NIKITA add here initializations for locks, list heads,
67267+ * etc. that will be added to our private inode part.
67268+ */
67269+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
67270+ init_rwsem(&info->p.conv_sem);
67271+ /* init semaphore which is used during inode loading */
67272+ loading_init_once(&info->p);
67273+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
67274+ GFP_ATOMIC);
67275+#if REISER4_DEBUG
67276+ info->p.nr_jnodes = 0;
67277+#endif
67278+}
67279+
67280+/**
67281+ * init_inodes - create inode cache
67282+ *
67283+ * Initializes slab cache of inodes. It is part of reiser4 module initialization.
67284+ */
67285+static int init_inodes(void)
67286+{
67287+ inode_cache = kmem_cache_create("reiser4_inode",
67288+ sizeof(struct reiser4_inode_object),
67289+ 0,
67290+ SLAB_HWCACHE_ALIGN |
67291+ SLAB_RECLAIM_ACCOUNT, init_once, NULL);
67292+ if (inode_cache == NULL)
67293+ return RETERR(-ENOMEM);
67294+ return 0;
67295+}
67296+
67297+/**
67298+ * done_inodes - delete inode cache
67299+ *
67300+ * This is called on reiser4 module unloading or system shutdown.
67301+ */
67302+static void done_inodes(void)
67303+{
67304+ destroy_reiser4_cache(&inode_cache);
67305+}
67306+
67307+/**
67308+ * reiser4_alloc_inode - alloc_inode of super operations
67309+ * @super: super block new inode is allocated for
67310+ *
67311+ * Allocates new inode, initializes reiser4 specific part of it.
67312+ */
67313+static struct inode *reiser4_alloc_inode(struct super_block *super)
67314+{
67315+ struct reiser4_inode_object *obj;
67316+
67317+ assert("nikita-1696", super != NULL);
67318+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
67319+ if (obj != NULL) {
67320+ reiser4_inode *info;
67321+
67322+ info = &obj->p;
67323+
67324+ info->pset = plugin_set_get_empty();
67325+ info->hset = plugin_set_get_empty();
67326+ info->extmask = 0;
67327+ info->locality_id = 0ull;
67328+ info->plugin_mask = 0;
67329+ info->heir_mask = 0;
67330+#if !REISER4_INO_IS_OID
67331+ info->oid_hi = 0;
67332+#endif
67333+ reiser4_seal_init(&info->sd_seal, NULL, NULL);
67334+ coord_init_invalid(&info->sd_coord, NULL);
67335+ info->flags = 0;
67336+ spin_lock_init(&info->guard);
67337+ /* this deals with info's loading semaphore */
67338+ loading_alloc(info);
67339+ info->vroot = UBER_TREE_ADDR;
67340+ return &obj->vfs_inode;
67341+ } else
67342+ return NULL;
67343+}
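
/*
 * Illustrative sketch, not part of the original patch: the VFS inode is
 * embedded in struct reiser4_inode_object, so recovering the reiser4
 * part from a struct inode is the usual container_of() idiom. This is
 * presumably what the reiser4_inode_data() helper used below boils down
 * to; the name example_inode_data is hypothetical to avoid clashing:
 */
static inline reiser4_inode *example_inode_data(struct inode *inode)
{
	return &container_of(inode, struct reiser4_inode_object,
			     vfs_inode)->p;
}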
67344+
67345+/**
67346+ * reiser4_destroy_inode - destroy_inode of super operations
67347+ * @inode: inode being destroyed
67348+ *
67349+ * Puts reiser4 specific portion of inode, frees memory occupied by inode.
67350+ */
67351+static void reiser4_destroy_inode(struct inode *inode)
67352+{
67353+ reiser4_inode *info;
67354+
67355+ info = reiser4_inode_data(inode);
67356+
67357+ assert("vs-1220", inode_has_no_jnodes(info));
67358+
67359+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
67360+ file_plugin *fplug = inode_file_plugin(inode);
67361+ if (fplug->destroy_inode != NULL)
67362+ fplug->destroy_inode(inode);
67363+ }
67364+ reiser4_dispose_cursors(inode);
67365+ if (info->pset)
67366+ plugin_set_put(info->pset);
67367+ if (info->hset)
67368+ plugin_set_put(info->hset);
67369+
67370+ /*
67371+ * cannot add similar assertion about ->i_list as prune_icache returns
67372+ * inode into slab with dangling ->list.{next,prev}. This is safe,
67373+ * because they are re-initialized in the new_inode().
67374+ */
67375+ assert("nikita-2895", list_empty(&inode->i_dentry));
67376+ assert("nikita-2896", hlist_unhashed(&inode->i_hash));
67377+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
67378+
67379+ /* this deals with info's loading semaphore */
67380+ loading_destroy(info);
67381+
67382+ kmem_cache_free(inode_cache,
67383+ container_of(info, struct reiser4_inode_object, p));
67384+}
67385+
67386+/**
67387+ * reiser4_dirty_inode - dirty_inode of super operations
67388+ * @inode: inode being dirtied
67389+ *
67390+ * Updates stat data.
67391+ */
67392+static void reiser4_dirty_inode(struct inode *inode)
67393+{
67394+ int result;
67395+
67396+ if (!is_in_reiser4_context())
67397+ return;
67398+ assert("", !IS_RDONLY(inode));
67399+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
67400+ get_current_context()->grabbed_blocks));
67401+
67402+ result = reiser4_update_sd(inode);
67403+ if (result)
67404+ warning("", "failed to dirty inode for %llu: %d",
67405+ get_inode_oid(inode), result);
67406+}
67407+
67408+/**
67409+ * reiser4_delete_inode - delete_inode of super operations
67410+ * @inode: inode to delete
67411+ *
67412+ * Calls file plugin's delete_object method to delete object items from
67413+ * filesystem tree and calls clear_inode.
67414+ */
67415+static void reiser4_delete_inode(struct inode *inode)
67416+{
67417+ reiser4_context *ctx;
67418+ file_plugin *fplug;
67419+
67420+ ctx = reiser4_init_context(inode->i_sb);
67421+ if (IS_ERR(ctx)) {
67422+ warning("vs-15", "failed to init context");
67423+ return;
67424+ }
67425+
67426+ if (is_inode_loaded(inode)) {
67427+ fplug = inode_file_plugin(inode);
67428+ if (fplug != NULL && fplug->delete_object != NULL)
67429+ fplug->delete_object(inode);
67430+ }
67431+
67432+ truncate_inode_pages(&inode->i_data, 0);
67433+ inode->i_blocks = 0;
67434+ clear_inode(inode);
67435+ reiser4_exit_context(ctx);
67436+}
67437+
67438+/**
67439+ * reiser4_put_super - put_super of super operations
67440+ * @super: super block to free
67441+ *
67442+ * Stops daemons and releases resources; in short, unmounts.
67443+ */
67444+static void reiser4_put_super(struct super_block *super)
67445+{
67446+ reiser4_super_info_data *sbinfo;
67447+ reiser4_context *ctx;
67448+
67449+ sbinfo = get_super_private(super);
67450+ assert("vs-1699", sbinfo);
67451+
67452+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
67453+ debugfs_remove(sbinfo->tmgr.debugfs_id_count);
67454+ debugfs_remove(sbinfo->debugfs_root);
67455+
67456+ ctx = reiser4_init_context(super);
67457+ if (IS_ERR(ctx)) {
67458+ warning("vs-17", "failed to init context");
67459+ return;
67460+ }
67461+
67462+ /* have disk format plugin to free its resources */
67463+ if (get_super_private(super)->df_plug->release)
67464+ get_super_private(super)->df_plug->release(super);
67465+
67466+ reiser4_done_formatted_fake(super);
67467+
67468+ /* stop daemons: ktxnmgr and entd */
67469+ reiser4_done_entd(super);
67470+ reiser4_done_ktxnmgrd(super);
67471+ reiser4_done_txnmgr(&sbinfo->tmgr);
67472+
67473+ reiser4_done_fs_info(super);
67474+ reiser4_exit_context(ctx);
67475+}
67476+
67477+/**
67478+ * reiser4_write_super - write_super of super operations
67479+ * @super: super block to write
67480+ *
67481+ * Captures znode associated with super block, commits all transactions.
67482+ */
67483+static void reiser4_write_super(struct super_block *super)
67484+{
67485+ int ret;
67486+ reiser4_context *ctx;
67487+
67488+ assert("vs-1700", !rofs_super(super));
67489+
67490+ ctx = reiser4_init_context(super);
67491+ if (IS_ERR(ctx)) {
67492+ warning("vs-16", "failed to init context");
67493+ return;
67494+ }
67495+
67496+ ret = reiser4_capture_super_block(super);
67497+ if (ret != 0)
67498+ warning("vs-1701",
67499+ "reiser4_capture_super_block failed in write_super: %d",
67500+ ret);
67501+ ret = txnmgr_force_commit_all(super, 0);
67502+ if (ret != 0)
67503+ warning("jmacd-77113",
67504+ "txn_force failed in write_super: %d", ret);
67505+
67506+ super->s_dirt = 0;
67507+
67508+ reiser4_exit_context(ctx);
67509+}
67510+
67511+/**
67512+ * reiser4_statfs - statfs of super operations
67513+ * @super: super block of queried file system
67514+ * @statfs: buffer to fill with statistics
67515+ *
67516+ * Returns information about filesystem.
67517+ */
67518+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
67519+{
67520+ sector_t total;
67521+ sector_t reserved;
67522+ sector_t free;
67523+ sector_t forroot;
67524+ sector_t deleted;
67525+ reiser4_context *ctx;
67526+ struct super_block *super = dentry->d_sb;
67527+
67528+ assert("nikita-408", super != NULL);
67529+ assert("nikita-409", statfs != NULL);
67530+
67531+ ctx = reiser4_init_context(super);
67532+ if (IS_ERR(ctx))
67533+ return PTR_ERR(ctx);
67534+
67535+ statfs->f_type = reiser4_statfs_type(super);
67536+ statfs->f_bsize = super->s_blocksize;
67537+
67538+ /*
67539+ * 5% of total block space is reserved. This is needed for flush and
67540+ * for truncates (so that we are able to perform truncate/unlink even
67541+ * on the otherwise completely full file system). If this reservation
67542+ * is hidden from statfs(2), users will mistakenly guess that they
67543+ * have enough free space to complete some operation, which is
67544+ * frustrating.
67545+ *
67546+ * Another possible solution is to subtract ->blocks_reserved from
67547+ * ->f_bfree, but changing available space seems less intrusive than
67548+ * letting the user see 5% of disk space in use directly after
67549+ * mkfs.
67550+ */
67551+ total = reiser4_block_count(super);
67552+ reserved = get_super_private(super)->blocks_reserved;
67553+ deleted = txnmgr_count_deleted_blocks();
67554+ free = reiser4_free_blocks(super) + deleted;
67555+ forroot = reiser4_reserved_blocks(super, 0, 0);
67556+
67557+ /*
67558+ * These counters may be in inconsistent state because we take the
67559+ * values without keeping any global spinlock. Here we do a sanity
67560+ * check that free block counter does not exceed the number of all
67561+ * blocks.
67562+ */
67563+ if (free > total)
67564+ free = total;
67565+ statfs->f_blocks = total - reserved;
67566+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
67567+ if (free > reserved)
67568+ free -= reserved;
67569+ else
67570+ free = 0;
67571+ statfs->f_bfree = free;
67572+
67573+ if (free > forroot)
67574+ free -= forroot;
67575+ else
67576+ free = 0;
67577+ statfs->f_bavail = free;
67578+
67579+ statfs->f_files = 0;
67580+ statfs->f_ffree = 0;
67581+
67582+ /* maximal acceptable name length depends on directory plugin. */
67583+ assert("nikita-3351", super->s_root->d_inode != NULL);
67584+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
67585+ reiser4_exit_context(ctx);
67586+ return 0;
67587+}
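
/*
 * Worked example of the statfs arithmetic above, with illustrative
 * numbers only: total = 1000000, reserved = 49804 (the ~5% computed by
 * reiser4_set_block_count), deleted = 0, free = 200000, forroot = 0:
 *
 *	f_blocks = total - reserved  = 950196
 *	f_bfree  = free - reserved   = 150196
 *	f_bavail = f_bfree - forroot = 150196
 *
 * The reserve is thus subtracted from both the total and the free
 * counts instead of being silently reported as available space.
 */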
67588+
67589+/**
67590+ * reiser4_clear_inode - clear_inode of super operation
67591+ * @inode: inode about to be destroyed
67592+ *
67593+ * Does sanity checks: inode being destroyed should have all jnodes detached.
67594+ */
67595+static void reiser4_clear_inode(struct inode *inode)
67596+{
67597+#if REISER4_DEBUG
67598+ reiser4_inode *r4_inode;
67599+
67600+ r4_inode = reiser4_inode_data(inode);
67601+ if (!inode_has_no_jnodes(r4_inode))
67602+ warning("vs-1732", "reiser4 inode has %ld jnodes\n",
67603+ r4_inode->nr_jnodes);
67604+#endif
67605+}
67606+
67607+/**
67608+ * reiser4_sync_inodes - sync_inodes of super operations
67609+ * @super:
67610+ * @wbc:
67611+ *
67612+ * This method is called by background and non-background writeback. Reiser4's
67613+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
67614+ * each dirty inode. reiser4_writepages handles pages dirtied via shared
67615+ * mapping - dirty pages get into atoms. Writeout is called to flush some
67616+ * atoms.
67617+ */
67618+static void reiser4_sync_inodes(struct super_block *super,
67619+ struct writeback_control *wbc)
67620+{
67621+ reiser4_context *ctx;
67622+ long to_write;
67623+
67624+ if (wbc->for_kupdate)
67625+ /* reiser4 has its own means of periodical write-out */
67626+ return;
67627+
67628+ to_write = wbc->nr_to_write;
67629+ assert("vs-49", wbc->older_than_this == NULL);
67630+
67631+ ctx = reiser4_init_context(super);
67632+ if (IS_ERR(ctx)) {
67633+ warning("vs-13", "failed to init context");
67634+ return;
67635+ }
67636+
67637+ /*
67638+ * call reiser4_writepages for each of dirty inodes to turn dirty pages
67639+ * into transactions if they were not yet.
67640+ */
67641+ generic_sync_sb_inodes(super, wbc);
67642+
67643+ /* flush goes here */
67644+ wbc->nr_to_write = to_write;
67645+ reiser4_writeout(super, wbc);
67646+
67647+ /* avoid recursive calls to ->sync_inodes */
67648+ context_set_commit_async(ctx);
67649+ reiser4_exit_context(ctx);
67650+}
67651+
67652+/**
67653+ * reiser4_show_options - show_options of super operations
67654+ * @m: file where to write information
67655+ * @mnt: mount structure
67656+ *
67657+ * Makes reiser4 mount options visible in /proc/mounts.
67658+ */
67659+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
67660+{
67661+ struct super_block *super;
67662+ reiser4_super_info_data *sbinfo;
67663+
67664+ super = mnt->mnt_sb;
67665+ sbinfo = get_super_private(super);
67666+
67667+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
67668+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
67669+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
67670+ seq_printf(m, ",atom_max_flushers=0x%x",
67671+ sbinfo->tmgr.atom_max_flushers);
67672+ seq_printf(m, ",cbk_cache_slots=0x%x",
67673+ sbinfo->tree.cbk_cache.nr_slots);
67674+
67675+ return 0;
67676+}
67677+
67678+struct super_operations reiser4_super_operations = {
67679+ .alloc_inode = reiser4_alloc_inode,
67680+ .destroy_inode = reiser4_destroy_inode,
67681+ .dirty_inode = reiser4_dirty_inode,
67682+ .delete_inode = reiser4_delete_inode,
67683+ .put_super = reiser4_put_super,
67684+ .write_super = reiser4_write_super,
67685+ .statfs = reiser4_statfs,
67686+ .clear_inode = reiser4_clear_inode,
67687+ .sync_inodes = reiser4_sync_inodes,
67688+ .show_options = reiser4_show_options
67689+};
67690+
67691+/**
67692+ * fill_super - initialize super block on mount
67693+ * @super: super block to fill
67694+ * @data: reiser4 specific mount option
67695+ * @silent:
67696+ *
67697+ * This is to be called by reiser4_get_sb. Mounts filesystem.
67698+ */
67699+static int fill_super(struct super_block *super, void *data, int silent)
67700+{
67701+ reiser4_context ctx;
67702+ int result;
67703+ reiser4_super_info_data *sbinfo;
67704+
67705+ assert("zam-989", super != NULL);
67706+
67707+ super->s_op = NULL;
67708+ init_stack_context(&ctx, super);
67709+
67710+ /* allocate reiser4 specific super block */
67711+ if ((result = reiser4_init_fs_info(super)) != 0)
67712+ goto failed_init_sinfo;
67713+
67714+ sbinfo = get_super_private(super);
67715+ /* initialize various reiser4 parameters, parse mount options */
67716+ if ((result = reiser4_init_super_data(super, data)) != 0)
67717+ goto failed_init_super_data;
67718+
67719+ /* read reiser4 master super block, initialize disk format plugin */
67720+ if ((result = reiser4_init_read_super(super, silent)) != 0)
67721+ goto failed_init_read_super;
67722+
67723+ /* initialize transaction manager */
67724+ reiser4_init_txnmgr(&sbinfo->tmgr);
67725+
67726+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
67727+ if ((result = reiser4_init_ktxnmgrd(super)) != 0)
67728+ goto failed_init_ktxnmgrd;
67729+
67730+ /* initialize entd context and start kernel thread entd */
67731+ if ((result = reiser4_init_entd(super)) != 0)
67732+ goto failed_init_entd;
67733+
67734+ /* initialize address spaces for formatted nodes and bitmaps */
67735+ if ((result = reiser4_init_formatted_fake(super)) != 0)
67736+ goto failed_init_formatted_fake;
67737+
67738+ /* initialize disk format plugin */
67739+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
67740+ goto failed_init_disk_format;
67741+
67742+ /*
67743+ * There are some 'committed' versions of reiser4 super block counters,
67744+ * which correspond to reiser4 on-disk state. These counters are
67745+ * initialized here
67746+ */
67747+ sbinfo->blocks_free_committed = sbinfo->blocks_free;
67748+ sbinfo->nr_files_committed = oids_used(super);
67749+
67750+ /* get inode of root directory */
67751+ if ((result = reiser4_init_root_inode(super)) != 0)
67752+ goto failed_init_root_inode;
67753+
67754+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
67755+ goto failed_update_format_version;
67756+
67757+ process_safelinks(super);
67758+ reiser4_exit_context(&ctx);
67759+
67760+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
67761+ reiser4_debugfs_root);
67762+ if (sbinfo->debugfs_root) {
67763+ sbinfo->tmgr.debugfs_atom_count =
67764+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
67765+ sbinfo->debugfs_root,
67766+ &sbinfo->tmgr.atom_count);
67767+ sbinfo->tmgr.debugfs_id_count =
67768+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
67769+ sbinfo->debugfs_root,
67770+ &sbinfo->tmgr.id_count);
67771+ }
67772+ return 0;
67773+
67774+ failed_update_format_version:
67775+ failed_init_root_inode:
67776+ if (sbinfo->df_plug->release)
67777+ sbinfo->df_plug->release(super);
67778+ failed_init_disk_format:
67779+ reiser4_done_formatted_fake(super);
67780+ failed_init_formatted_fake:
67781+ reiser4_done_entd(super);
67782+ failed_init_entd:
67783+ reiser4_done_ktxnmgrd(super);
67784+ failed_init_ktxnmgrd:
67785+ reiser4_done_txnmgr(&sbinfo->tmgr);
67786+ failed_init_read_super:
67787+ failed_init_super_data:
67788+ reiser4_done_fs_info(super);
67789+ failed_init_sinfo:
67790+ reiser4_exit_context(&ctx);
67791+ return result;
67792+}
67793+
67794+/**
67795+ * reiser4_get_sb - get_sb of file_system_type operations
67796+ * @fs_type:
67797+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
67798+ * @dev_name: block device file name
67799+ * @data: specific mount options
67800+ *
67801+ * Reiser4 mount entry.
67802+ */
67803+static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
67804+ const char *dev_name, void *data, struct vfsmount *mnt)
67805+{
67806+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
67807+}
67808+
67809+/* structure describing the reiser4 filesystem implementation */
67810+static struct file_system_type reiser4_fs_type = {
67811+ .owner = THIS_MODULE,
67812+ .name = "reiser4",
67813+ .fs_flags = FS_REQUIRES_DEV,
67814+ .get_sb = reiser4_get_sb,
67815+ .kill_sb = kill_block_super,
67816+ .next = NULL
67817+};
67818+
67819+void destroy_reiser4_cache(struct kmem_cache **cachep)
67820+{
67821+ BUG_ON(*cachep == NULL);
67822+ kmem_cache_destroy(*cachep);
67823+ *cachep = NULL;
67824+}
67825+
67826+/**
67827+ * init_reiser4 - reiser4 initialization entry point
67828+ *
67829+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
67830+ * on kernel initialization or during reiser4 module load.
67831+ */
67832+static int __init init_reiser4(void)
67833+{
67834+ int result;
67835+
67836+ printk(KERN_INFO
67837+ "Loading Reiser4. "
67838+ "See www.namesys.com for a description of Reiser4.\n");
67839+
67840+ /* initialize slab cache of inodes */
67841+ if ((result = init_inodes()) != 0)
67842+ goto failed_inode_cache;
67843+
67844+ /* initialize cache of znodes */
67845+ if ((result = init_znodes()) != 0)
67846+ goto failed_init_znodes;
67847+
67848+ /* initialize all plugins */
67849+ if ((result = init_plugins()) != 0)
67850+ goto failed_init_plugins;
67851+
67852+ /* initialize cache of plugin_set-s and plugin_set's hash table */
67853+ if ((result = init_plugin_set()) != 0)
67854+ goto failed_init_plugin_set;
67855+
67856+ /* initialize caches of txn_atom-s and txn_handle-s */
67857+ if ((result = init_txnmgr_static()) != 0)
67858+ goto failed_init_txnmgr_static;
67859+
67860+ /* initialize cache of jnodes */
67861+ if ((result = init_jnodes()) != 0)
67862+ goto failed_init_jnodes;
67863+
67864+ /* initialize cache of flush queues */
67865+ if ((result = reiser4_init_fqs()) != 0)
67866+ goto failed_init_fqs;
67867+
67868+ /* initialize cache of structures attached to dentry->d_fsdata */
67869+ if ((result = reiser4_init_dentry_fsdata()) != 0)
67870+ goto failed_init_dentry_fsdata;
67871+
67872+ /* initialize cache of structures attached to file->private_data */
67873+ if ((result = reiser4_init_file_fsdata()) != 0)
67874+ goto failed_init_file_fsdata;
67875+
67876+ /*
67877+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
67878+ * more details
67879+ */
67880+ if ((result = reiser4_init_d_cursor()) != 0)
67881+ goto failed_init_d_cursor;
67882+
67883+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
67884+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
67885+ return 0;
67886+ }
67887+
67888+ reiser4_done_d_cursor();
67889+ failed_init_d_cursor:
67890+ reiser4_done_file_fsdata();
67891+ failed_init_file_fsdata:
67892+ reiser4_done_dentry_fsdata();
67893+ failed_init_dentry_fsdata:
67894+ reiser4_done_fqs();
67895+ failed_init_fqs:
67896+ done_jnodes();
67897+ failed_init_jnodes:
67898+ done_txnmgr_static();
67899+ failed_init_txnmgr_static:
67900+ done_plugin_set();
67901+ failed_init_plugin_set:
67902+ failed_init_plugins:
67903+ done_znodes();
67904+ failed_init_znodes:
67905+ done_inodes();
67906+ failed_inode_cache:
67907+ return result;
67908+}
67909+
67910+/**
67911+ * done_reiser4 - reiser4 exit entry point
67912+ *
67913+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
67914+ * or at module unload.
67915+ */
67916+static void __exit done_reiser4(void)
67917+{
67918+ int result;
67919+
67920+ debugfs_remove(reiser4_debugfs_root);
67921+ result = unregister_filesystem(&reiser4_fs_type);
67922+ BUG_ON(result != 0);
67923+ reiser4_done_d_cursor();
67924+ reiser4_done_file_fsdata();
67925+ reiser4_done_dentry_fsdata();
67926+ reiser4_done_fqs();
67927+ done_jnodes();
67928+ done_txnmgr_static();
67929+ done_plugin_set();
67930+ done_znodes();
67931+ destroy_reiser4_cache(&inode_cache);
67932+}
67933+
67934+module_init(init_reiser4);
67935+module_exit(done_reiser4);
67936+
67937+MODULE_DESCRIPTION("Reiser4 filesystem");
67938+MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
67939+
67940+MODULE_LICENSE("GPL");
67941+
67942+/*
67943+ * Local variables:
67944+ * c-indentation-style: "K&R"
67945+ * mode-name: "LC"
67946+ * c-basic-offset: 8
67947+ * tab-width: 8
67948+ * fill-column: 79
67949+ * End:
67950+ */
67951diff -urN linux-2.6.22.orig/fs/reiser4/tap.c linux-2.6.22/fs/reiser4/tap.c
67952--- linux-2.6.22.orig/fs/reiser4/tap.c 1970-01-01 03:00:00.000000000 +0300
67953+++ linux-2.6.22/fs/reiser4/tap.c 2007-07-29 00:25:35.024734784 +0400
67954@@ -0,0 +1,377 @@
67955+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
67956+ * reiser4/README */
67957+
67958+/*
67959+ Tree Access Pointer (tap).
67960+
67961+ A tap is a data structure combining a coord and a lock handle (mostly).
67962+ It is useful when one has to scan tree nodes (for example, in readdir
67963+ or flush), since tap functions allow moving the tap in either direction,
67964+ transparently crossing unit/item/node borders.
67965+
67966+ A tap doesn't provide automatic synchronization of its fields as it is
67967+ supposed to be a per-thread object.
67968+*/
67969+
67970+#include "forward.h"
67971+#include "debug.h"
67972+#include "coord.h"
67973+#include "tree.h"
67974+#include "context.h"
67975+#include "tap.h"
67976+#include "znode.h"
67977+#include "tree_walk.h"
67978+
67979+#if REISER4_DEBUG
67980+static int tap_invariant(const tap_t * tap);
67981+static void tap_check(const tap_t * tap);
67982+#else
67983+#define tap_check(tap) noop
67984+#endif
67985+
67986+/** load node tap is pointing to, if not loaded already */
67987+int reiser4_tap_load(tap_t * tap)
67988+{
67989+ tap_check(tap);
67990+ if (tap->loaded == 0) {
67991+ int result;
67992+
67993+ result = zload_ra(tap->coord->node, &tap->ra_info);
67994+ if (result != 0)
67995+ return result;
67996+ coord_clear_iplug(tap->coord);
67997+ }
67998+ ++tap->loaded;
67999+ tap_check(tap);
68000+ return 0;
68001+}
68002+
68003+/** release node tap is pointing to. Dual to reiser4_tap_load() */
68004+void reiser4_tap_relse(tap_t * tap)
68005+{
68006+ tap_check(tap);
68007+ if (tap->loaded > 0) {
68008+ --tap->loaded;
68009+ if (tap->loaded == 0) {
68010+ zrelse(tap->coord->node);
68011+ }
68012+ }
68013+ tap_check(tap);
68014+}
68015+
68016+/**
68017+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68018+ * @mode
68019+ */
68020+void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68021+ znode_lock_mode mode)
68022+{
68023+ tap->coord = coord;
68024+ tap->lh = lh;
68025+ tap->mode = mode;
68026+ tap->loaded = 0;
68027+ INIT_LIST_HEAD(&tap->linkage);
68028+ reiser4_init_ra_info(&tap->ra_info);
68029+}
68030+
68031+/** add @tap to the per-thread list of all taps */
68032+void reiser4_tap_monitor(tap_t * tap)
68033+{
68034+ assert("nikita-2623", tap != NULL);
68035+ tap_check(tap);
68036+ list_add(&tap->linkage, reiser4_taps_list());
68037+ tap_check(tap);
68038+}
68039+
68040+/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68041+ * loaded. */
68042+void reiser4_tap_copy(tap_t * dst, tap_t * src)
68043+{
68044+ assert("nikita-3193", src != NULL);
68045+ assert("nikita-3194", dst != NULL);
68046+
68047+ *dst->coord = *src->coord;
68048+ if (src->lh->node)
68049+ copy_lh(dst->lh, src->lh);
68050+ dst->mode = src->mode;
68051+ dst->loaded = 0;
68052+ INIT_LIST_HEAD(&dst->linkage);
68053+ dst->ra_info = src->ra_info;
68054+}
68055+
68056+/** finish with @tap */
68057+void reiser4_tap_done(tap_t * tap)
68058+{
68059+ assert("nikita-2565", tap != NULL);
68060+ tap_check(tap);
68061+ if (tap->loaded > 0)
68062+ zrelse(tap->coord->node);
68063+ done_lh(tap->lh);
68064+ tap->loaded = 0;
68065+ list_del_init(&tap->linkage);
68066+ tap->coord->node = NULL;
68067+}
68068+
68069+/**
68070+ * move @tap to the new node, locked with @target. Load @target, if @tap was
68071+ * already loaded.
68072+ */
68073+int reiser4_tap_move(tap_t * tap, lock_handle * target)
68074+{
68075+ int result = 0;
68076+
68077+ assert("nikita-2567", tap != NULL);
68078+ assert("nikita-2568", target != NULL);
68079+ assert("nikita-2570", target->node != NULL);
68080+ assert("nikita-2569", tap->coord->node == tap->lh->node);
68081+
68082+ tap_check(tap);
68083+ if (tap->loaded > 0)
68084+ result = zload_ra(target->node, &tap->ra_info);
68085+
68086+ if (result == 0) {
68087+ if (tap->loaded > 0)
68088+ zrelse(tap->coord->node);
68089+ done_lh(tap->lh);
68090+ copy_lh(tap->lh, target);
68091+ tap->coord->node = target->node;
68092+ coord_clear_iplug(tap->coord);
68093+ }
68094+ tap_check(tap);
68095+ return result;
68096+}
68097+
68098+/**
68099+ * move @tap to @target. Acquire lock on @target, if @tap was already
68100+ * loaded.
68101+ */
68102+static int tap_to(tap_t * tap, znode * target)
68103+{
68104+ int result;
68105+
68106+ assert("nikita-2624", tap != NULL);
68107+ assert("nikita-2625", target != NULL);
68108+
68109+ tap_check(tap);
68110+ result = 0;
68111+ if (tap->coord->node != target) {
68112+ lock_handle here;
68113+
68114+ init_lh(&here);
68115+ result = longterm_lock_znode(&here, target,
68116+ tap->mode, ZNODE_LOCK_HIPRI);
68117+ if (result == 0) {
68118+ result = reiser4_tap_move(tap, &here);
68119+ done_lh(&here);
68120+ }
68121+ }
68122+ tap_check(tap);
68123+ return result;
68124+}
68125+
68126+/**
68127+ * move @tap to given @target, loading and locking @target->node if
68128+ * necessary
68129+ */
68130+int tap_to_coord(tap_t * tap, coord_t * target)
68131+{
68132+ int result;
68133+
68134+ tap_check(tap);
68135+ result = tap_to(tap, target->node);
68136+ if (result == 0)
68137+ coord_dup(tap->coord, target);
68138+ tap_check(tap);
68139+ return result;
68140+}
68141+
68142+/** return list of all taps */
68143+struct list_head *reiser4_taps_list(void)
68144+{
68145+ return &get_current_context()->taps;
68146+}
68147+
68148+/** helper function for go_{next,prev}_{item,unit,node}() */
68149+int go_dir_el(tap_t * tap, sideof dir, int units_p)
68150+{
68151+ coord_t dup;
68152+ coord_t *coord;
68153+ int result;
68154+
68155+ int (*coord_dir) (coord_t *);
68156+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68157+ void (*coord_init) (coord_t *, const znode *);
68158+ ON_DEBUG(int (*coord_check) (const coord_t *));
68159+
68160+ assert("nikita-2556", tap != NULL);
68161+ assert("nikita-2557", tap->coord != NULL);
68162+ assert("nikita-2558", tap->lh != NULL);
68163+ assert("nikita-2559", tap->coord->node != NULL);
68164+
68165+ tap_check(tap);
68166+ if (dir == LEFT_SIDE) {
68167+ coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68168+ get_dir_neighbor = reiser4_get_left_neighbor;
68169+ coord_init = coord_init_last_unit;
68170+ } else {
68171+ coord_dir = units_p ? coord_next_unit : coord_next_item;
68172+ get_dir_neighbor = reiser4_get_right_neighbor;
68173+ coord_init = coord_init_first_unit;
68174+ }
68175+ ON_DEBUG(coord_check =
68176+ units_p ? coord_is_existing_unit : coord_is_existing_item);
68177+ assert("nikita-2560", coord_check(tap->coord));
68178+
68179+ coord = tap->coord;
68180+ coord_dup(&dup, coord);
68181+ if (coord_dir(&dup) != 0) {
68182+ do {
68183+ /* move to the neighboring node in the scan direction */
68184+ lock_handle dup;
68185+
68186+ init_lh(&dup);
68187+ result =
68188+ get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68189+ GN_CAN_USE_UPPER_LEVELS);
68190+ if (result == 0) {
68191+ result = reiser4_tap_move(tap, &dup);
68192+ if (result == 0)
68193+ coord_init(tap->coord, dup.node);
68194+ done_lh(&dup);
68195+ }
68196+ /* skip empty nodes */
68197+ } while ((result == 0) && node_is_empty(coord->node));
68198+ } else {
68199+ result = 0;
68200+ coord_dup(coord, &dup);
68201+ }
68202+ assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68203+ tap_check(tap);
68204+ return result;
68205+}
68206+
68207+/**
68208+ * move @tap to the next unit, transparently crossing item and node
68209+ * boundaries
68210+ */
68211+int go_next_unit(tap_t * tap)
68212+{
68213+ return go_dir_el(tap, RIGHT_SIDE, 1);
68214+}
68215+
68216+/**
68217+ * move @tap to the previous unit, transparently crossing item and node
68218+ * boundaries
68219+ */
68220+int go_prev_unit(tap_t * tap)
68221+{
68222+ return go_dir_el(tap, LEFT_SIDE, 1);
68223+}
68224+
68225+/**
68226+ * @shift times apply @actor to the @tap. This is used to move @tap by
68227+ * @shift units (or items, or nodes) in either direction.
68228+ */
68229+static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
68230+{
68231+ int result;
68232+
68233+ assert("nikita-2555", shift >= 0);
68234+ assert("nikita-2562", tap->coord->node == tap->lh->node);
68235+
68236+ tap_check(tap);
68237+ result = reiser4_tap_load(tap);
68238+ if (result != 0)
68239+ return result;
68240+
68241+ for (; shift > 0; --shift) {
68242+ result = actor(tap);
68243+ assert("nikita-2563", tap->coord->node == tap->lh->node);
68244+ if (result != 0)
68245+ break;
68246+ }
68247+ reiser4_tap_relse(tap);
68248+ tap_check(tap);
68249+ return result;
68250+}
68251+
68252+/** move @tap @shift units rightward */
68253+int rewind_right(tap_t * tap, int shift)
68254+{
68255+ return rewind_to(tap, go_next_unit, shift);
68256+}
68257+
68258+/** move @tap @shift units leftward */
68259+int rewind_left(tap_t * tap, int shift)
68260+{
68261+ return rewind_to(tap, go_prev_unit, shift);
68262+}
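
/*
 * Minimal usage sketch, not part of the original patch: a typical tap
 * life cycle around the helpers above, assuming @coord and @lh were set
 * up by an earlier tree lookup. The function name is hypothetical.
 */
static int example_skip_two_units(coord_t *coord, lock_handle *lh)
{
	tap_t tap;
	int result;

	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
	/* rewind_right() loads and releases the node internally */
	result = rewind_right(&tap, 2);
	reiser4_tap_done(&tap);
	return result;
}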
68263+
68264+#if REISER4_DEBUG
68265+/** debugging function: print @tap content in human readable form */
68266+static void print_tap(const char *prefix, const tap_t * tap)
68267+{
68268+ if (tap == NULL) {
68269+ printk("%s: null tap\n", prefix);
68270+ return;
68271+ }
68272+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
68273+ tap->loaded, (&tap->linkage == tap->linkage.next &&
68274+ &tap->linkage == tap->linkage.prev),
68275+ tap->lh->node,
68276+ lock_mode_name(tap->mode));
68277+ print_coord("\tcoord", tap->coord, 0);
68278+}
68279+
68280+/** check [tap-sane] invariant */
68281+static int tap_invariant(const tap_t * tap)
68282+{
68283+ /* [tap-sane] invariant */
68284+
68285+ if (tap == NULL)
68286+ return 1;
68287+ /* tap->mode is one of
68288+ *
68289+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
68290+ */
68291+ if (tap->mode != ZNODE_NO_LOCK &&
68292+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
68293+ return 2;
68294+ /* tap->coord != NULL, and */
68295+ if (tap->coord == NULL)
68296+ return 3;
68297+ /* tap->lh != NULL, and */
68298+ if (tap->lh == NULL)
68299+ return 4;
68300+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
68301+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
68302+ return 5;
68303+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
68304+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
68305+ return 6;
68306+ return 0;
68307+}
68308+
68309+/** debugging function: check internal @tap consistency */
68310+static void tap_check(const tap_t * tap)
68311+{
68312+ int result;
68313+
68314+ result = tap_invariant(tap);
68315+ if (result != 0) {
68316+ print_tap("broken", tap);
68317+ reiser4_panic("nikita-2831", "tap broken: %i\n", result);
68318+ }
68319+}
68320+#endif
68321+
68322+/* Make Linus happy.
68323+ Local variables:
68324+ c-indentation-style: "K&R"
68325+ mode-name: "LC"
68326+ c-basic-offset: 8
68327+ tab-width: 8
68328+ fill-column: 120
68329+ scroll-step: 1
68330+ End:
68331+*/
68332diff -urN linux-2.6.22.orig/fs/reiser4/tap.h linux-2.6.22/fs/reiser4/tap.h
68333--- linux-2.6.22.orig/fs/reiser4/tap.h 1970-01-01 03:00:00.000000000 +0300
68334+++ linux-2.6.22/fs/reiser4/tap.h 2007-07-29 00:25:35.024734784 +0400
68335@@ -0,0 +1,70 @@
68336+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68337+
68338+/* Tree Access Pointers. See tap.c for more details. */
68339+
68340+#if !defined( __REISER4_TAP_H__ )
68341+#define __REISER4_TAP_H__
68342+
68343+#include "forward.h"
68344+#include "readahead.h"
68345+
68346+/**
68347+ tree_access_pointer aka tap. Data structure combining coord_t and lock
68348+ handle.
68349+ For invariants involving this data type, see doc/lock-ordering:
68350+
68351+ [tap-sane]
68352+ */
68353+struct tree_access_pointer {
68354+ /* coord tap is at */
68355+ coord_t *coord;
68356+ /* lock handle on ->coord->node */
68357+ lock_handle *lh;
68358+ /* mode of lock acquired by this tap */
68359+ znode_lock_mode mode;
68360+ /* incremented by reiser4_tap_load().
68361+ Decremented by reiser4_tap_relse(). */
68362+ int loaded;
68363+ /* list of taps */
68364+ struct list_head linkage;
68365+ /* read-ahead hint */
68366+ ra_info_t ra_info;
68367+};
68368+
68369+typedef int (*go_actor_t) (tap_t * tap);
68370+
68371+extern int reiser4_tap_load(tap_t * tap);
68372+extern void reiser4_tap_relse(tap_t * tap);
68373+extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68374+ znode_lock_mode mode);
68375+extern void reiser4_tap_monitor(tap_t * tap);
68376+extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
68377+extern void reiser4_tap_done(tap_t * tap);
68378+extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
68379+extern int tap_to_coord(tap_t * tap, coord_t * target);
68380+
68381+extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
68382+extern int go_next_unit(tap_t * tap);
68383+extern int go_prev_unit(tap_t * tap);
68384+extern int rewind_right(tap_t * tap, int shift);
68385+extern int rewind_left(tap_t * tap, int shift);
68386+
68387+extern struct list_head *reiser4_taps_list(void);
68388+
68389+#define for_all_taps(tap) \
68390+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
68391+ reiser4_taps_list() != &tap->linkage; \
68392+ tap = list_entry(tap->linkage.next, tap_t, linkage))
68393+
68394+/* __REISER4_TAP_H__ */
68395+#endif
68396+/* Make Linus happy.
68397+ Local variables:
68398+ c-indentation-style: "K&R"
68399+ mode-name: "LC"
68400+ c-basic-offset: 8
68401+ tab-width: 8
68402+ fill-column: 120
68403+ scroll-step: 1
68404+ End:
68405+*/
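A minimal usage sketch for the API above (illustrative only, not part of the patch: the helper name is invented and error handling is collapsed into a single return code):

/* Walk @n units to the right of a locked @coord. reiser4_tap_init()
 * records the coord/lock pair, rewind_right() loads the node, applies
 * go_next_unit() @n times (transparently crossing item and node
 * boundaries) and releases the node, and reiser4_tap_done()
 * unregisters the tap. */
static int walk_right_sketch(coord_t * coord, lock_handle * lh, int n)
{
	tap_t tap;
	int result;

	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
	result = rewind_right(&tap, n);
	reiser4_tap_done(&tap);
	return result;
}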
68406diff -urN linux-2.6.22.orig/fs/reiser4/tree.c linux-2.6.22/fs/reiser4/tree.c
68407--- linux-2.6.22.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
68408+++ linux-2.6.22/fs/reiser4/tree.c 2007-07-29 00:25:35.028735820 +0400
68409@@ -0,0 +1,1876 @@
68410+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68411+ * reiser4/README */
68412+
68413+/*
68414+ * KEYS IN A TREE.
68415+ *
68416+ * The tree consists of nodes located on the disk. Node in the tree is either
68417+ * formatted or unformatted. Formatted node is one that has structure
68418+ * understood by the tree balancing and traversal code. Formatted nodes are
68419+ * further classified into leaf and internal nodes. Latter distinctions is
68420+ * (almost) of only historical importance: general structure of leaves and
68421+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
68422+ * that are part of bodies of ordinary files and attributes.
68423+ *
68424+ * Each node in the tree spans some interval in the key space. Key ranges for
68425+ * all nodes in the tree are disjoint. Actually, this only holds in some weak
68426+ * sense, because of the non-unique keys: intersection of key ranges for
68427+ * different nodes is either empty, or consists of exactly one key.
68428+ *
68429+ * A formatted node consists of a sequence of items. Each item spans some
68430+ * interval in key space. Key ranges for all items in a tree are disjoint,
68431+ * modulo non-unique keys again. Items within a node are ordered by the
68432+ * smallest key in each item.
68433+ *
68434+ * A particular type of item can be further split into units. A unit is a piece
68435+ * of an item that can be cut from it and moved into another item of the same
68436+ * type. Units are used by the balancing code to repack data during balancing.
68437+ *
68438+ * Unit can be further split into smaller entities (for example, extent unit
68439+ * represents several pages, and it is natural for extent code to operate on
68440+ * particular pages and even bytes within one unit), but this is of no
68441+ * relevance to the generic balancing and lookup code.
68442+ *
68443+ * Although an item is said to "span" a range or interval of keys, it is not
68444+ * necessary that item contains piece of data addressable by each and every
68445+ * key in this range. For example, compound directory item, consisting of
68446+ * units corresponding to directory entries and keyed by hashes of file names,
68447+ * looks more as having "discrete spectrum": only some disjoint keys inside
68448+ * range occupied by this item really address data.
68449+ *
68450+ * Nonetheless, each item always has a well-defined least (minimal) key, which
68451+ * is recorded in the item header, stored in the node this item is in. Also, an
68452+ * item plugin can optionally define a method ->max_key_inside() returning the
68453+ * maximal key that can _possibly_ be located within this item. This method is
68454+ * used (mainly) to determine when a given piece of data should be merged into
68455+ * an existing item, instead of creating a new one. Because of this, even though
68456+ * ->max_key_inside() can be larger than any key actually located in the item,
68457+ * intervals
68458+ *
68459+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
68460+ *
68461+ * are still disjoint for all items within the _same_ node.
68462+ *
68463+ * In memory node is represented by znode. It plays several roles:
68464+ *
68465+ * . something locks are taken on
68466+ *
68467+ * . something tracked by transaction manager (this is going to change)
68468+ *
68469+ * . something used to access node data
68470+ *
68471+ * . something used to maintain tree structure in memory: sibling and
68472+ * parental linkage.
68473+ *
68474+ * . something used to organize nodes into "slums"
68475+ *
68476+ * More on znodes see in znode.[ch]
68477+ *
68478+ * DELIMITING KEYS
68479+ *
68480+ * To simplify balancing, allow some flexibility in locking, and speed up an
68481+ * important coord cache optimization, we keep delimiting keys of nodes in
68482+ * memory. Depending on disk format (implemented by appropriate node plugin)
68483+ * node on disk can record both left and right delimiting key, only one of
68484+ * them, or none. Still, our balancing and tree traversal code keeps both
68485+ * delimiting keys for every node that is in memory, stored in its znode. When
68486+ * node is first brought into memory during tree traversal, its left
68487+ * delimiting key is taken from its parent, and its right delimiting key is
68488+ * either next key in its parent, or is right delimiting key of parent if
68489+ * node is the rightmost child of parent.
68490+ *
68491+ * Physical consistency of delimiting key is protected by special dk
68492+ * read-write lock. That is, delimiting keys can only be inspected or
68493+ * modified under this lock. But dk lock is only sufficient for fast
68494+ * "pessimistic" check, because to simplify code and to decrease lock
68495+ * contention, balancing (carry) only updates delimiting keys right before
68496+ * unlocking all locked nodes on the given tree level. For example,
68497+ * coord-by-key cache scans LRU list of recently accessed znodes. For each
68498+ * node it first does fast check under dk spin lock. If key looked for is
68499+ * not between delimiting keys for this node, next node is inspected and so
68500+ * on. If key is inside of the key range, long term lock is taken on node
68501+ * and key range is rechecked.
68502+ *
68503+ * COORDINATES
68504+ *
68505+ * To find something in the tree, you supply a key, and the key is resolved
68506+ * by coord_by_key() into a coord (coordinate) that is valid as long as the
68507+ * node the coord points to remains locked. As mentioned above trees
68508+ * consist of nodes that consist of items that consist of units. A unit is
68509+ * the smallest and indivisible piece of tree as far as balancing and tree
68510+ * search are concerned. Each node, item, and unit can be addressed by
68511+ * giving its level in the tree and the key occupied by this entity. A node
68512+ * knows what the key ranges are of the items within it, and how to find its
68513+ * items and invoke their item handlers, but it does not know how to access
68514+ * individual units within its items except through the item handlers.
68515+ * coord is a structure containing a pointer to the node, the ordinal number
68516+ * of the item within this node (a sort of item offset), and the ordinal
68517+ * number of the unit within this item.
68518+ *
68519+ * TREE LOOKUP
68520+ *
68521+ * There are two types of access to the tree: lookup and modification.
68522+ *
68523+ * Lookup is a search for the key in the tree. Search can look for either
68524+ * exactly the key given to it, or for the largest key that is not greater
68525+ * than the key given to it. This distinction is determined by "bias"
68526+ * parameter of search routine (coord_by_key()). coord_by_key() either
68527+ * returns error (key is not in the tree, or some kind of external error
68528+ * occurred), or successfully resolves key into coord.
68529+ *
68530+ * This resolution is done by traversing tree top-to-bottom from root level
68531+ * to the desired level. On levels above twig level (level one above the
68532+ * leaf level) nodes consist exclusively of internal items. Internal item is
68533+ * nothing more than pointer to the tree node on the child level. On twig
68534+ * level nodes consist of internal items intermixed with extent
68535+ * items. Internal items form the normal search tree structure used by traversal
68536+ * to descend through the tree.
68537+ *
68538+ * TREE LOOKUP OPTIMIZATIONS
68539+ *
68540+ * Tree lookup described above is expensive even if all nodes traversed are
68541+ * already in memory: a binary search has to be performed within each node,
68542+ * and binary searches are CPU-consuming and tend to destroy CPU
68543+ * caches.
68544+ *
68545+ * Several optimizations are used to work around this:
68546+ *
68547+ * . cbk_cache (look-aside cache for tree traversals, see search.c for
68548+ * details)
68549+ *
68550+ * . seals (see seal.[ch])
68551+ *
68552+ * . vroot (see search.c)
68553+ *
68554+ * General search-by-key is layered thusly:
68555+ *
68556+ * [check seal, if any] --ok--> done
68557+ * |
68558+ * failed
68559+ * |
68560+ * V
68561+ * [vroot defined] --no--> node = tree_root
68562+ * | |
68563+ * yes |
68564+ * | |
68565+ * V |
68566+ * node = vroot |
68567+ * | |
68568+ * | |
68569+ * | |
68570+ * V V
68571+ * [check cbk_cache for key] --ok--> done
68572+ * |
68573+ * failed
68574+ * |
68575+ * V
68576+ * [start tree traversal from node]
68577+ *
68578+ */
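As a concrete illustration of the lookup interface described above — a sketch only, with an invented function name, assuming the caller has already set up a reiser4 context; the argument order follows the coord_by_key() calls made later in this file:

/* Resolve @key to a read-locked coord on the leaf level. On
 * CBK_COORD_FOUND the coord is valid while @lh keeps the node locked;
 * done_lh() drops the lock and invalidates the coord. */
static int lookup_sketch(reiser4_tree * tree, const reiser4_key * key,
			 coord_t * coord)
{
	lock_handle lh;
	int result;

	init_lh(&lh);
	result = coord_by_key(tree, key, coord, &lh, ZNODE_READ_LOCK,
			      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
			      0 /* flags */ , NULL /* ra_info */ );
	if (result == CBK_COORD_FOUND) {
		/* ... inspect the item at @coord under the lock ... */
	}
	done_lh(&lh);
	return result;
}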
68579+
68580+#include "forward.h"
68581+#include "debug.h"
68582+#include "dformat.h"
68583+#include "key.h"
68584+#include "coord.h"
68585+#include "plugin/item/static_stat.h"
68586+#include "plugin/item/item.h"
68587+#include "plugin/node/node.h"
68588+#include "plugin/plugin.h"
68589+#include "txnmgr.h"
68590+#include "jnode.h"
68591+#include "znode.h"
68592+#include "block_alloc.h"
68593+#include "tree_walk.h"
68594+#include "carry.h"
68595+#include "carry_ops.h"
68596+#include "tap.h"
68597+#include "tree.h"
68598+#include "vfs_ops.h"
68599+#include "page_cache.h"
68600+#include "super.h"
68601+#include "reiser4.h"
68602+#include "inode.h"
68603+
68604+#include <linux/fs.h> /* for struct super_block */
68605+#include <linux/spinlock.h>
68606+
68607+/* Disk address (block number) that is never used for any real tree node. It is
68608+ used as the block number of the "uber" znode.
68609+
68610+ Invalid block addresses are 0 by tradition.
68611+
68612+*/
68613+const reiser4_block_nr UBER_TREE_ADDR = 0ull;
68614+
68615+#define CUT_TREE_MIN_ITERATIONS 64
68616+
68617+static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
68618+
68619+/* return node plugin of coord->node */
68620+node_plugin *node_plugin_by_coord(const coord_t * coord)
68621+{
68622+ assert("vs-1", coord != NULL);
68623+ assert("vs-2", coord->node != NULL);
68624+
68625+ return coord->node->nplug;
68626+}
68627+
68628+/* insert item into tree. Fields of @coord are updated so that they can be
68629+ * used by consequent insert operation. */
68630+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
68631+ * into */ ,
68632+ const reiser4_key * key /* key of new item */ ,
68633+ reiser4_item_data * data /* parameters for item
68634+ * creation */ ,
68635+ coord_t * coord /* resulting insertion coord */ ,
68636+ lock_handle * lh /* resulting lock
68637+ * handle */ ,
68638+ tree_level stop_level /* level where to insert */ ,
68639+ __u32 flags /* insertion flags */ )
68640+{
68641+ int result;
68642+
68643+ assert("nikita-358", tree != NULL);
68644+ assert("nikita-360", coord != NULL);
68645+
68646+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
68647+ FIND_EXACT, stop_level, stop_level,
68648+ flags | CBK_FOR_INSERT, NULL /*ra_info */ );
68649+ switch (result) {
68650+ default:
68651+ break;
68652+ case CBK_COORD_FOUND:
68653+ result = IBK_ALREADY_EXISTS;
68654+ break;
68655+ case CBK_COORD_NOTFOUND:
68656+ assert("nikita-2017", coord->node != NULL);
68657+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
68658+ break;
68659+ }
68660+ return result;
68661+}
68662+
68663+/* insert item by calling carry. Helper function called if short-cut
68664+ insertion failed */
68665+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
68666+ lock_handle * lh, /* lock handle of insertion
68667+ * node */
68668+ reiser4_item_data * data, /* parameters of new
68669+ * item */
68670+ const reiser4_key * key, /* key of new item */
68671+ carry_opcode cop, /* carry operation to perform */
68672+ cop_insert_flag flags
68673+ /* carry flags */ )
68674+{
68675+ int result;
68676+ carry_pool *pool;
68677+ carry_level *lowest_level;
68678+ carry_insert_data *cdata;
68679+ carry_op *op;
68680+
68681+ assert("umka-314", coord != NULL);
68682+
68683+ /* allocate carry_pool and 3 carry_level-s */
68684+ pool =
68685+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68686+ sizeof(*cdata));
68687+ if (IS_ERR(pool))
68688+ return PTR_ERR(pool);
68689+ lowest_level = (carry_level *) (pool + 1);
68690+ init_carry_level(lowest_level, pool);
68691+
68692+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
68693+ if (IS_ERR(op) || (op == NULL)) {
68694+ done_carry_pool(pool);
68695+ return RETERR(op ? PTR_ERR(op) : -EIO);
68696+ }
68697+ cdata = (carry_insert_data *) (lowest_level + 3);
68698+ cdata->coord = coord;
68699+ cdata->data = data;
68700+ cdata->key = key;
68701+ op->u.insert.d = cdata;
68702+ if (flags == 0)
68703+ flags = znode_get_tree(coord->node)->carry.insert_flags;
68704+ op->u.insert.flags = flags;
68705+ op->u.insert.type = COPT_ITEM_DATA;
68706+ op->u.insert.child = NULL;
68707+ if (lh != NULL) {
68708+ assert("nikita-3245", lh->node == coord->node);
68709+ lowest_level->track_type = CARRY_TRACK_CHANGE;
68710+ lowest_level->tracked = lh;
68711+ }
68712+
68713+ result = reiser4_carry(lowest_level, NULL);
68714+ done_carry_pool(pool);
68715+
68716+ return result;
68717+}
68718+
68719+/* form carry queue to perform paste of @data with @key at @coord, and launch
68720+ its execution by calling carry().
68721+
68722+ Instruct carry to update @lh if, after balancing, the insertion coord moves
68723+ into a different block.
68724+
68725+*/
68726+static int paste_with_carry(coord_t * coord, /* coord of paste */
68727+ lock_handle * lh, /* lock handle of node
68728+ * where item is
68729+ * pasted */
68730+ reiser4_item_data * data, /* parameters of new
68731+ * item */
68732+ const reiser4_key * key, /* key of new item */
68733+ unsigned flags /* paste flags */ )
68734+{
68735+ int result;
68736+ carry_pool *pool;
68737+ carry_level *lowest_level;
68738+ carry_insert_data *cdata;
68739+ carry_op *op;
68740+
68741+ assert("umka-315", coord != NULL);
68742+ assert("umka-316", key != NULL);
68743+
68744+ pool =
68745+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68746+ sizeof(*cdata));
68747+ if (IS_ERR(pool))
68748+ return PTR_ERR(pool);
68749+ lowest_level = (carry_level *) (pool + 1);
68750+ init_carry_level(lowest_level, pool);
68751+
68752+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
68753+ if (IS_ERR(op) || (op == NULL)) {
68754+ done_carry_pool(pool);
68755+ return RETERR(op ? PTR_ERR(op) : -EIO);
68756+ }
68757+ cdata = (carry_insert_data *) (lowest_level + 3);
68758+ cdata->coord = coord;
68759+ cdata->data = data;
68760+ cdata->key = key;
68761+ op->u.paste.d = cdata;
68762+ if (flags == 0)
68763+ flags = znode_get_tree(coord->node)->carry.paste_flags;
68764+ op->u.paste.flags = flags;
68765+ op->u.paste.type = COPT_ITEM_DATA;
68766+ if (lh != NULL) {
68767+ lowest_level->track_type = CARRY_TRACK_CHANGE;
68768+ lowest_level->tracked = lh;
68769+ }
68770+
68771+ result = reiser4_carry(lowest_level, NULL);
68772+ done_carry_pool(pool);
68773+
68774+ return result;
68775+}
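Both helpers above rely on init_carry_pool() returning one contiguous allocation; the layout assumed by their pointer arithmetic is, as a sketch:

/*
 *  +------------+-------------+-------------+-------------+-------+
 *  | carry_pool | carry_level | carry_level | carry_level | cdata |
 *  +------------+-------------+-------------+-------------+-------+
 *   ^pool        ^(carry_level *)(pool + 1)
 *                                            ^(carry_insert_data *)(lowest_level + 3)
 *
 * so "(pool + 1)" is the first of the three carry_level-s, and
 * "(lowest_level + 3)" is the operation-specific data appended after
 * them.
 */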
68776+
68777+/* insert item at the given coord.
68778+
68779+ First try to skip carry by directly calling ->create_item() method of node
68780+ plugin. If this is impossible (there is not enough free space in the node,
68781+ or the leftmost item in the node is being created), call insert_with_carry_by_coord()
68782+ that will do full carry().
68783+
68784+*/
68785+insert_result insert_by_coord(coord_t * coord /* coord where to
68786+ * insert. coord->node has
68787+ * to be write locked by
68788+ * caller */ ,
68789+ reiser4_item_data * data /* data to be
68790+ * inserted */ ,
68791+ const reiser4_key * key /* key of new item */ ,
68792+ lock_handle * lh /* lock handle of write
68793+ * lock on node */ ,
68794+ __u32 flags /* insertion flags */ )
68795+{
68796+ unsigned item_size;
68797+ int result;
68798+ znode *node;
68799+
68800+ assert("vs-247", coord != NULL);
68801+ assert("vs-248", data != NULL);
68802+ assert("vs-249", data->length >= 0);
68803+ assert("nikita-1191", znode_is_write_locked(coord->node));
68804+
68805+ node = coord->node;
68806+ coord_clear_iplug(coord);
68807+ result = zload(node);
68808+ if (result != 0)
68809+ return result;
68810+
68811+ item_size = space_needed(node, NULL, data, 1);
68812+ if (item_size > znode_free_space(node) &&
68813+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
68814+ && (flags & COPI_DONT_ALLOCATE)) {
68815+ /* we are forced to use free space of coord->node and new item
68816+ does not fit into it.
68817+
68818+ Currently we get here only when we allocate and copy units
68819+ of extent item from a node to its left neighbor during
68820+ "squalloc"-ing. If @node (this is left neighbor) does not
68821+ have enough free space - we do not want to attempt any
68822+ shifting and allocations because we are in squeezing and
68823+ everything to the left of @node is tightly packed.
68824+ */
68825+ result = -E_NODE_FULL;
68826+ } else if ((item_size <= znode_free_space(node)) &&
68827+ !coord_is_before_leftmost(coord) &&
68828+ (node_plugin_by_node(node)->fast_insert != NULL)
68829+ && node_plugin_by_node(node)->fast_insert(coord)) {
68830+ /* shortcut insertion without carry() overhead.
68831+
68832+ Only possible if:
68833+
68834+ - there is enough free space
68835+
68836+ - insertion is not into the leftmost position in a node
68837+ (otherwise it would require updating of delimiting key in a
68838+ parent)
68839+
68840+ - node plugin agrees with this
68841+
68842+ */
68843+ result =
68844+ node_plugin_by_node(node)->create_item(coord, key, data,
68845+ NULL);
68846+ znode_make_dirty(node);
68847+ } else {
68848+ /* otherwise do full-fledged carry(). */
68849+ result =
68850+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
68851+ flags);
68852+ }
68853+ zrelse(node);
68854+ return result;
68855+}
68856+
68857+/* @coord is set to leaf level and @data is to be inserted to twig level */
68858+insert_result
68859+insert_extent_by_coord(coord_t *
68860+ coord
68861+ /* coord where to insert. coord->node has to be write locked by caller */
68862+ ,
68863+ reiser4_item_data * data /* data to be inserted */ ,
68864+ const reiser4_key * key /* key of new item */ ,
68865+ lock_handle *
68866+ lh /* lock handle of write lock on node */ )
68867+{
68868+ assert("vs-405", coord != NULL);
68869+ assert("vs-406", data != NULL);
68870+ assert("vs-407", data->length > 0);
68871+ assert("vs-408", znode_is_write_locked(coord->node));
68872+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
68873+
68874+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
68875+ 0 /*flags */ );
68876+}
68877+
68878+/* Insert into the item at the given coord.
68879+
68880+ First try to skip carry by directly calling ->paste() method of item
68881+ plugin. If this is impossible (there is not enough free space in the node,
68882+ or we are pasting into leftmost position in the node), call
68883+ paste_with_carry() that will do full carry().
68884+
68885+*/
68886+/* paste_into_item */
68887+int insert_into_item(coord_t * coord /* coord of pasting */ ,
68888+ lock_handle * lh /* lock handle on node involved */ ,
68889+ const reiser4_key * key /* key of unit being pasted */ ,
68890+ reiser4_item_data * data /* parameters for new unit */ ,
68891+ unsigned flags /* insert/paste flags */ )
68892+{
68893+ int result;
68894+ int size_change;
68895+ node_plugin *nplug;
68896+ item_plugin *iplug;
68897+
68898+ assert("umka-317", coord != NULL);
68899+ assert("umka-318", key != NULL);
68900+
68901+ iplug = item_plugin_by_coord(coord);
68902+ nplug = node_plugin_by_coord(coord);
68903+
68904+ assert("nikita-1480", iplug == data->iplug);
68905+
68906+ size_change = space_needed(coord->node, coord, data, 0);
68907+ if (size_change > (int)znode_free_space(coord->node) &&
68908+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
68909+ && (flags & COPI_DONT_ALLOCATE)) {
68910+ /* we are forced to use free space of coord->node and new data
68911+ does not fit into it. */
68912+ return -E_NODE_FULL;
68913+ }
68914+
68915+ /* shortcut paste without carry() overhead.
68916+
68917+ Only possible if:
68918+
68919+ - there is enough free space
68920+
68921+ - paste is not into the leftmost unit in a node (otherwise
68922+ it would require updating of delimiting key in a parent)
68923+
68924+ - node plugin agrees with this
68925+
68926+ - item plugin agrees with us
68927+ */
68928+ if (size_change <= (int)znode_free_space(coord->node) &&
68929+ (coord->item_pos != 0 ||
68930+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
68931+ coord->unit_pos != 0 && nplug->fast_paste != NULL &&
68932+ nplug->fast_paste(coord) &&
68933+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
68934+ if (size_change > 0)
68935+ nplug->change_item_size(coord, size_change);
68936+ /* NOTE-NIKITA: huh? where @key is used? */
68937+ result = iplug->b.paste(coord, data, NULL);
68938+ if (size_change < 0)
68939+ nplug->change_item_size(coord, size_change);
68940+ znode_make_dirty(coord->node);
68941+ } else
68942+ /* otherwise do full-fledged carry(). */
68943+ result = paste_with_carry(coord, lh, data, key, flags);
68944+ return result;
68945+}
68946+
68947+/* this either appends or truncates item @coord */
68948+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
68949+ reiser4_item_data * data /* parameters of resize */ ,
68950+ reiser4_key * key /* key of new unit */ ,
68951+ lock_handle * lh /* lock handle of node
68952+ * being modified */ ,
68953+ cop_insert_flag flags /* carry flags */ )
68954+{
68955+ int result;
68956+ znode *node;
68957+
68958+ assert("nikita-362", coord != NULL);
68959+ assert("nikita-363", data != NULL);
68960+ assert("vs-245", data->length != 0);
68961+
68962+ node = coord->node;
68963+ coord_clear_iplug(coord);
68964+ result = zload(node);
68965+ if (result != 0)
68966+ return result;
68967+
68968+ if (data->length < 0)
68969+ result = node_plugin_by_coord(coord)->shrink_item(coord,
68970+ -data->length);
68971+ else
68972+ result = insert_into_item(coord, lh, key, data, flags);
68973+
68974+ zrelse(node);
68975+ return result;
68976+}
68977+
68978+/* insert flow @f */
68979+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
68980+{
68981+ int result;
68982+ carry_pool *pool;
68983+ carry_level *lowest_level;
68984+ reiser4_item_data *data;
68985+ carry_op *op;
68986+
68987+ pool =
68988+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68989+ sizeof(*data));
68990+ if (IS_ERR(pool))
68991+ return PTR_ERR(pool);
68992+ lowest_level = (carry_level *) (pool + 1);
68993+ init_carry_level(lowest_level, pool);
68994+
68995+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
68996+ 0 /* operate directly on coord -> node */ );
68997+ if (IS_ERR(op) || (op == NULL)) {
68998+ done_carry_pool(pool);
68999+ return RETERR(op ? PTR_ERR(op) : -EIO);
69000+ }
69001+
69002+ /* these are permanent during insert_flow */
69003+ data = (reiser4_item_data *) (lowest_level + 3);
69004+ data->user = 1;
69005+ data->iplug = item_plugin_by_id(FORMATTING_ID);
69006+ data->arg = NULL;
69007+ /* data.length and data.data will be set before calling paste or
69008+ insert */
69009+ data->length = 0;
69010+ data->data = NULL;
69011+
69012+ op->u.insert_flow.flags = 0;
69013+ op->u.insert_flow.insert_point = coord;
69014+ op->u.insert_flow.flow = f;
69015+ op->u.insert_flow.data = data;
69016+ op->u.insert_flow.new_nodes = 0;
69017+
69018+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69019+ lowest_level->tracked = lh;
69020+
69021+ result = reiser4_carry(lowest_level, NULL);
69022+ done_carry_pool(pool);
69023+
69024+ return result;
69025+}
69026+
69027+/* Given a coord in parent node, obtain a znode for the corresponding child */
69028+znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69029+ * child */ ,
69030+ znode * parent /* parent of child */ ,
69031+ int incore_p /* if !0 only return child if already in
69032+ * memory */ ,
69033+ int setup_dkeys_p /* if !0 update delimiting keys of
69034+ * child */ )
69035+{
69036+ znode *child;
69037+
69038+ assert("nikita-1374", parent_coord != NULL);
69039+ assert("nikita-1482", parent != NULL);
69040+#if REISER4_DEBUG
69041+ if (setup_dkeys_p)
69042+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69043+#endif
69044+ assert("nikita-2947", znode_is_any_locked(parent));
69045+
69046+ if (znode_get_level(parent) <= LEAF_LEVEL) {
69047+ /* trying to get child of leaf node */
69048+ warning("nikita-1217", "Child of maize?");
69049+ return ERR_PTR(RETERR(-EIO));
69050+ }
69051+ if (item_is_internal(parent_coord)) {
69052+ reiser4_block_nr addr;
69053+ item_plugin *iplug;
69054+ reiser4_tree *tree;
69055+
69056+ iplug = item_plugin_by_coord(parent_coord);
69057+ assert("vs-512", iplug->s.internal.down_link);
69058+ iplug->s.internal.down_link(parent_coord, NULL, &addr);
69059+
69060+ tree = znode_get_tree(parent);
69061+ if (incore_p)
69062+ child = zlook(tree, &addr);
69063+ else
69064+ child =
69065+ zget(tree, &addr, parent,
69066+ znode_get_level(parent) - 1,
69067+ reiser4_ctx_gfp_mask_get());
69068+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69069+ set_child_delimiting_keys(parent, parent_coord, child);
69070+ } else {
69071+ warning("nikita-1483", "Internal item expected");
69072+ child = ERR_PTR(RETERR(-EIO));
69073+ }
69074+ return child;
69075+}
69076+
69077+/* remove znode from transaction */
69078+static void uncapture_znode(znode * node)
69079+{
69080+ struct page *page;
69081+
69082+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69083+
69084+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
69085+ int ret;
69086+
69087+ /* An already allocated block goes right to the atom's delete set. */
69088+ ret =
69089+ reiser4_dealloc_block(znode_get_block(node), 0,
69090+ BA_DEFER | BA_FORMATTED);
69091+ if (ret)
69092+ warning("zam-942",
69093+ "can\'t add a block (%llu) number to atom's delete set\n",
69094+ (unsigned long long)(*znode_get_block(node)));
69095+
69096+ spin_lock_znode(node);
69097+ /* Here we return flush reserved block which was reserved at the
69098+ * moment when this allocated node was marked dirty and still
69099+ * not used by flush in node relocation procedure. */
69100+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69101+ txn_atom *atom;
69102+
69103+ atom = jnode_get_atom(ZJNODE(node));
69104+ assert("zam-939", atom != NULL);
69105+ spin_unlock_znode(node);
69106+ flush_reserved2grabbed(atom, (__u64) 1);
69107+ spin_unlock_atom(atom);
69108+ } else
69109+ spin_unlock_znode(node);
69110+ } else {
69111+ /* znode has an assigned block which is counted as "fake
69112+ allocated". Return it back to "free blocks". */
69113+ fake_allocated2free((__u64) 1, BA_FORMATTED);
69114+ }
69115+
69116+ /*
69117+ * uncapture page from transaction. There is a possibility of a race
69118+ * with ->releasepage(): reiser4_releasepage() detaches page from this
69119+ * jnode and we have nothing to uncapture. To avoid this, get
69120+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
69121+ * will deal with released page itself.
69122+ */
69123+ spin_lock_znode(node);
69124+ page = znode_page(node);
69125+ if (likely(page != NULL)) {
69126+ /*
69127+ * reiser4_uncapture_page() can only be called when we are sure
69128+ * that znode is pinned in memory, which we are, because
69129+ * forget_znode() is only called from longterm_unlock_znode().
69130+ */
69131+ page_cache_get(page);
69132+ spin_unlock_znode(node);
69133+ lock_page(page);
69134+ reiser4_uncapture_page(page);
69135+ unlock_page(page);
69136+ page_cache_release(page);
69137+ } else {
69138+ txn_atom *atom;
69139+
69140+ /* handle "flush queued" znodes */
69141+ while (1) {
69142+ atom = jnode_get_atom(ZJNODE(node));
69143+ assert("zam-943", atom != NULL);
69144+
69145+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69146+ || !atom->nr_running_queues)
69147+ break;
69148+
69149+ spin_unlock_znode(node);
69150+ reiser4_atom_wait_event(atom);
69151+ spin_lock_znode(node);
69152+ }
69153+
69154+ reiser4_uncapture_block(ZJNODE(node));
69155+ spin_unlock_atom(atom);
69156+ zput(node);
69157+ }
69158+}
69159+
69160+/* This is called from longterm_unlock_znode() when last lock is released from
69161+ the node that has been removed from the tree. At this point node is removed
69162+ from sibling list and its lock is invalidated. */
69163+void forget_znode(lock_handle * handle)
69164+{
69165+ znode *node;
69166+ reiser4_tree *tree;
69167+
69168+ assert("umka-319", handle != NULL);
69169+
69170+ node = handle->node;
69171+ tree = znode_get_tree(node);
69172+
69173+ assert("vs-164", znode_is_write_locked(node));
69174+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69175+ assert_rw_locked(&(node->lock.guard));
69176+
69177+ /* We assume that this node was detached from its parent before
69178+ * unlocking, so there is no way to reach this node from the parent through
69179+ * a down link. The node should have no children and, thereby, can't be
69180+ * reached from them by their parent pointers. The only way to obtain a
69181+ * reference to the node is to use sibling pointers from its left and
69182+ * right neighbors. In the next several lines we remove the node from
69183+ * the sibling list. */
69184+
69185+ write_lock_tree(tree);
69186+ sibling_list_remove(node);
69187+ znode_remove(node, tree);
69188+ write_unlock_tree(tree);
69189+
69190+ /* Here we set JNODE_DYING and cancel all pending lock requests. It
69191+ * forces all lock requestor threads to repeat iterations of getting
69192+ * lock on a child, neighbor or parent node. But, those threads can't
69193+ * come to this node again, because this node is no longer a child,
69194+ * neighbor or parent of any other node. This order of znode
69195+ * invalidation does not allow other threads to waste cpu time in a busy
69196+ * loop, trying to lock a dying object. The exception is in the flush
69197+ * code when we take a node directly from the atom's capture list. */
69198+ reiser4_invalidate_lock(handle);
69199+ uncapture_znode(node);
69200+}
69201+
69202+/* Check that internal item at @pointer really contains pointer to @child. */
69203+int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69204+ * @child */ ,
69205+ const znode * child /* child znode */ )
69206+{
69207+ assert("nikita-1016", pointer != NULL);
69208+ assert("nikita-1017", child != NULL);
69209+ assert("nikita-1018", pointer->node != NULL);
69210+
69211+ assert("nikita-1325", znode_is_any_locked(pointer->node));
69212+
69213+ assert("nikita-2985",
69214+ znode_get_level(pointer->node) == znode_get_level(child) + 1);
69215+
69216+ coord_clear_iplug((coord_t *) pointer);
69217+
69218+ if (coord_is_existing_unit(pointer)) {
69219+ item_plugin *iplug;
69220+ reiser4_block_nr addr;
69221+
69222+ if (item_is_internal(pointer)) {
69223+ iplug = item_plugin_by_coord(pointer);
69224+ assert("vs-513", iplug->s.internal.down_link);
69225+ iplug->s.internal.down_link(pointer, NULL, &addr);
69226+ /* check that cached value is correct */
69227+ if (disk_addr_eq(&addr, znode_get_block(child))) {
69228+ return NS_FOUND;
69229+ }
69230+ }
69231+ }
69232+ /* warning ("jmacd-1002", "tree pointer incorrect"); */
69233+ return NS_NOT_FOUND;
69234+}
69235+
69236+/* find coord of pointer to new @child in @parent.
69237+
69238+ Find the &coord_t in the @parent where pointer to a given @child will
69239+ be in.
69240+
69241+*/
69242+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
69243+ znode *
69244+ child UNUSED_ARG /* child znode, passed locked */ ,
69245+ znode * left /* left brother of new node */ ,
69246+ coord_t * result /* where result is stored in */ )
69247+{
69248+ int ret;
69249+
69250+ assert("nikita-1486", parent != NULL);
69251+ assert("nikita-1487", child != NULL);
69252+ assert("nikita-1488", result != NULL);
69253+
69254+ ret = find_child_ptr(parent, left, result);
69255+ if (ret != NS_FOUND) {
69256+ warning("nikita-1489", "Cannot find brother position: %i", ret);
69257+ return RETERR(-EIO);
69258+ } else {
69259+ result->between = AFTER_UNIT;
69260+ return RETERR(NS_NOT_FOUND);
69261+ }
69262+}
69263+
69264+/* find coord of pointer to @child in @parent.
69265+
69266+ Find the &coord_t in the @parent where pointer to a given @child is in.
69267+
69268+*/
69269+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
69270+ znode * child /* child znode, passed locked */ ,
69271+ coord_t * result /* where result is stored in */ )
69272+{
69273+ int lookup_res;
69274+ node_plugin *nplug;
69275+ /* left delimiting key of a child */
69276+ reiser4_key ld;
69277+ reiser4_tree *tree;
69278+
69279+ assert("nikita-934", parent != NULL);
69280+ assert("nikita-935", child != NULL);
69281+ assert("nikita-936", result != NULL);
69282+ assert("zam-356", znode_is_loaded(parent));
69283+
69284+ coord_init_zero(result);
69285+ result->node = parent;
69286+
69287+ nplug = parent->nplug;
69288+ assert("nikita-939", nplug != NULL);
69289+
69290+ tree = znode_get_tree(parent);
69291+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
69292+ * not aliased to ->in_parent of some znode. Otherwise,
69293+ * parent_coord_to_coord() below would modify data protected by tree
69294+ * lock. */
69295+ read_lock_tree(tree);
69296+ /* fast path. Try to use cached value. Lock tree to keep
69297+ node->pos_in_parent and pos->*_blocknr consistent. */
69298+ if (child->in_parent.item_pos + 1 != 0) {
69299+ parent_coord_to_coord(&child->in_parent, result);
69300+ if (check_tree_pointer(result, child) == NS_FOUND) {
69301+ read_unlock_tree(tree);
69302+ return NS_FOUND;
69303+ }
69304+
69305+ child->in_parent.item_pos = (unsigned short)~0;
69306+ }
69307+ read_unlock_tree(tree);
69308+
69309+ /* is above failed, find some key from @child. We are looking for the
69310+ least key in a child. */
69311+ read_lock_dk(tree);
69312+ ld = *znode_get_ld_key(child);
69313+ read_unlock_dk(tree);
69314+ /*
69315+ * now, lookup the parent with the key just found. Note that the left
69316+ * delimiting key doesn't identify a node uniquely, because (in an extremely
69317+ * rare case) two nodes can have equal left delimiting keys, if one of them
69318+ * is completely filled with directory entries whose hashes all happen
69319+ * to collide. But, we check the block number in check_tree_pointer()
69320+ * and, so, are safe.
69321+ */
69322+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
69323+ /* update cached pos_in_node */
69324+ if (lookup_res == NS_FOUND) {
69325+ write_lock_tree(tree);
69326+ coord_to_parent_coord(result, &child->in_parent);
69327+ write_unlock_tree(tree);
69328+ lookup_res = check_tree_pointer(result, child);
69329+ }
69330+ if (lookup_res == NS_NOT_FOUND)
69331+ lookup_res = find_child_by_addr(parent, child, result);
69332+ return lookup_res;
69333+}
69334+
69335+/* find coord of pointer to @child in @parent by scanning
69336+
69337+ Find the &coord_t in the @parent where pointer to a given @child
69338+ is in by scanning all internal items in @parent and comparing block
69339+ numbers in them with that of @child.
69340+
69341+*/
69342+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
69343+ znode * child /* child znode, passed locked */ ,
69344+ coord_t * result /* where result is stored in */ )
69345+{
69346+ int ret;
69347+
69348+ assert("nikita-1320", parent != NULL);
69349+ assert("nikita-1321", child != NULL);
69350+ assert("nikita-1322", result != NULL);
69351+
69352+ ret = NS_NOT_FOUND;
69353+
69354+ for_all_units(result, parent) {
69355+ if (check_tree_pointer(result, child) == NS_FOUND) {
69356+ write_lock_tree(znode_get_tree(parent));
69357+ coord_to_parent_coord(result, &child->in_parent);
69358+ write_unlock_tree(znode_get_tree(parent));
69359+ ret = NS_FOUND;
69360+ break;
69361+ }
69362+ }
69363+ return ret;
69364+}
69365+
69366+/* true, if @addr is "unallocated block number", which is just address, with
69367+ highest bit set. */
69368+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
69369+ * check */ )
69370+{
69371+ assert("nikita-1766", addr != NULL);
69372+ cassert(sizeof(reiser4_block_nr) == 8);
69373+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
69374+ REISER4_UNALLOCATED_STATUS_VALUE;
69375+}
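For instance — illustrative values only, relying on the comment above that the status is kept in the highest bit of the 64-bit block number:

/*
 *   0x0000000000001234ull - real (allocated) disk address
 *   0x8000000000001234ull - "unallocated" placeholder address
 */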
69376+
69377+/* returns true if removing bytes of given range of key [from_key, to_key]
69378+ causes removing of whole item @from */
69379+static int
69380+item_removed_completely(coord_t * from, const reiser4_key * from_key,
69381+ const reiser4_key * to_key)
69382+{
69383+ item_plugin *iplug;
69384+ reiser4_key key_in_item;
69385+
69386+ assert("umka-325", from != NULL);
69387+ assert("", item_is_extent(from));
69388+
69389+ /* check first key just for case */
69390+ item_key_by_coord(from, &key_in_item);
69391+ if (keygt(from_key, &key_in_item))
69392+ return 0;
69393+
69394+ /* check last key */
69395+ iplug = item_plugin_by_coord(from);
69396+ assert("vs-611", iplug && iplug->s.file.append_key);
69397+
69398+ iplug->s.file.append_key(from, &key_in_item);
69399+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
69400+
69401+ if (keylt(to_key, &key_in_item))
69402+ /* last byte is not removed */
69403+ return 0;
69404+ return 1;
69405+}
69406+
69407+/* helper function for prepare_twig_kill(): @left and @right are formatted
69408+ * neighbors of extent item being completely removed. Load and lock neighbors
69409+ * and store lock handles into @cdata for later use by kill_hook_extent() */
69410+static int
69411+prepare_children(znode * left, znode * right, carry_kill_data * kdata)
69412+{
69413+ int result;
69414+ int left_loaded;
69415+ int right_loaded;
69416+
69417+ result = 0;
69418+ left_loaded = right_loaded = 0;
69419+
69420+ if (left != NULL) {
69421+ result = zload(left);
69422+ if (result == 0) {
69423+ left_loaded = 1;
69424+ result = longterm_lock_znode(kdata->left, left,
69425+ ZNODE_READ_LOCK,
69426+ ZNODE_LOCK_LOPRI);
69427+ }
69428+ }
69429+ if (result == 0 && right != NULL) {
69430+ result = zload(right);
69431+ if (result == 0) {
69432+ right_loaded = 1;
69433+ result = longterm_lock_znode(kdata->right, right,
69434+ ZNODE_READ_LOCK,
69435+ ZNODE_LOCK_HIPRI |
69436+ ZNODE_LOCK_NONBLOCK);
69437+ }
69438+ }
69439+ if (result != 0) {
69440+ done_lh(kdata->left);
69441+ done_lh(kdata->right);
69442+ if (left_loaded != 0)
69443+ zrelse(left);
69444+ if (right_loaded != 0)
69445+ zrelse(right);
69446+ }
69447+ return result;
69448+}
69449+
69450+static void done_children(carry_kill_data * kdata)
69451+{
69452+ if (kdata->left != NULL && kdata->left->node != NULL) {
69453+ zrelse(kdata->left->node);
69454+ done_lh(kdata->left);
69455+ }
69456+ if (kdata->right != NULL && kdata->right->node != NULL) {
69457+ zrelse(kdata->right->node);
69458+ done_lh(kdata->right);
69459+ }
69460+}
69461+
69462+/* part of cut_node. It is called when cut_node is called to remove or cut part
69463+ of an extent item. When the head of that item is removed - we have to update
69464+ the right delimiting key of the left neighbor of the extent. When the item is
69465+ removed completely - we have to set a sibling link between the left and right
69466+ neighbors of the removed extent. This may return -E_DEADLOCK because of trying
69467+ to get the left neighbor locked. So, the caller should repeat the attempt.
69468+*/
69469+/* Audited by: umka (2002.06.16) */
69470+static int
69471+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
69472+{
69473+ int result;
69474+ reiser4_key key;
69475+ lock_handle left_lh;
69476+ lock_handle right_lh;
69477+ coord_t left_coord;
69478+ coord_t *from;
69479+ znode *left_child;
69480+ znode *right_child;
69481+ reiser4_tree *tree;
69482+ int left_zloaded_here, right_zloaded_here;
69483+
69484+ from = kdata->params.from;
69485+ assert("umka-326", from != NULL);
69486+ assert("umka-327", kdata->params.to != NULL);
69487+
69488+ /* for one extent item only yet */
69489+ assert("vs-591", item_is_extent(from));
69490+ assert("vs-592", from->item_pos == kdata->params.to->item_pos);
69491+
69492+ if ((kdata->params.from_key
69493+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
69494+ || from->unit_pos != 0) {
69495+ /* head of item @from is not removed, there is nothing to
69496+ worry about */
69497+ return 0;
69498+ }
69499+
69500+ result = 0;
69501+ left_zloaded_here = 0;
69502+ right_zloaded_here = 0;
69503+
69504+ left_child = right_child = NULL;
69505+
69506+ coord_dup(&left_coord, from);
69507+ init_lh(&left_lh);
69508+ init_lh(&right_lh);
69509+ if (coord_prev_unit(&left_coord)) {
69510+ /* @from is leftmost item in its node */
69511+ if (!locked_left_neighbor) {
69512+ result =
69513+ reiser4_get_left_neighbor(&left_lh, from->node,
69514+ ZNODE_READ_LOCK,
69515+ GN_CAN_USE_UPPER_LEVELS);
69516+ switch (result) {
69517+ case 0:
69518+ break;
69519+ case -E_NO_NEIGHBOR:
69520+ /* there is no formatted node to the left of
69521+ from->node */
69522+ warning("vs-605",
69523+ "extent item has smallest key in "
69524+ "the tree and it is about to be removed");
69525+ return 0;
69526+ case -E_DEADLOCK:
69527+ /* need to restart */
69528+ default:
69529+ return result;
69530+ }
69531+
69532+ /* we have acquired left neighbor of from->node */
69533+ result = zload(left_lh.node);
69534+ if (result)
69535+ goto done;
69536+
69537+ locked_left_neighbor = left_lh.node;
69538+ } else {
69539+ /* squalloc_right_twig_cut should have supplied locked
69540+ * left neighbor */
69541+ assert("vs-834",
69542+ znode_is_write_locked(locked_left_neighbor));
69543+ result = zload(locked_left_neighbor);
69544+ if (result)
69545+ return result;
69546+ }
69547+
69548+ left_zloaded_here = 1;
69549+ coord_init_last_unit(&left_coord, locked_left_neighbor);
69550+ }
69551+
69552+ if (!item_is_internal(&left_coord)) {
69553+ /* what else but extent can be on twig level */
69554+ assert("vs-606", item_is_extent(&left_coord));
69555+
69556+ /* there is no left formatted child */
69557+ if (left_zloaded_here)
69558+ zrelse(locked_left_neighbor);
69559+ done_lh(&left_lh);
69560+ return 0;
69561+ }
69562+
69563+ tree = znode_get_tree(left_coord.node);
69564+ left_child = child_znode(&left_coord, left_coord.node, 1, 0);
69565+
69566+ if (IS_ERR(left_child)) {
69567+ result = PTR_ERR(left_child);
69568+ goto done;
69569+ }
69570+
69571+ /* left child is acquired, calculate new right delimiting key for it
69572+ and get right child if it is necessary */
69573+ if (item_removed_completely
69574+ (from, kdata->params.from_key, kdata->params.to_key)) {
69575+ /* try to get right child of removed item */
69576+ coord_t right_coord;
69577+
69578+ assert("vs-607",
69579+ kdata->params.to->unit_pos ==
69580+ coord_last_unit_pos(kdata->params.to));
69581+ coord_dup(&right_coord, kdata->params.to);
69582+ if (coord_next_unit(&right_coord)) {
69583+ /* @to is rightmost unit in the node */
69584+ result =
69585+ reiser4_get_right_neighbor(&right_lh, from->node,
69586+ ZNODE_READ_LOCK,
69587+ GN_CAN_USE_UPPER_LEVELS);
69588+ switch (result) {
69589+ case 0:
69590+ result = zload(right_lh.node);
69591+ if (result)
69592+ goto done;
69593+
69594+ right_zloaded_here = 1;
69595+ coord_init_first_unit(&right_coord,
69596+ right_lh.node);
69597+ item_key_by_coord(&right_coord, &key);
69598+ break;
69599+
69600+ case -E_NO_NEIGHBOR:
69601+ /* there is no formatted node to the right of
69602+ from->node */
69603+ read_lock_dk(tree);
69604+ key = *znode_get_rd_key(from->node);
69605+ read_unlock_dk(tree);
69606+ right_coord.node = NULL;
69607+ result = 0;
69608+ break;
69609+ default:
69610+ /* real error */
69611+ goto done;
69612+ }
69613+ } else {
69614+ /* there is an item to the right of @from - take its key */
69615+ item_key_by_coord(&right_coord, &key);
69616+ }
69617+
69618+ /* try to get right child of @from */
69619+ if (right_coord.node && /* there is right neighbor of @from */
69620+ item_is_internal(&right_coord)) { /* it is internal item */
69621+ right_child = child_znode(&right_coord,
69622+ right_coord.node, 1, 0);
69623+
69624+ if (IS_ERR(right_child)) {
69625+ result = PTR_ERR(right_child);
69626+ goto done;
69627+ }
69628+
69629+ }
69630+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
69631+ update of right delimiting key of left_child */
69632+ result = prepare_children(left_child, right_child, kdata);
69633+ } else {
69634+ /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
69635+ result = prepare_children(left_child, NULL, kdata);
69636+ }
69637+
69638+ done:
69639+ if (right_child)
69640+ zput(right_child);
69641+ if (right_zloaded_here)
69642+ zrelse(right_lh.node);
69643+ done_lh(&right_lh);
69644+
69645+ if (left_child)
69646+ zput(left_child);
69647+ if (left_zloaded_here)
69648+ zrelse(locked_left_neighbor);
69649+ done_lh(&left_lh);
69650+ return result;
69651+}
69652+
69653+/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
69654+ are to be cut completely */
69655+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
69656+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
69657+ const reiser4_key * to_key, /* last key to be removed */
69658+ reiser4_key *
69659+ smallest_removed /* smallest key actually removed */ )
69660+{
69661+ int result;
69662+ carry_pool *pool;
69663+ carry_level *lowest_level;
69664+ carry_cut_data *cut_data;
69665+ carry_op *op;
69666+
69667+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
69668+
69669+ pool =
69670+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69671+ sizeof(*cut_data));
69672+ if (IS_ERR(pool))
69673+ return PTR_ERR(pool);
69674+ lowest_level = (carry_level *) (pool + 1);
69675+ init_carry_level(lowest_level, pool);
69676+
69677+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
69678+ assert("vs-1509", op != 0);
69679+ if (IS_ERR(op)) {
69680+ done_carry_pool(pool);
69681+ return PTR_ERR(op);
69682+ }
69683+
69684+ cut_data = (carry_cut_data *) (lowest_level + 3);
69685+ cut_data->params.from = from;
69686+ cut_data->params.to = to;
69687+ cut_data->params.from_key = from_key;
69688+ cut_data->params.to_key = to_key;
69689+ cut_data->params.smallest_removed = smallest_removed;
69690+
69691+ op->u.cut_or_kill.is_cut = 1;
69692+ op->u.cut_or_kill.u.cut = cut_data;
69693+
69694+ result = reiser4_carry(lowest_level, NULL);
69695+ done_carry_pool(pool);
69696+
69697+ return result;
69698+}
69699+
69700+/* cut part of the node
69701+
69702+ Cut part or whole content of node.
69703+
69704+ cut data between @from and @to of @from->node and call carry() to make
69705+ corresponding changes in the tree. @from->node may become empty. If so -
69706+ pointer to it will be removed. Neighboring nodes are not changed. Smallest
69707+ removed key is stored in @smallest_removed
69708+
69709+*/
69710+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
69711+ coord_t * to, /* coord of the last unit/item that will be eliminated */
69712+ const reiser4_key * from_key, /* first key to be removed */
69713+ const reiser4_key * to_key, /* last key to be removed */
69714+ reiser4_key * smallest_removed, /* smallest key actually removed */
69715+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
69716+ * locked (in squalloc_right_twig_cut, namely) */
69717+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
69718+ invalidate pages together with item pointing to them */
69719+ int truncate)
69720+{ /* this call is made for file truncate */
69721+ int result;
69722+ carry_pool *pool;
69723+ carry_level *lowest_level;
69724+ carry_kill_data *kdata;
69725+ lock_handle *left_child;
69726+ lock_handle *right_child;
69727+ carry_op *op;
69728+
69729+ assert("umka-328", from != NULL);
69730+ assert("vs-316", !node_is_empty(from->node));
69731+ assert("nikita-1812", coord_is_existing_unit(from)
69732+ && coord_is_existing_unit(to));
69733+
69734+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
69735+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69736+ sizeof(carry_kill_data) +
69737+ 2 * sizeof(lock_handle) +
69738+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
69739+ if (IS_ERR(pool))
69740+ return PTR_ERR(pool);
69741+
69742+ lowest_level = (carry_level *) (pool + 1);
69743+ init_carry_level(lowest_level, pool);
69744+
69745+ kdata = (carry_kill_data *) (lowest_level + 3);
69746+ left_child = (lock_handle *) (kdata + 1);
69747+ right_child = left_child + 1;
69748+
69749+ init_lh(left_child);
69750+ init_lh(right_child);
69751+
69752+ kdata->params.from = from;
69753+ kdata->params.to = to;
69754+ kdata->params.from_key = from_key;
69755+ kdata->params.to_key = to_key;
69756+ kdata->params.smallest_removed = smallest_removed;
69757+ kdata->params.truncate = truncate;
69758+ kdata->flags = 0;
69759+ kdata->inode = inode;
69760+ kdata->left = left_child;
69761+ kdata->right = right_child;
69762+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
69763+ kdata->buf = (char *)(right_child + 1);
69764+
69765+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
69766+ /* left child of extent item may have to get updated right
69767+ delimiting key and to get linked with right child of extent
69768+ @from if it will be removed completely */
69769+ result = prepare_twig_kill(kdata, locked_left_neighbor);
69770+ if (result) {
69771+ done_children(kdata);
69772+ done_carry_pool(pool);
69773+ return result;
69774+ }
69775+ }
69776+
69777+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
69778+ if (IS_ERR(op) || (op == NULL)) {
69779+ done_children(kdata);
69780+ done_carry_pool(pool);
69781+ return RETERR(op ? PTR_ERR(op) : -EIO);
69782+ }
69783+
69784+ op->u.cut_or_kill.is_cut = 0;
69785+ op->u.cut_or_kill.u.kill = kdata;
69786+
69787+ result = reiser4_carry(lowest_level, NULL);
69788+
69789+ done_children(kdata);
69790+ done_carry_pool(pool);
69791+ return result;
69792+}
69793+
69794+void
69795+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
69796+{
69797+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
69798+ pgoff_t start_pg, end_pg;
69799+
69800+ start_pg = start >> PAGE_CACHE_SHIFT;
69801+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
69802+
69803+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
69804+ /*
69805+ * kill up to the page boundary.
69806+ */
69807+ assert("vs-123456", start_pg == end_pg);
69808+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
69809+ truncate);
69810+ } else if (start_pg != end_pg) {
69811+ /*
69812+ * page boundary is within killed portion of node.
69813+ */
69814+ assert("vs-654321", end_pg - start_pg == 1);
69815+ reiser4_invalidate_pages(inode->i_mapping, end_pg,
69816+ end_pg - start_pg, 1);
69817+ }
69818+ }
69819+ inode_sub_bytes(inode, end - start);
69820+}
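Worked examples for the page arithmetic above, assuming 4KiB pages (PAGE_CACHE_SHIFT == 12); the offsets are illustrative only:

/*
 *   start = 8192, end = 12288: start_pg = 2, end_pg = 2; start is
 *       page-aligned, so exactly page 2 is invalidated;
 *
 *   start = 9000, end = 12288: start_pg = 2, end_pg = 2; the killed
 *       range stays inside one page, so nothing is invalidated;
 *
 *   start = 9000, end = 16000: start_pg = 2, end_pg = 3; a page
 *       boundary falls inside the killed range, so page 3 is
 *       invalidated.
 */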
69821+
69822+/**
69823+ * Delete whole @node from the reiser4 tree without loading it.
69824+ *
69825+ * @left: locked left neighbor,
69826+ * @node: node to be deleted,
69827+ * @smallest_removed: leftmost key of deleted node,
69828+ * @object: inode pointer, if we truncate a file body.
69829+ * @truncate: true if called for file truncate.
69830+ *
69831+ * @return: 0 if success, error code otherwise.
69832+ *
69833+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
69834+ * contains the right value of the smallest removed key from the previous
69835+ * cut_worker() iteration. This is needed for proper accounting of
69836+ * "i_blocks" and "i_bytes" fields of the @object.
69837+ */
69838+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
69839+ struct inode *object, int truncate)
69840+{
69841+ lock_handle parent_lock;
69842+ coord_t cut_from;
69843+ coord_t cut_to;
69844+ reiser4_tree *tree;
69845+ int ret;
69846+
69847+ assert("zam-937", node != NULL);
69848+ assert("zam-933", znode_is_write_locked(node));
69849+ assert("zam-999", smallest_removed != NULL);
69850+
69851+ init_lh(&parent_lock);
69852+
69853+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
69854+ if (ret)
69855+ return ret;
69856+
69857+ assert("zam-934", !znode_above_root(parent_lock.node));
69858+
69859+ ret = zload(parent_lock.node);
69860+ if (ret)
69861+ goto failed_nozrelse;
69862+
69863+ ret = find_child_ptr(parent_lock.node, node, &cut_from);
69864+ if (ret)
69865+ goto failed;
69866+
69867+ /* decrement child counter and set parent pointer to NULL before
69868+ deleting the pointer from the parent node because of checks in
69869+ internal_kill_item_hook (we can delete the last item from the parent
69870+ node, the parent node is going to be deleted and its c_count should
69871+ be zero). */
69872+
69873+ tree = znode_get_tree(node);
69874+ write_lock_tree(tree);
69875+ init_parent_coord(&node->in_parent, NULL);
69876+ --parent_lock.node->c_count;
69877+ write_unlock_tree(tree);
69878+
69879+ assert("zam-989", item_is_internal(&cut_from));
69880+
69881+ /* @node should be deleted after unlocking. */
69882+ ZF_SET(node, JNODE_HEARD_BANSHEE);
69883+
69884+ /* remove a pointer from the parent node to the node being deleted. */
69885+ coord_dup(&cut_to, &cut_from);
69886+ /* FIXME: shouldn't this be kill_node_content */
69887+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
69888+ if (ret)
69889+ /* FIXME(Zam): Should we re-connect the node to its parent if
69890+ * cut_node fails? */
69891+ goto failed;
69892+
69893+ {
69894+ reiser4_tree *tree = current_tree;
69895+ __u64 start_offset = 0, end_offset = 0;
69896+
69897+ read_lock_tree(tree);
69898+ write_lock_dk(tree);
69899+ if (object) {
69900+ /* We use @smallest_removed and the left delimiting of
69901+ * the current node for @object->i_blocks, i_bytes
69902+ * calculation. We assume that the items after the
69903+ * *@smallest_removed key have been deleted from the
69904+ * file body. */
69905+ start_offset = get_key_offset(znode_get_ld_key(node));
69906+ end_offset = get_key_offset(smallest_removed);
69907+ }
69908+
69909+ assert("zam-1021", znode_is_connected(node));
69910+ if (node->left)
69911+ znode_set_rd_key(node->left, znode_get_rd_key(node));
69912+
69913+ *smallest_removed = *znode_get_ld_key(node);
69914+
69915+ write_unlock_dk(tree);
69916+ read_unlock_tree(tree);
69917+
69918+ if (object) {
69919+ /* Actions to be performed on an item's removal from the tree normally live in a
69920+ special item method - kill_hook. Here, for optimization reasons, we avoid reading the node
69921+ containing the item we remove, so we cannot call the item's kill hook. Instead we call a function
69922+ that does exactly what the tail kill hook does, on the assumption that the node we avoid reading
69923+ contains only one item and that item is a tail one. */
69924+ fake_kill_hook_tail(object, start_offset, end_offset,
69925+ truncate);
69926+ }
69927+ }
69928+ failed:
69929+ zrelse(parent_lock.node);
69930+ failed_nozrelse:
69931+ done_lh(&parent_lock);
69932+
69933+ return ret;
69934+}
69935+
69936+static int can_delete(const reiser4_key *key, znode *node)
69937+{
69938+ int result;
69939+
69940+ read_lock_dk(current_tree);
69941+ result = keyle(key, znode_get_ld_key(node));
69942+ read_unlock_dk(current_tree);
69943+ return result;
69944+}
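+
+/* An illustrative note (editorial, not from the reiser4 sources):
+ can_delete() returns true when @key, the left edge of the range
+ being cut, sorts at or below the node's left delimiting key, i.e.
+ every key stored in @node lies inside the range and the whole node
+ can be deleted without loading it. If the node covers keys
+ [100, 200) and from_key is 50, keyle(50, 100) holds and the node is
+ deletable as a unit; with from_key == 150 it is not, and the
+ item-by-item path in cut_tree_worker_common() is taken instead. */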
69945+
69946+/**
69947+ * This subroutine is not optimal, but its implementation seems to
69948+ * be simpler.
69949+ *
69950+ * @tap: the point deletion process begins from,
69951+ * @from_key: the beginning of the deleted key range,
69952+ * @to_key: the end of the deleted key range,
69953+ * @smallest_removed: the smallest removed key,
69954+ * @truncate: true if called for file truncate.
69955+ * @progress: set to true if progress was made deleting file items;
69956+ * the @smallest_removed value is valid only in that case.
69957+ *
69958+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
69959+ * reiser4_cut_tree operation was interrupted to allow an atom commit.
69960+ */
69961+int
69962+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
69963+ const reiser4_key * to_key,
69964+ reiser4_key * smallest_removed, struct inode *object,
69965+ int truncate, int *progress)
69966+{
69967+ lock_handle next_node_lock;
69968+ coord_t left_coord;
69969+ int result;
69970+
69971+ assert("zam-931", tap->coord->node != NULL);
69972+ assert("zam-932", znode_is_write_locked(tap->coord->node));
69973+
69974+ *progress = 0;
69975+ init_lh(&next_node_lock);
69976+
69977+ while (1) {
69978+ znode *node; /* node from which items are cut */
69979+ node_plugin *nplug; /* node plugin for @node */
69980+
69981+ node = tap->coord->node;
69982+
69983+ /* Move next_node_lock to the next node on the left. */
69984+ result =
69985+ reiser4_get_left_neighbor(&next_node_lock, node,
69986+ ZNODE_WRITE_LOCK,
69987+ GN_CAN_USE_UPPER_LEVELS);
69988+ if (result != 0 && result != -E_NO_NEIGHBOR)
69989+ break;
69990+ /* Check whether we can delete the node as a whole. */
69991+ if (*progress && znode_get_level(node) == LEAF_LEVEL &&
69992+ can_delete(from_key, node)) {
69993+ result = reiser4_delete_node(node, smallest_removed,
69994+ object, truncate);
69995+ } else {
69996+ result = reiser4_tap_load(tap);
69997+ if (result)
69998+ return result;
69999+
70000+ /* Prepare the second (right) point for cut_node() */
70001+ if (*progress)
70002+ coord_init_last_unit(tap->coord, node);
70003+
70004+ else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70005+ NULL)
70006+ /* set rightmost unit for the items without lookup method */
70007+ tap->coord->unit_pos =
70008+ coord_last_unit_pos(tap->coord);
70009+
70010+ nplug = node->nplug;
70011+
70012+ assert("vs-686", nplug);
70013+ assert("vs-687", nplug->lookup);
70014+
70015+ /* left_coord is leftmost unit cut from @node */
70016+ result = nplug->lookup(node, from_key,
70017+ FIND_MAX_NOT_MORE_THAN,
70018+ &left_coord);
70019+
70020+ if (IS_CBKERR(result))
70021+ break;
70022+
70023+ /* adjust coordinates so that they are set to existing units */
70024+ if (coord_set_to_right(&left_coord)
70025+ || coord_set_to_left(tap->coord)) {
70026+ result = 0;
70027+ break;
70028+ }
70029+
70030+ if (coord_compare(&left_coord, tap->coord) ==
70031+ COORD_CMP_ON_RIGHT) {
70032+ /* keys from @from_key to @to_key are not in the tree */
70033+ result = 0;
70034+ break;
70035+ }
70036+
70037+ if (left_coord.item_pos != tap->coord->item_pos) {
70038+ /* do not allow cutting more than one item. This is done to solve the problem of truncating
70039+ partially converted files. If a file is partially converted, there may exist a twig node
70040+ containing both an internal item (or items) pointing to leaf nodes with formatting items
70041+ and an extent item. We do not want to kill internal items sitting in a twig node here,
70042+ because cut_tree_worker assumes they are killed from the leaf level */
70043+ coord_dup(&left_coord, tap->coord);
70044+ assert("vs-1652",
70045+ coord_is_existing_unit(&left_coord));
70046+ left_coord.unit_pos = 0;
70047+ }
70048+
70049+ /* cut data from one node */
70050+ // *smallest_removed = *reiser4_min_key();
70051+ result =
70052+ kill_node_content(&left_coord, tap->coord, from_key,
70053+ to_key, smallest_removed,
70054+ next_node_lock.node, object,
70055+ truncate);
70056+ reiser4_tap_relse(tap);
70057+ }
70058+ if (result)
70059+ break;
70060+
70061+ ++(*progress);
70062+
70063+ /* Check whether all items with keys >= from_key were removed
70064+ * from the tree. */
70065+ if (keyle(smallest_removed, from_key))
70066+ /* result = 0; */
70067+ break;
70068+
70069+ if (next_node_lock.node == NULL)
70070+ break;
70071+
70072+ result = reiser4_tap_move(tap, &next_node_lock);
70073+ done_lh(&next_node_lock);
70074+ if (result)
70075+ break;
70076+
70077+ /* Break long reiser4_cut_tree operation (deletion of a large
70078+ file) if atom requires commit. */
70079+ if (*progress > CUT_TREE_MIN_ITERATIONS
70080+ && current_atom_should_commit()) {
70081+ result = -E_REPEAT;
70082+ break;
70083+ }
70084+ }
70085+ done_lh(&next_node_lock);
70086+ // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
70087+ return result;
70088+}
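+
+/* An illustrative summary (editorial restatement of the code above):
+ the worker walks leftward, one node per iteration, from the position
+ found for @to_key toward @from_key. Each pass either drops a whole
+ leaf via reiser4_delete_node() or cuts at most one item with
+ kill_node_content(), and once *progress exceeds
+ CUT_TREE_MIN_ITERATIONS it volunteers -E_REPEAT whenever the current
+ atom wants to commit. */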
70089+
70090+/* there is a fundamental problem with optimizing deletes: VFS does it
70091+ one file at a time. Another problem is that if an item can be
70092+ anything, then deleting items must be done one at a time. It just
70093+ seems cleaner to write this to take a from key and a to key, and cut
70094+ everything between them. */
70095+
70096+/* use this function with care if deleting more than what is part of a single file. */
70097+/* do not use this when cutting a single item, it is suboptimal for that */
70098+
70099+/* You are encouraged to write plugin-specific versions of this. It
70100+ cannot be optimal for all plugins because it works one item at a time,
70101+ and some plugins could sometimes work a node at a time. Regular files,
70102+ however, cannot be optimized to work a node at a time, because
70103+ extents need to free the blocks they point to.
70104+
70105+ Optimizations compared to v3 code:
70106+
70107+ It does not balance (that task is left to memory pressure code).
70108+
70109+ Nodes are deleted only if empty.
70110+
70111+ Uses extents.
70112+
70113+ Performs read-ahead of formatted nodes whose contents are part of
70114+ the deletion.
70115+*/
70116+
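+/* An illustrative sketch of the calling pattern implied by the
+ -E_REPEAT contract (editorial; it mirrors reiser4_cut_tree() defined
+ further down and is not an additional API):
+
+ do {
+ result = reiser4_cut_tree_object(tree, from, to, NULL,
+ inode, truncate, &progress);
+ } while (result == -E_REPEAT);
+*/
+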
70117+/**
70118+ * Delete everything from the reiser4 tree between two keys: @from_key and
70119+ * @to_key.
70120+ *
70121+ * @from_key: the beginning of the deleted key range,
70122+ * @to_key: the end of the deleted key range,
70123+ * @smallest_removed: the smallest removed key,
70124+ * @object: owner of cutting items.
70125+ * @truncate: true if called for file truncate.
70126+ * @progress: set to true if progress was made deleting file items;
70127+ * the @smallest_removed value is valid only in that case.
70128+ *
70129+ * @return: 0 on success, error code otherwise; -E_REPEAT means that a long cut_tree
70130+ * operation was interrupted to allow an atom commit.
70131+ */
70132+
70133+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70134+ const reiser4_key * to_key,
70135+ reiser4_key * smallest_removed_p,
70136+ struct inode *object, int truncate, int *progress)
70137+{
70138+ lock_handle lock;
70139+ int result;
70140+ tap_t tap;
70141+ coord_t right_coord;
70142+ reiser4_key smallest_removed;
70143+ int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70144+ const reiser4_key *, reiser4_key *,
70145+ struct inode *, int, int *);
70146+ STORE_COUNTERS;
70147+
70148+ assert("umka-329", tree != NULL);
70149+ assert("umka-330", from_key != NULL);
70150+ assert("umka-331", to_key != NULL);
70151+ assert("zam-936", keyle(from_key, to_key));
70152+
70153+ if (smallest_removed_p == NULL)
70154+ smallest_removed_p = &smallest_removed;
70155+
70156+ init_lh(&lock);
70157+
70158+ do {
70159+ /* Find rightmost item to cut away from the tree. */
70160+ result = reiser4_object_lookup(object, to_key, &right_coord,
70161+ &lock, ZNODE_WRITE_LOCK,
70162+ FIND_MAX_NOT_MORE_THAN,
70163+ TWIG_LEVEL, LEAF_LEVEL,
70164+ CBK_UNIQUE, NULL /*ra_info */);
70165+ if (result != CBK_COORD_FOUND)
70166+ break;
70167+ if (object == NULL
70168+ || inode_file_plugin(object)->cut_tree_worker == NULL)
70169+ cut_tree_worker = cut_tree_worker_common;
70170+ else
70171+ cut_tree_worker =
70172+ inode_file_plugin(object)->cut_tree_worker;
70173+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70174+ result =
70175+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70176+ object, truncate, progress);
70177+ reiser4_tap_done(&tap);
70178+
70179+ reiser4_preempt_point();
70180+
70181+ } while (0);
70182+
70183+ done_lh(&lock);
70184+
70185+ if (result) {
70186+ switch (result) {
70187+ case -E_NO_NEIGHBOR:
70188+ result = 0;
70189+ break;
70190+ case -E_DEADLOCK:
70191+ result = -E_REPEAT;
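+ /* fall through: the deadlock has been converted to
+ * -E_REPEAT, a retryable condition */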
70192+ case -E_REPEAT:
70193+ case -ENOMEM:
70194+ case -ENOENT:
70195+ break;
70196+ default:
70197+ warning("nikita-2861", "failure: %i", result);
70198+ }
70199+ }
70200+
70201+ CHECK_COUNTERS;
70202+ return result;
70203+}
70204+
70205+/* repeat reiser4_cut_tree_object until everything is deleted.
70206+ * unlike cut_file_items, it does not end the current transaction if -E_REPEAT
70207+ * is returned by cut_tree_object. */
70208+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70209+ const reiser4_key * to, struct inode *inode, int truncate)
70210+{
70211+ int result;
70212+ int progress;
70213+
70214+ do {
70215+ result = reiser4_cut_tree_object(tree, from, to, NULL,
70216+ inode, truncate, &progress);
70217+ } while (result == -E_REPEAT);
70218+
70219+ return result;
70220+}
70221+
70222+/* finishing reiser4 initialization */
70223+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
70224+ * initialized */ ,
70225+ const reiser4_block_nr * root_block /* address of a root block
70226+ * on a disk */ ,
70227+ tree_level height /* height of a tree */ ,
70228+ node_plugin * nplug /* default node plugin */ )
70229+{
70230+ int result;
70231+
70232+ assert("nikita-306", tree != NULL);
70233+ assert("nikita-307", root_block != NULL);
70234+ assert("nikita-308", height > 0);
70235+ assert("nikita-309", nplug != NULL);
70236+ assert("zam-587", tree->super != NULL);
70237+
70238+ tree->root_block = *root_block;
70239+ tree->height = height;
70240+ tree->estimate_one_insert = calc_estimate_one_insert(height);
70241+ tree->nplug = nplug;
70242+
70243+ tree->znode_epoch = 1ull;
70244+
70245+ cbk_cache_init(&tree->cbk_cache);
70246+
70247+ result = znodes_tree_init(tree);
70248+ if (result == 0)
70249+ result = jnodes_tree_init(tree);
70250+ if (result == 0) {
70251+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
70252+ reiser4_ctx_gfp_mask_get());
70253+ if (IS_ERR(tree->uber)) {
70254+ result = PTR_ERR(tree->uber);
70255+ tree->uber = NULL;
70256+ }
70257+ }
70258+ return result;
70259+}
70260+
70261+/* release resources associated with @tree */
70262+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
70263+{
70264+ if (tree == NULL)
70265+ return;
70266+
70267+ if (tree->uber != NULL) {
70268+ zput(tree->uber);
70269+ tree->uber = NULL;
70270+ }
70271+ znodes_tree_done(tree);
70272+ jnodes_tree_done(tree);
70273+ cbk_cache_done(&tree->cbk_cache);
70274+}
70275+
70276+/* Make Linus happy.
70277+ Local variables:
70278+ c-indentation-style: "K&R"
70279+ mode-name: "LC"
70280+ c-basic-offset: 8
70281+ tab-width: 8
70282+ fill-column: 120
70283+ scroll-step: 1
70284+ End:
70285+*/
70286diff -urN linux-2.6.22.orig/fs/reiser4/tree.h linux-2.6.22/fs/reiser4/tree.h
70287--- linux-2.6.22.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
70288+++ linux-2.6.22/fs/reiser4/tree.h 2007-07-29 00:25:35.028735820 +0400
70289@@ -0,0 +1,577 @@
70290+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70291+ * reiser4/README */
70292+
70293+/* Tree operations. See fs/reiser4/tree.c for comments */
70294+
70295+#if !defined( __REISER4_TREE_H__ )
70296+#define __REISER4_TREE_H__
70297+
70298+#include "forward.h"
70299+#include "debug.h"
70300+#include "dformat.h"
70301+#include "plugin/node/node.h"
70302+#include "plugin/plugin.h"
70303+#include "znode.h"
70304+#include "tap.h"
70305+
70306+#include <linux/types.h> /* for __u?? */
70307+#include <linux/fs.h> /* for struct super_block */
70308+#include <linux/spinlock.h>
70309+#include <linux/sched.h> /* for struct task_struct */
70310+
70311+/* fictive block number never actually used */
70312+extern const reiser4_block_nr UBER_TREE_ADDR;
70313+
70314+/* &cbk_cache_slot - entry in a coord cache.
70315+
70316+ This is entry in a coord_by_key (cbk) cache, represented by
70317+ &cbk_cache.
70318+
70319+*/
70320+typedef struct cbk_cache_slot {
70321+ /* cached node */
70322+ znode *node;
70323+ /* linkage to the next cbk cache slot in a LRU order */
70324+ struct list_head lru;
70325+} cbk_cache_slot;
70326+
70327+/* &cbk_cache - coord cache. This is part of reiser4_tree.
70328+
70329+ cbk_cache is supposed to speed up tree lookups by caching results of recent
70330+ successful lookups (we don't cache negative results as the dentry cache
70331+ does). The cache consists of a relatively small number of entries kept in
70332+ LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
70333+ which we can obtain the range of keys covered by this znode. Before
70334+ embarking on a real tree traversal we scan the cbk_cache slot by slot, and for
70335+ each slot check whether the key we are looking for is between the minimal and
70336+ maximal keys for the node pointed to by this slot. If no match is found, a real
70337+ tree traversal is performed, and if it is successful, an appropriate entry
70338+ is inserted into the cache, possibly pulling the least recently used entry out of
70339+ it.
70340+
70341+ The tree spin lock is used to protect the coord cache. If contention for this
70342+ lock proves to be too high, finer-grained locking can be added.
70343+
70344+ Invariants involving parts of this data-type:
70345+
70346+ [cbk-cache-invariant]
70347+*/
70348+typedef struct cbk_cache {
70349+ /* serializator */
70350+ rwlock_t guard;
70351+ int nr_slots;
70352+ /* head of LRU list of cache slots */
70353+ struct list_head lru;
70354+ /* actual array of slots */
70355+ cbk_cache_slot *slot;
70356+} cbk_cache;
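+
+/* Illustrative pseudocode (editorial, not the actual implementation):
+ the scan described above amounts to walking the LRU list and
+ comparing the search key against each cached node's delimiting keys,
+ roughly:
+
+ for each slot on cache->lru:
+ if (keyle(znode_get_ld_key(slot->node), key) &&
+ keylt(key, znode_get_rd_key(slot->node)))
+ start the traversal at slot->node;
+*/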
70357+
70358+/* level_lookup_result - possible outcome of looking up key at some level.
70359+ This is used by coord_by_key when traversing tree downward. */
70360+typedef enum {
70361+ /* continue to the next level */
70362+ LOOKUP_CONT,
70363+ /* done. Either required item was found, or we can prove it
70364+ doesn't exist, or some error occurred. */
70365+ LOOKUP_DONE,
70366+ /* restart traversal from the root. Infamous "repetition". */
70367+ LOOKUP_REST
70368+} level_lookup_result;
70369+
70370+/* This is the representation of the internal reiser4 tree where all file-system
70371+ data and meta-data are stored. This structure is passed to all tree
70372+ manipulation functions. It's different from the super block because
70373+ we don't want to limit ourselves to a strictly one-to-one mapping
70374+ between super blocks and trees, and because they are logically
70375+ different: there are things in a super block that have no relation to
70376+ the tree (bitmaps, journalling area, mount options, etc.) and there
70377+ are things in a tree that bear no relation to the super block, like
70378+ the tree of znodes.
70379+
70380+ At this time, there is only one tree
70381+ per filesystem, and this struct is part of the super block. We only
70382+ call the super block the super block for historical reasons (most
70383+ other filesystems call the per filesystem metadata the super block).
70384+*/
70385+
70386+struct reiser4_tree {
70387+ /* The znode with block_nr == 0 is the fake znode. Write-lock it while
70388+ changing the tree height. */
70389+ /* disk address of root node of a tree */
70390+ reiser4_block_nr root_block;
70391+
70392+ /* level of the root node. If this is 1, tree consists of root
70393+ node only */
70394+ tree_level height;
70395+
70396+ /*
70397+ * this is cached here to avoid calling plugins through a function
70398+ * dereference all the time.
70399+ */
70400+ __u64 estimate_one_insert;
70401+
70402+ /* cache of recent tree lookup results */
70403+ cbk_cache cbk_cache;
70404+
70405+ /* hash table to look up znodes by block number. */
70406+ z_hash_table zhash_table;
70407+ z_hash_table zfake_table;
70408+ /* hash table to look up jnodes by inode and offset. */
70409+ j_hash_table jhash_table;
70410+
70411+ /* lock protecting:
70412+ - parent pointers,
70413+ - sibling pointers,
70414+ - znode hash table
70415+ - coord cache
70416+ */
70417+ /* NOTE: The "giant" tree lock could be replaced by more spin locks,
70418+ hoping they would be less contended. We could use one spin lock per
70419+ znode hash bucket. At the cost of some code complexity, sibling
70420+ pointers could be protected by both znode spin locks. However, even if
70421+ that looks more SMP-scalable, we should test this locking change on n-way
70422+ (n > 4) SMP machines. Current tests on a 4-way machine do not show that the
70423+ tree lock is contended or that it is a bottleneck (2003.07.25). */
70424+
70425+ rwlock_t tree_lock;
70426+
70427+ /* lock protecting delimiting keys */
70428+ rwlock_t dk_lock;
70429+
70430+ /* spin lock protecting znode_epoch */
70431+ spinlock_t epoch_lock;
70432+ /* version stamp used to mark znode updates. See seal.[ch] for more
70433+ * information. */
70434+ __u64 znode_epoch;
70435+
70436+ znode *uber;
70437+ node_plugin *nplug;
70438+ struct super_block *super;
70439+ struct {
70440+ /* carry flags used for insertion of new nodes */
70441+ __u32 new_node_flags;
70442+ /* carry flags used for insertion of new extents */
70443+ __u32 new_extent_flags;
70444+ /* carry flags used for paste operations */
70445+ __u32 paste_flags;
70446+ /* carry flags used for insert operations */
70447+ __u32 insert_flags;
70448+ } carry;
70449+};
70450+
70451+extern int reiser4_init_tree(reiser4_tree * tree,
70452+ const reiser4_block_nr * root_block,
70453+ tree_level height, node_plugin * default_plugin);
70454+extern void reiser4_done_tree(reiser4_tree * tree);
70455+
70456+/* cbk flags: options for coord_by_key() */
70457+typedef enum {
70458+ /* coord_by_key() is called for insertion. This is necessary because
70459+ of extents being located at the twig level. For explanation, see
70460+ comment just above is_next_item_internal().
70461+ */
70462+ CBK_FOR_INSERT = (1 << 0),
70463+ /* coord_by_key() is called with key that is known to be unique */
70464+ CBK_UNIQUE = (1 << 1),
70465+ /* coord_by_key() can trust delimiting keys. This options is not user
70466+ accessible. coord_by_key() will set it automatically. It will be
70467+ only cleared by special-case in extents-on-the-twig-level handling
70468+ where it is necessary to insert item with a key smaller than
70469+ leftmost key in a node. This is necessary because of extents being
70470+ located at the twig level. For explanation, see comment just above
70471+ is_next_item_internal().
70472+ */
70473+ CBK_TRUST_DK = (1 << 2),
70474+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
70475+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
70476+ CBK_DKSET = (1 << 5),
70477+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
70478+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */
70479+ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of a long-term
70480+ * lock */
70481+} cbk_flags;
70482+
70483+/* insertion outcome. IBK = insert by key */
70484+typedef enum {
70485+ IBK_INSERT_OK = 0,
70486+ IBK_ALREADY_EXISTS = -EEXIST,
70487+ IBK_IO_ERROR = -EIO,
70488+ IBK_NO_SPACE = -E_NODE_FULL,
70489+ IBK_OOM = -ENOMEM
70490+} insert_result;
70491+
70492+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
70493+
70494+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
70495+ lock_handle * lh, void *arg);
70496+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
70497+ lock_handle * lh,
70498+ tree_iterate_actor_t actor, void *arg,
70499+ znode_lock_mode mode, int through_units_p);
70500+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
70501+ znode_lock_request pri, lock_handle * lh);
70502+
70503+/* return node plugin of @node */
70504+static inline node_plugin *node_plugin_by_node(const znode *
70505+ node /* node to query */ )
70506+{
70507+ assert("vs-213", node != NULL);
70508+ assert("vs-214", znode_is_loaded(node));
70509+
70510+ return node->nplug;
70511+}
70512+
70513+/* number of items in @node */
70514+static inline pos_in_node_t node_num_items(const znode * node)
70515+{
70516+ assert("nikita-2754", znode_is_loaded(node));
70517+ assert("nikita-2468",
70518+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
70519+
70520+ return node->nr_items;
70521+}
70522+
70523+/* Return the number of items at the present node. Asserts coord->node !=
70524+ NULL. */
70525+static inline unsigned coord_num_items(const coord_t * coord)
70526+{
70527+ assert("jmacd-9805", coord->node != NULL);
70528+
70529+ return node_num_items(coord->node);
70530+}
70531+
70532+/* true if @node is empty */
70533+static inline int node_is_empty(const znode * node)
70534+{
70535+ return node_num_items(node) == 0;
70536+}
70537+
70538+typedef enum {
70539+ SHIFTED_SOMETHING = 0,
70540+ SHIFT_NO_SPACE = -E_NODE_FULL,
70541+ SHIFT_IO_ERROR = -EIO,
70542+ SHIFT_OOM = -ENOMEM,
70543+} shift_result;
70544+
70545+extern node_plugin *node_plugin_by_coord(const coord_t * coord);
70546+extern int is_coord_in_node(const coord_t * coord);
70547+extern int key_in_node(const reiser4_key *, const coord_t *);
70548+extern void coord_item_move_to(coord_t * coord, int items);
70549+extern void coord_unit_move_to(coord_t * coord, int units);
70550+
70551+/* there are two types of repetitive accesses (ra): intra-syscall
70552+ (local) and inter-syscall (global). Local ra is used when,
70553+ during a single syscall, we add/delete several items and units in the
70554+ same place in a tree. Note that plan-A fragments local ra by
70555+ separating stat-data and file body in key-space. Global ra is
70556+ used when the user makes repetitive modifications in the same place in a
70557+ tree.
70558+
70559+ Our ra implementation serves the following purposes:
70560+ 1 it affects balancing decisions so that the next operation in a row
70561+ can be performed faster;
70562+ 2 it affects lower-level read-ahead in the page cache;
70563+ 3 it avoids unnecessary lookups by maintaining some state
70564+ across several operations (this is only for local ra);
70565+ 4 it leaves room for lazy micro-balancing: when we start a sequence of
70566+ operations, they are performed without actually doing any intra-node
70567+ shifts until we finish the sequence or the scope of the sequence leaves the
70568+ current node; only then do we really pack the node (local ra only).
70569+*/
70570+
70571+/* another thing that can be useful is to keep a per-tree and/or
70572+ per-process cache of recent lookups. This cache can be organised as a
70573+ list of block numbers of formatted nodes sorted by the starting key in
70574+ each node. Balancing should invalidate the appropriate parts of this
70575+ cache.
70576+*/
70577+
70578+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
70579+ coord_t * coord, lock_handle * handle,
70580+ znode_lock_mode lock, lookup_bias bias,
70581+ tree_level lock_level, tree_level stop_level,
70582+ __u32 flags, ra_info_t *);
70583+
70584+lookup_result reiser4_object_lookup(struct inode *object,
70585+ const reiser4_key * key,
70586+ coord_t * coord,
70587+ lock_handle * lh,
70588+ znode_lock_mode lock_mode,
70589+ lookup_bias bias,
70590+ tree_level lock_level,
70591+ tree_level stop_level,
70592+ __u32 flags, ra_info_t * info);
70593+
70594+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
70595+ reiser4_item_data * data, coord_t * coord,
70596+ lock_handle * lh,
70597+ tree_level stop_level, __u32 flags);
70598+insert_result insert_by_coord(coord_t * coord,
70599+ reiser4_item_data * data, const reiser4_key * key,
70600+ lock_handle * lh, __u32);
70601+insert_result insert_extent_by_coord(coord_t * coord,
70602+ reiser4_item_data * data,
70603+ const reiser4_key * key, lock_handle * lh);
70604+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
70605+ const reiser4_key * to_key,
70606+ reiser4_key * smallest_removed);
70607+int kill_node_content(coord_t * from, coord_t * to,
70608+ const reiser4_key * from_key, const reiser4_key * to_key,
70609+ reiser4_key * smallest_removed,
70610+ znode * locked_left_neighbor, struct inode *inode,
70611+ int truncate);
70612+
70613+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
70614+ reiser4_key * key, lock_handle * lh, cop_insert_flag);
70615+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
70616+ reiser4_item_data * data, unsigned);
70617+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
70618+int find_new_child_ptr(znode * parent, znode * child, znode * left,
70619+ coord_t * result);
70620+
70621+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
70622+int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
70623+
70624+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
70625+
70626+extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
70627+ const reiser4_key *, reiser4_key *,
70628+ struct inode *, int, int *);
70629+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
70630+ const reiser4_key *, reiser4_key *,
70631+ struct inode *, int, int *);
70632+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70633+ const reiser4_key * to, struct inode *, int);
70634+
70635+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
70636+extern int check_tree_pointer(const coord_t * pointer, const znode * child);
70637+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
70638+ znode * left, coord_t * result);
70639+extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
70640+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
70641+ znode * child);
70642+extern znode *child_znode(const coord_t * in_parent, znode * parent,
70643+ int incore_p, int setup_dkeys_p);
70644+
70645+extern int cbk_cache_init(cbk_cache * cache);
70646+extern void cbk_cache_done(cbk_cache * cache);
70647+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
70648+
70649+extern char *sprint_address(const reiser4_block_nr * block);
70650+
70651+#if REISER4_DEBUG
70652+extern void print_coord_content(const char *prefix, coord_t * p);
70653+extern void reiser4_print_address(const char *prefix,
70654+ const reiser4_block_nr * block);
70655+extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
70656+ __u32 flags);
70657+extern void check_dkeys(znode *node);
70658+#else
70659+#define print_coord_content(p, c) noop
70660+#define reiser4_print_address(p, b) noop
70661+#endif
70662+
70663+extern void forget_znode(lock_handle * handle);
70664+extern int deallocate_znode(znode * node);
70665+
70666+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
70667+
70668+/* struct used internally to pack all numerous arguments of tree lookup.
70669+ Used to avoid passing a lot of arguments to helper functions. */
70670+typedef struct cbk_handle {
70671+ /* tree we are in */
70672+ reiser4_tree *tree;
70673+ /* key we are going after */
70674+ const reiser4_key *key;
70675+ /* coord we will store result in */
70676+ coord_t *coord;
70677+ /* type of lock to take on target node */
70678+ znode_lock_mode lock_mode;
70679+ /* lookup bias. See comments at the declaration of lookup_bias */
70680+ lookup_bias bias;
70681+ /* lock level: level starting from which tree traversal starts taking
70682+ * write locks. */
70683+ tree_level lock_level;
70684+ /* level where search will stop. Either item will be found between
70685+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be
70686+ returned.
70687+ */
70688+ tree_level stop_level;
70689+ /* level we are currently at */
70690+ tree_level level;
70691+ /* block number of @active node. Tree traversal operates on two
70692+ nodes: active and parent. */
70693+ reiser4_block_nr block;
70694+ /* put here error message to be printed by caller */
70695+ const char *error;
70696+ /* result passed back to caller */
70697+ lookup_result result;
70698+ /* lock handles for active and parent */
70699+ lock_handle *parent_lh;
70700+ lock_handle *active_lh;
70701+ reiser4_key ld_key;
70702+ reiser4_key rd_key;
70703+ /* flags, passed to the cbk routine. Bits of this bitmask are defined
70704+ in tree.h:cbk_flags enum. */
70705+ __u32 flags;
70706+ ra_info_t *ra_info;
70707+ struct inode *object;
70708+} cbk_handle;
70709+
70710+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
70711+
70712+/* eottl.c */
70713+extern int handle_eottl(cbk_handle *h, int *outcome);
70714+
70715+int lookup_multikey(cbk_handle * handle, int nr_keys);
70716+int lookup_couple(reiser4_tree * tree,
70717+ const reiser4_key * key1, const reiser4_key * key2,
70718+ coord_t * coord1, coord_t * coord2,
70719+ lock_handle * lh1, lock_handle * lh2,
70720+ znode_lock_mode lock_mode, lookup_bias bias,
70721+ tree_level lock_level, tree_level stop_level, __u32 flags,
70722+ int *result1, int *result2);
70723+
70724+static inline void read_lock_tree(reiser4_tree *tree)
70725+{
70726+ /* check that tree is not locked */
70727+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
70728+ LOCK_CNT_NIL(read_locked_tree) &&
70729+ LOCK_CNT_NIL(write_locked_tree)));
70730+ /* check that spinlocks of lower priorities are not held */
70731+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
70732+ LOCK_CNT_NIL(rw_locked_dk) &&
70733+ LOCK_CNT_NIL(spin_locked_stack)));
70734+
70735+ read_lock(&(tree->tree_lock));
70736+
70737+ LOCK_CNT_INC(read_locked_tree);
70738+ LOCK_CNT_INC(rw_locked_tree);
70739+ LOCK_CNT_INC(spin_locked);
70740+}
70741+
70742+static inline void read_unlock_tree(reiser4_tree *tree)
70743+{
70744+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
70745+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
70746+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70747+
70748+ LOCK_CNT_DEC(read_locked_tree);
70749+ LOCK_CNT_DEC(rw_locked_tree);
70750+ LOCK_CNT_DEC(spin_locked);
70751+
70752+ read_unlock(&(tree->tree_lock));
70753+}
70754+
70755+static inline void write_lock_tree(reiser4_tree *tree)
70756+{
70757+ /* check that tree is not locked */
70758+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
70759+ LOCK_CNT_NIL(read_locked_tree) &&
70760+ LOCK_CNT_NIL(write_locked_tree)));
70761+ /* check that spinlocks of lower priorities are not held */
70762+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
70763+ LOCK_CNT_NIL(rw_locked_dk) &&
70764+ LOCK_CNT_NIL(spin_locked_stack)));
70765+
70766+ write_lock(&(tree->tree_lock));
70767+
70768+ LOCK_CNT_INC(write_locked_tree);
70769+ LOCK_CNT_INC(rw_locked_tree);
70770+ LOCK_CNT_INC(spin_locked);
70771+}
70772+
70773+static inline void write_unlock_tree(reiser4_tree *tree)
70774+{
70775+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
70776+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
70777+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70778+
70779+ LOCK_CNT_DEC(write_locked_tree);
70780+ LOCK_CNT_DEC(rw_locked_tree);
70781+ LOCK_CNT_DEC(spin_locked);
70782+
70783+ write_unlock(&(tree->tree_lock));
70784+}
70785+
70786+static inline void read_lock_dk(reiser4_tree *tree)
70787+{
70788+ /* check that dk is not locked */
70789+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
70790+ LOCK_CNT_NIL(read_locked_dk) &&
70791+ LOCK_CNT_NIL(write_locked_dk)));
70792+ /* check that spinlocks of lower priorities are not held */
70793+ assert("", LOCK_CNT_NIL(spin_locked_stack));
70794+
70795+ read_lock(&((tree)->dk_lock));
70796+
70797+ LOCK_CNT_INC(read_locked_dk);
70798+ LOCK_CNT_INC(rw_locked_dk);
70799+ LOCK_CNT_INC(spin_locked);
70800+}
70801+
70802+static inline void read_unlock_dk(reiser4_tree *tree)
70803+{
70804+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
70805+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
70806+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70807+
70808+ LOCK_CNT_DEC(read_locked_dk);
70809+ LOCK_CNT_DEC(rw_locked_dk);
70810+ LOCK_CNT_DEC(spin_locked);
70811+
70812+ read_unlock(&(tree->dk_lock));
70813+}
70814+
70815+static inline void write_lock_dk(reiser4_tree *tree)
70816+{
70817+ /* check that dk is not locked */
70818+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
70819+ LOCK_CNT_NIL(read_locked_dk) &&
70820+ LOCK_CNT_NIL(write_locked_dk)));
70821+ /* check that spinlocks of lower priorities are not held */
70822+ assert("", LOCK_CNT_NIL(spin_locked_stack));
70823+
70824+ write_lock(&((tree)->dk_lock));
70825+
70826+ LOCK_CNT_INC(write_locked_dk);
70827+ LOCK_CNT_INC(rw_locked_dk);
70828+ LOCK_CNT_INC(spin_locked);
70829+}
70830+
70831+static inline void write_unlock_dk(reiser4_tree *tree)
70832+{
70833+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
70834+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
70835+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
70836+
70837+ LOCK_CNT_DEC(write_locked_dk);
70838+ LOCK_CNT_DEC(rw_locked_dk);
70839+ LOCK_CNT_DEC(spin_locked);
70840+
70841+ write_unlock(&(tree->dk_lock));
70842+}
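+
+/* An editorial summary inferred from the assertions above: the
+ LOCK_CNT_* checks encode a lock ordering. tree_lock must be taken
+ before the txnh spinlock, dk_lock and the lock-stack spinlock, and
+ dk_lock must be taken before the lock-stack spinlock; acquiring them
+ in the opposite order trips an assertion in debug builds. */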
70843+
70844+/* estimate api. Implementation is in estimate.c */
70845+reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
70846+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
70847+reiser4_block_nr estimate_insert_flow(tree_level);
70848+reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
70849+reiser4_block_nr calc_estimate_one_insert(tree_level);
70850+reiser4_block_nr estimate_dirty_cluster(struct inode *);
70851+reiser4_block_nr estimate_insert_cluster(struct inode *);
70852+reiser4_block_nr estimate_update_cluster(struct inode *);
70853+
70854+/* __REISER4_TREE_H__ */
70855+#endif
70856+
70857+/* Make Linus happy.
70858+ Local variables:
70859+ c-indentation-style: "K&R"
70860+ mode-name: "LC"
70861+ c-basic-offset: 8
70862+ tab-width: 8
70863+ fill-column: 120
70864+ scroll-step: 1
70865+ End:
70866+*/
70867diff -urN linux-2.6.22.orig/fs/reiser4/tree_mod.c linux-2.6.22/fs/reiser4/tree_mod.c
70868--- linux-2.6.22.orig/fs/reiser4/tree_mod.c 1970-01-01 03:00:00.000000000 +0300
70869+++ linux-2.6.22/fs/reiser4/tree_mod.c 2007-07-29 00:25:35.032736855 +0400
70870@@ -0,0 +1,386 @@
70871+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70872+ * reiser4/README */
70873+
70874+/*
70875+ * Functions to add/delete new nodes to/from the tree.
70876+ *
70877+ * Functions from this file are used by carry (see carry*) to handle:
70878+ *
70879+ * . insertion of new formatted node into tree
70880+ *
70881+ * . addition of new tree root, increasing tree height
70882+ *
70883+ * . removing tree root, decreasing tree height
70884+ *
70885+ */
70886+
70887+#include "forward.h"
70888+#include "debug.h"
70889+#include "dformat.h"
70890+#include "key.h"
70891+#include "coord.h"
70892+#include "plugin/plugin.h"
70893+#include "jnode.h"
70894+#include "znode.h"
70895+#include "tree_mod.h"
70896+#include "block_alloc.h"
70897+#include "tree_walk.h"
70898+#include "tree.h"
70899+#include "super.h"
70900+
70901+#include <linux/err.h>
70902+
70903+static int add_child_ptr(znode * parent, znode * child);
70904+/* warning only issued if error is not -E_REPEAT */
70905+#define ewarning( error, ... ) \
70906+ if( ( error ) != -E_REPEAT ) \
70907+ warning( __VA_ARGS__ )
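+
+/* Illustrative usage (editorial; it matches the calls below):
+ ewarning(retcode, "nikita-928", "Cannot allocate block for carry: %i",
+ retcode) prints nothing when retcode is -E_REPEAT, since a forced
+ repeat is reported to the caller rather than logged. */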
70908+
70909+/* allocate a new node at @level, immediately to the right of @brother. */
70910+znode * reiser4_new_node(znode * brother /* existing left neighbor
70911+ * of new node */,
70912+ tree_level level /* tree level at which new node is to
70913+ * be allocated */)
70914+{
70915+ znode *result;
70916+ int retcode;
70917+ reiser4_block_nr blocknr;
70918+
70919+ assert("nikita-930", brother != NULL);
70920+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
70921+
70922+ retcode = assign_fake_blocknr_formatted(&blocknr);
70923+ if (retcode == 0) {
70924+ result =
70925+ zget(znode_get_tree(brother), &blocknr, NULL, level,
70926+ reiser4_ctx_gfp_mask_get());
70927+ if (IS_ERR(result)) {
70928+ ewarning(PTR_ERR(result), "nikita-929",
70929+ "Cannot allocate znode for carry: %li",
70930+ PTR_ERR(result));
70931+ return result;
70932+ }
70933+ /* cheap test, can be executed even when debugging is off */
70934+ if (!znode_just_created(result)) {
70935+ warning("nikita-2213",
70936+ "Allocated already existing block: %llu",
70937+ (unsigned long long)blocknr);
70938+ zput(result);
70939+ return ERR_PTR(RETERR(-EIO));
70940+ }
70941+
70942+ assert("nikita-931", result != NULL);
70943+ result->nplug = znode_get_tree(brother)->nplug;
70944+ assert("nikita-933", result->nplug != NULL);
70945+
70946+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
70947+ if (retcode == 0) {
70948+ ZF_SET(result, JNODE_CREATED);
70949+ zrelse(result);
70950+ } else {
70951+ zput(result);
70952+ result = ERR_PTR(retcode);
70953+ }
70954+ } else {
70955+ /* failure to allocate new node during balancing.
70956+ This should never happen. Ever. Returning -E_REPEAT
70957+ is not a viable solution, because "out of disk space"
70958+ is not a transient error that will go away by itself.
70959+ */
70960+ ewarning(retcode, "nikita-928",
70961+ "Cannot allocate block for carry: %i", retcode);
70962+ result = ERR_PTR(retcode);
70963+ }
70964+ assert("nikita-1071", result != NULL);
70965+ return result;
70966+}
70967+
70968+/* allocate new root and add it to the tree
70969+
70970+ This helper function is called by add_new_root().
70971+
70972+*/
70973+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
70974+ znode * fake /* "fake" znode */ )
70975+{
70976+ reiser4_tree *tree = znode_get_tree(old_root);
70977+ znode *new_root = NULL; /* to shut gcc up */
70978+ int result;
70979+
70980+ assert("nikita-1069", old_root != NULL);
70981+ assert("umka-262", fake != NULL);
70982+ assert("umka-263", tree != NULL);
70983+
70984+ /* "fake" znode---one always hanging just above current root. This
70985+ node is locked when new root is created or existing root is
70986+ deleted. Downward tree traversal takes lock on it before taking
70987+ lock on a root node. This avoids race conditions with root
70988+ manipulations.
70989+
70990+ */
70991+ assert("nikita-1348", znode_above_root(fake));
70992+ assert("nikita-1211", znode_is_root(old_root));
70993+
70994+ result = 0;
70995+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
70996+ warning("nikita-1344", "Tree is too tall: %i", tree->height);
70997+ /* ext2 returns -ENOSPC when it runs out of free inodes with a
70998+ following comment (fs/ext2/ialloc.c:441): Is it really
70999+ ENOSPC?
71000+
71001+ -EXFULL? -EINVAL?
71002+ */
71003+ result = RETERR(-ENOSPC);
71004+ } else {
71005+ /* Allocate block for new root. It's not that
71006+ important where it will be allocated, as root is
71007+ almost always in memory. Moreover, allocate-on-flush
71008+ can be going on here.
71009+ */
71010+ assert("nikita-1448", znode_is_root(old_root));
71011+ new_root = reiser4_new_node(fake, tree->height + 1);
71012+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71013+ lock_handle rlh;
71014+
71015+ init_lh(&rlh);
71016+ result =
71017+ longterm_lock_znode(&rlh, new_root,
71018+ ZNODE_WRITE_LOCK,
71019+ ZNODE_LOCK_LOPRI);
71020+ if (result == 0) {
71021+ parent_coord_t *in_parent;
71022+
71023+ znode_make_dirty(fake);
71024+
71025+ /* new root is a child of "fake" node */
71026+ write_lock_tree(tree);
71027+
71028+ ++tree->height;
71029+
71030+ /* recalculate max balance overhead */
71031+ tree->estimate_one_insert =
71032+ estimate_one_insert_item(tree);
71033+
71034+ tree->root_block = *znode_get_block(new_root);
71035+ in_parent = &new_root->in_parent;
71036+ init_parent_coord(in_parent, fake);
71037+ /* manually insert the new root into the sibling
71038+ * list. With this, all nodes involved in
71039+ * balancing are connected after balancing is
71040+ * done---a useful invariant to check. */
71041+ sibling_list_insert_nolock(new_root, NULL);
71042+ write_unlock_tree(tree);
71043+
71044+ /* insert into new root pointer to the
71045+ @old_root. */
71046+ assert("nikita-1110",
71047+ WITH_DATA(new_root,
71048+ node_is_empty(new_root)));
71049+ write_lock_dk(tree);
71050+ znode_set_ld_key(new_root, reiser4_min_key());
71051+ znode_set_rd_key(new_root, reiser4_max_key());
71052+ write_unlock_dk(tree);
71053+ if (REISER4_DEBUG) {
71054+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71055+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71056+ ZF_SET(old_root, JNODE_ORPHAN);
71057+ }
71058+ result = add_child_ptr(new_root, old_root);
71059+ done_lh(&rlh);
71060+ }
71061+ zrelse(new_root);
71062+ }
71063+ }
71064+ if (result != 0)
71065+ new_root = ERR_PTR(result);
71066+ return new_root;
71067+}
71068+
71069+/* build &reiser4_item_data for inserting child pointer
71070+
71071+ Build &reiser4_item_data that can be later used to insert pointer to @child
71072+ in its parent.
71073+
71074+*/
71075+void build_child_ptr_data(znode * child /* node pointer to which will be
71076+ * inserted */ ,
71077+ reiser4_item_data * data /* where to store result */ )
71078+{
71079+ assert("nikita-1116", child != NULL);
71080+ assert("nikita-1117", data != NULL);
71081+
71082+ /*
71083+ * NOTE: use the address of the child's blocknr as the address of the data to
71084+ * be inserted. As a result, the data gets into the on-disk structure in CPU
71085+ * byte order; the internal item's create_hook converts it to little-endian
71086+ * byte order.
71087+ */
71088+ data->data = (char *)znode_get_block(child);
71089+ /* data -> data is kernel space */
71090+ data->user = 0;
71091+ data->length = sizeof(reiser4_block_nr);
71092+ /* FIXME-VS: hardcoded internal item? */
71093+
71094+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71095+ data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71096+}
71097+
71098+/* add pointer to @child into empty @parent.
71099+
71100+ This is used when pointer to old root is inserted into new root which is
71101+ empty.
71102+*/
71103+static int add_child_ptr(znode * parent, znode * child)
71104+{
71105+ coord_t coord;
71106+ reiser4_item_data data;
71107+ int result;
71108+ reiser4_key key;
71109+
71110+ assert("nikita-1111", parent != NULL);
71111+ assert("nikita-1112", child != NULL);
71112+ assert("nikita-1115",
71113+ znode_get_level(parent) == znode_get_level(child) + 1);
71114+
71115+ result = zload(parent);
71116+ if (result != 0)
71117+ return result;
71118+ assert("nikita-1113", node_is_empty(parent));
71119+ coord_init_first_unit(&coord, parent);
71120+
71121+ build_child_ptr_data(child, &data);
71122+ data.arg = NULL;
71123+
71124+ read_lock_dk(znode_get_tree(parent));
71125+ key = *znode_get_ld_key(child);
71126+ read_unlock_dk(znode_get_tree(parent));
71127+
71128+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71129+ NULL);
71130+ znode_make_dirty(parent);
71131+ zrelse(parent);
71132+ return result;
71133+}
71134+
71135+/* actually remove tree root */
71136+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
71137+ * being removed */,
71138+ znode * old_root /* root node that is being
71139+ * removed */ ,
71140+ znode * new_root /* new root---sole child of
71141+ * @old_root */,
71142+ const reiser4_block_nr * new_root_blk /* disk address of
71143+ * @new_root */)
71144+{
71145+ znode *uber;
71146+ int result;
71147+ lock_handle handle_for_uber;
71148+
71149+ assert("umka-265", tree != NULL);
71150+ assert("nikita-1198", new_root != NULL);
71151+ assert("nikita-1199",
71152+ znode_get_level(new_root) + 1 == znode_get_level(old_root));
71153+
71154+ assert("nikita-1201", znode_is_write_locked(old_root));
71155+
71156+ assert("nikita-1203",
71157+ disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71158+
71159+ init_lh(&handle_for_uber);
71160+ /* obtain and lock "fake" znode protecting changes in tree height. */
71161+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71162+ &handle_for_uber);
71163+ if (result == 0) {
71164+ uber = handle_for_uber.node;
71165+
71166+ znode_make_dirty(uber);
71167+
71168+ /* don't take a long-term lock on @new_root. Take a spinlock. */
71169+
71170+ write_lock_tree(tree);
71171+
71172+ tree->root_block = *new_root_blk;
71173+ --tree->height;
71174+
71175+ /* recalculate max balance overhead */
71176+ tree->estimate_one_insert = estimate_one_insert_item(tree);
71177+
71178+ assert("nikita-1202",
71179+ tree->height == znode_get_level(new_root));
71180+
71181+ /* new root is a child of the "fake" node */
71182+ init_parent_coord(&new_root->in_parent, uber);
71183+ ++uber->c_count;
71184+
71185+ /* sibling_list_insert_nolock(new_root, NULL); */
71186+ write_unlock_tree(tree);
71187+
71188+ /* reinitialise old root. */
71189+ result = node_plugin_by_node(old_root)->init(old_root);
71190+ znode_make_dirty(old_root);
71191+ if (result == 0) {
71192+ assert("nikita-1279", node_is_empty(old_root));
71193+ ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71194+ old_root->c_count = 0;
71195+ }
71196+ }
71197+ done_lh(&handle_for_uber);
71198+
71199+ return result;
71200+}
71201+
71202+/* remove tree root
71203+
71204+ This function removes the tree root, decreasing the tree height by one. The tree root
71205+ and its only child (which is going to become the new tree root) are write-locked
71206+ at entry.
71207+
71208+ To remove tree root we need to take lock on special "fake" znode that
71209+ protects changes of tree height. See comments in reiser4_add_tree_root() for
71210+ more on this.
71211+
71212+ Also, parent pointers have to be updated in the
71213+ old and new root. To simplify the code, the function is split into two parts: the outer
71214+ reiser4_kill_tree_root() collects all necessary arguments and calls
71215+ reiser4_kill_root() to do the actual job.
71216+
71217+*/
71218+int reiser4_kill_tree_root(znode * old_root /* tree root that we are
71219+ removing*/)
71220+{
71221+ int result;
71222+ coord_t down_link;
71223+ znode *new_root;
71224+ reiser4_tree *tree;
71225+
71226+ assert("umka-266", current_tree != NULL);
71227+ assert("nikita-1194", old_root != NULL);
71228+ assert("nikita-1196", znode_is_root(old_root));
71229+ assert("nikita-1200", node_num_items(old_root) == 1);
71230+ assert("nikita-1401", znode_is_write_locked(old_root));
71231+
71232+ coord_init_first_unit(&down_link, old_root);
71233+
71234+ tree = znode_get_tree(old_root);
71235+ new_root = child_znode(&down_link, old_root, 0, 1);
71236+ if (!IS_ERR(new_root)) {
71237+ result =
71238+ reiser4_kill_root(tree, old_root, new_root,
71239+ znode_get_block(new_root));
71240+ zput(new_root);
71241+ } else
71242+ result = PTR_ERR(new_root);
71243+
71244+ return result;
71245+}
71246+
71247+/* Make Linus happy.
71248+ Local variables:
71249+ c-indentation-style: "K&R"
71250+ mode-name: "LC"
71251+ c-basic-offset: 8
71252+ tab-width: 8
71253+ fill-column: 120
71254+ scroll-step: 1
71255+ End:
71256+*/
71257diff -urN linux-2.6.22.orig/fs/reiser4/tree_mod.h linux-2.6.22/fs/reiser4/tree_mod.h
71258--- linux-2.6.22.orig/fs/reiser4/tree_mod.h 1970-01-01 03:00:00.000000000 +0300
71259+++ linux-2.6.22/fs/reiser4/tree_mod.h 2007-07-29 00:25:35.032736855 +0400
71260@@ -0,0 +1,29 @@
71261+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71262+ * reiser4/README */
71263+
71264+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
71265+ * comments. */
71266+
71267+#if !defined( __REISER4_TREE_MOD_H__ )
71268+#define __REISER4_TREE_MOD_H__
71269+
71270+#include "forward.h"
71271+
71272+znode *reiser4_new_node(znode * brother, tree_level level);
71273+znode *reiser4_add_tree_root(znode * old_root, znode * fake);
71274+int reiser4_kill_tree_root(znode * old_root);
71275+void build_child_ptr_data(znode * child, reiser4_item_data * data);
71276+
71277+/* __REISER4_TREE_MOD_H__ */
71278+#endif
71279+
71280+/* Make Linus happy.
71281+ Local variables:
71282+ c-indentation-style: "K&R"
71283+ mode-name: "LC"
71284+ c-basic-offset: 8
71285+ tab-width: 8
71286+ fill-column: 120
71287+ scroll-step: 1
71288+ End:
71289+*/
71290diff -urN linux-2.6.22.orig/fs/reiser4/tree_walk.c linux-2.6.22/fs/reiser4/tree_walk.c
71291--- linux-2.6.22.orig/fs/reiser4/tree_walk.c 1970-01-01 03:00:00.000000000 +0300
71292+++ linux-2.6.22/fs/reiser4/tree_walk.c 2007-07-29 00:25:35.032736855 +0400
71293@@ -0,0 +1,927 @@
71294+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71295+ * reiser4/README */
71296+
71297+/* Routines and macros to:
71298+
71299+ get_left_neighbor()
71300+
71301+ get_right_neighbor()
71302+
71303+ get_parent()
71304+
71305+ get_first_child()
71306+
71307+ get_last_child()
71308+
71309+ various routines to walk the whole tree and do things to it like
71310+ repack it, or move it to tertiary storage. Please make them as
71311+ generic as is reasonable.
71312+
71313+*/
71314+
71315+#include "forward.h"
71316+#include "debug.h"
71317+#include "dformat.h"
71318+#include "coord.h"
71319+#include "plugin/item/item.h"
71320+#include "jnode.h"
71321+#include "znode.h"
71322+#include "tree_walk.h"
71323+#include "tree.h"
71324+#include "super.h"
71325+
71326+/* These macros are used internally in tree_walk.c in an attempt to make
71327+ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor and
71328+ lock_left_neighbor */
71329+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
71330+#define FIELD_OFFSET(name) offsetof(znode, name)
71331+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
71332+#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
71333+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
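+
+/* An illustrative note (editorial): with these macros,
+ GET_NODE_BY_PTR_OFFSET(node, LEFT_PTR_OFFSET) evaluates to
+ node->left and GET_NODE_BY_PTR_OFFSET(node, PARENT_PTR_OFFSET) to
+ node->in_parent.node, which is how the single lock_neighbor() body
+ below serves lock_parent(), lock_left_neighbor and
+ lock_right_neighbor alike. */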
71334+
71335+/* This is the generic procedure to get and lock a `generic' neighbor (left or
71336+ right neighbor or parent). It implements the common algorithm for all cases of
71337+ getting a lock on a neighbor node; only the znode structure field differs in
71338+ each case. This is parameterized by the ptr_offset argument, which is the byte
71339+ offset of the pointer to the desired neighbor within the current node's
71340+ znode structure. This function should be called with the tree lock held */
71341+static int lock_neighbor(
71342+ /* resulting lock handle */
71343+ lock_handle * result,
71344+ /* znode to lock */
71345+ znode * node,
71346+ /* pointer to neighbor (or parent) znode field offset, in bytes from
71347+ the base address of znode structure */
71348+ int ptr_offset,
71349+ /* lock mode for longterm_lock_znode call */
71350+ znode_lock_mode mode,
71351+ /* lock request for longterm_lock_znode call */
71352+ znode_lock_request req,
71353+ /* GN_* flags */
71354+ int flags, int rlocked)
71355+{
71356+ reiser4_tree *tree = znode_get_tree(node);
71357+ znode *neighbor;
71358+ int ret;
71359+
71360+ assert("umka-236", node != NULL);
71361+ assert("umka-237", tree != NULL);
71362+ assert_rw_locked(&(tree->tree_lock));
71363+
71364+ if (flags & GN_TRY_LOCK)
71365+ req |= ZNODE_LOCK_NONBLOCK;
71366+ if (flags & GN_SAME_ATOM)
71367+ req |= ZNODE_LOCK_DONT_FUSE;
71368+
71369+ /* get the neighbor's address by using the sibling link; quit the while loop
71370+ (and return) if the link is not available. */
71371+ while (1) {
71372+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
71373+
71374+ /* return -E_NO_NEIGHBOR if the parent or side pointer is NULL, or if
71375+ * the node pointed to by it is not connected.
71376+ *
71377+ * However, the GN_ALLOW_NOT_CONNECTED option masks the "connected"
71378+ * check and allows passing a reference to a not-yet-connected znode to
71379+ * the subsequent longterm_lock_znode() call. This kills a possible
71380+ * busy loop if we are trying to get a longterm lock on a locked but
71381+ * not yet connected parent node. */
71382+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
71383+ || znode_is_connected(neighbor))) {
71384+ return RETERR(-E_NO_NEIGHBOR);
71385+ }
71386+
71387+ /* protect it from deletion. */
71388+ zref(neighbor);
71389+
71390+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71391+
71392+ ret = longterm_lock_znode(result, neighbor, mode, req);
71393+
71394+ /* The lock handle obtains its own reference, release the one from above. */
71395+ zput(neighbor);
71396+
71397+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71398+
71399+ /* restart if the node we got a reference to is being
71400+ invalidated. We should not get a reference to this node
71401+ again. */
71402+ if (ret == -EINVAL)
71403+ continue;
71404+ if (ret)
71405+ return ret;
71406+
71407+ /* check if the neighbor link still points to the just-locked znode;
71408+ the link could have been changed while the process slept. */
71409+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
71410+ return 0;
71411+
71412+ /* znode was locked by mistake; unlock it and restart the locking
71413+ process from the beginning. */
71414+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
71415+ longterm_unlock_znode(result);
71416+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
71417+ }
71418+}
71419+
71420+/* get parent node with longterm lock, accepts GN* flags. */
71421+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
71422+ znode * node /* child node */ ,
71423+ znode_lock_mode mode
71424+ /* type of lock: read or write */ ,
71425+ int flags /* GN_* flags */ )
71426+{
71427+ int result;
71428+
71429+ read_lock_tree(znode_get_tree(node));
71430+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
71431+ ZNODE_LOCK_HIPRI, flags, 1);
71432+ read_unlock_tree(znode_get_tree(node));
71433+ return result;
71434+}
71435+
71436+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
71437+ bit in @flags parameter */
71438+/* Audited by: umka (2002.06.14) */
71439+static inline int
71440+lock_side_neighbor(lock_handle * result,
71441+ znode * node, znode_lock_mode mode, int flags, int rlocked)
71442+{
71443+ int ret;
71444+ int ptr_offset;
71445+ znode_lock_request req;
71446+
71447+ if (flags & GN_GO_LEFT) {
71448+ ptr_offset = LEFT_PTR_OFFSET;
71449+ req = ZNODE_LOCK_LOPRI;
71450+ } else {
71451+ ptr_offset = RIGHT_PTR_OFFSET;
71452+ req = ZNODE_LOCK_HIPRI;
71453+ }
71454+
71455+ ret =
71456+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
71457+
71458+ if (ret == -E_NO_NEIGHBOR) /* if we walk left or right, -E_NO_NEIGHBOR
71459+ * does not guarantee that the neighbor is
71460+ * absent in the tree; in this case we return
71461+ * -ENOENT -- meaning the neighbor was at
71462+ * least not found in the cache */
71463+ return RETERR(-ENOENT);
71464+
71465+ return ret;
71466+}
71467+
71468+#if REISER4_DEBUG
71469+
71470+int check_sibling_list(znode * node)
71471+{
71472+ znode *scan;
71473+ znode *next;
71474+
71475+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
71476+
71477+ if (node == NULL)
71478+ return 1;
71479+
71480+ if (ZF_ISSET(node, JNODE_RIP))
71481+ return 1;
71482+
71483+ assert("nikita-3270", node != NULL);
71484+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
71485+
71486+ for (scan = node; znode_is_left_connected(scan); scan = next) {
71487+ next = scan->left;
71488+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71489+ assert("nikita-3271", znode_is_right_connected(next));
71490+ assert("nikita-3272", next->right == scan);
71491+ } else
71492+ break;
71493+ }
71494+ for (scan = node; znode_is_right_connected(scan); scan = next) {
71495+ next = scan->right;
71496+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
71497+ assert("nikita-3273", znode_is_left_connected(next));
71498+ assert("nikita-3274", next->left == scan);
71499+ } else
71500+ break;
71501+ }
71502+ return 1;
71503+}
71504+
71505+#endif
71506+
71507+/* Znode sibling pointers maintenance. */
71508+
71509+/* Znode sibling pointers are established between any neighboring nodes which
71510+ are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
71511+ JNODE_RIGHT_CONNECTED); if the left or right sibling pointer contains an
71512+ actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
71513+
71514+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
71515+ take care of searching (a hash table lookup may be required) for znode
71516+ neighbors, establishing sibling pointers between them and setting the
71517+ JNODE_*_CONNECTED state bits. */
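/* Editor's note: an illustrative, stand-alone model of the "connected"
   state described above -- a sibling pointer is meaningful only when the
   matching CONNECTED bit is set, and NULL is then a valid value meaning
   "no neighbor exists". Names are hypothetical, not reiser4 code. */
#include <stdbool.h>

enum { DEMO_LEFT_CONNECTED = 1 << 0, DEMO_RIGHT_CONNECTED = 1 << 1 };

struct demo_znode {
	struct demo_znode *left;
	struct demo_znode *right;
	unsigned flags;
};

/* true when the left pointer may be trusted (even if it is NULL) */
static bool demo_left_connected(const struct demo_znode *z)
{
	return z->flags & DEMO_LEFT_CONNECTED;
}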
71518+
71519+/* adjusting of sibling pointers and `connected' states for two
71520+ neighbors; works if one neighbor is NULL (was not found). */
71521+
71522+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
71523+void link_left_and_right(znode * left, znode * right)
71524+{
71525+ assert("nikita-3275", check_sibling_list(left));
71526+ assert("nikita-3275", check_sibling_list(right));
71527+
71528+ if (left != NULL) {
71529+ if (left->right == NULL) {
71530+ left->right = right;
71531+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
71532+
71533+ ON_DEBUG(left->right_version =
71534+ atomic_inc_return(&delim_key_version);
71535+ );
71536+
71537+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
71538+ && left->right != right) {
71539+
71540+ ON_DEBUG(left->right->left_version =
71541+ atomic_inc_return(&delim_key_version);
71542+ left->right_version =
71543+ atomic_inc_return(&delim_key_version););
71544+
71545+ left->right->left = NULL;
71546+ left->right = right;
71547+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
71548+ } else
71549+ /*
71550+ * there is a race condition in renew_sibling_link()
71551+ * and assertions below check that it is only one
71552+ * there. Thread T1 calls renew_sibling_link() without
71553+ * GN_NO_ALLOC flag. zlook() doesn't find neighbor
71554+ * node, but before T1 gets to the
71555+ * link_left_and_right(), another thread T2 creates
71556+ * neighbor node and connects it. check for
71557+ * left->right == NULL above protects T1 from
71558+ * overwriting correct left->right pointer installed
71559+ * by T2.
71560+ */
71561+ assert("nikita-3302",
71562+ right == NULL || left->right == right);
71563+ }
71564+ if (right != NULL) {
71565+ if (right->left == NULL) {
71566+ right->left = left;
71567+ ZF_SET(right, JNODE_LEFT_CONNECTED);
71568+
71569+ ON_DEBUG(right->left_version =
71570+ atomic_inc_return(&delim_key_version);
71571+ );
71572+
71573+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
71574+ && right->left != left) {
71575+
71576+ ON_DEBUG(right->left->right_version =
71577+ atomic_inc_return(&delim_key_version);
71578+ right->left_version =
71579+ atomic_inc_return(&delim_key_version););
71580+
71581+ right->left->right = NULL;
71582+ right->left = left;
71583+ ZF_SET(right, JNODE_LEFT_CONNECTED);
71584+
71585+ } else
71586+ assert("nikita-3303",
71587+ left == NULL || right->left == left);
71588+ }
71589+ assert("nikita-3275", check_sibling_list(left));
71590+ assert("nikita-3275", check_sibling_list(right));
71591+}
71592+
71593+/* Audited by: umka (2002.06.14) */
71594+static void link_znodes(znode * first, znode * second, int to_left)
71595+{
71596+ if (to_left)
71597+ link_left_and_right(second, first);
71598+ else
71599+ link_left_and_right(first, second);
71600+}
71601+
71602+/* Get the next (to the left or to the right, depending on the GN_GO_LEFT bit
71603+ in flags) unit position of the coord in the horizontal direction, even
71604+ across a node boundary. Should be called under the tree lock; it protects
71605+ the nonexistence of a sibling link on the parent level if
71606+ lock_side_neighbor() fails with -ENOENT. */
71607+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
71608+{
71609+ int ret;
71610+ znode *node;
71611+ reiser4_tree *tree;
71612+
71613+ assert("umka-243", coord != NULL);
71614+ assert("umka-244", handle != NULL);
71615+ assert("zam-1069", handle->node == NULL);
71616+
71617+ ret =
71618+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
71619+ coord_next_unit(coord);
71620+ if (!ret)
71621+ return 0;
71622+
71623+ ret =
71624+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
71625+ if (ret)
71626+ return ret;
71627+
71628+ node = handle->node;
71629+ tree = znode_get_tree(node);
71630+ write_unlock_tree(tree);
71631+
71632+ coord_init_zero(coord);
71633+
71634+ /* We avoid synchronous read here if it is specified by flag. */
71635+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
71636+ ret = jstartio(ZJNODE(handle->node));
71637+ if (!ret)
71638+ ret = -E_REPEAT;
71639+ goto error_locked;
71640+ }
71641+
71642+ /* the corresponding zrelse() should be called by the clients of
71643+ far_next_coord(), at the place where this node gets unlocked. */
71644+ ret = zload(handle->node);
71645+ if (ret)
71646+ goto error_locked;
71647+
71648+ if (flags & GN_GO_LEFT)
71649+ coord_init_last_unit(coord, node);
71650+ else
71651+ coord_init_first_unit(coord, node);
71652+
71653+ if (0) {
71654+ error_locked:
71655+ longterm_unlock_znode(handle);
71656+ }
71657+ write_lock_tree(tree);
71658+ return ret;
71659+}
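/* Editor's note: the "if (0) { error_locked: ... }" construct above is a
   classic C error-exit idiom: the block is unreachable by fall-through
   and executes only via goto, after which control continues with the
   common tail. A minimal self-contained model: */
#include <stdio.h>

static int demo(int fail)
{
	int ret = 0;

	if (fail) {
		ret = -1;
		goto error;
	}
	printf("success-only work\n");

	if (0) {
	error:
		printf("error-only cleanup\n");
	}
	printf("common tail runs either way\n");
	return ret;
}

int main(void)
{
	return demo(0) + demo(1) + 1;	/* exit status 0 */
}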
71660+
71661+/* A very significant function, which performs a step in the horizontal
71662+ direction when a sibling pointer is not available. Actually, it is the only
71663+ function which does this.
71664+ Note: this function does not restore the locking status at exit; the
71665+ caller should take care of proper unlocking and zrelse-ing. */
71666+static int
71667+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
71668+ tree_level level, int flags, int *nr_locked)
71669+{
71670+ int ret;
71671+ int to_left = flags & GN_GO_LEFT;
71672+ reiser4_block_nr da;
71673+ /* parent of the neighbor node; we set it to the child's parent until
71674+ we detect that child and neighbor do not share one parent */
71675+ znode *side_parent = coord->node;
71676+ reiser4_tree *tree = znode_get_tree(child);
71677+ znode *neighbor = NULL;
71678+
71679+ assert("umka-245", coord != NULL);
71680+ assert("umka-246", handle != NULL);
71681+ assert("umka-247", child != NULL);
71682+ assert("umka-303", tree != NULL);
71683+
71684+ init_lh(handle);
71685+ write_lock_tree(tree);
71686+ ret = far_next_coord(coord, handle, flags);
71687+
71688+ if (ret) {
71689+ if (ret != -ENOENT) {
71690+ write_unlock_tree(tree);
71691+ return ret;
71692+ }
71693+ } else {
71694+ item_plugin *iplug;
71695+
71696+ if (handle->node != NULL) {
71697+ (*nr_locked)++;
71698+ side_parent = handle->node;
71699+ }
71700+
71701+ /* does the coord object point to an internal item? We do not
71702+ support sibling pointers between znodes for formatted and
71703+ unformatted nodes, and return -E_NO_NEIGHBOR in that case. */
71704+ iplug = item_plugin_by_coord(coord);
71705+ if (!item_is_internal(coord)) {
71706+ link_znodes(child, NULL, to_left);
71707+ write_unlock_tree(tree);
71708+ /* we know there can't be formatted neighbor */
71709+ return RETERR(-E_NO_NEIGHBOR);
71710+ }
71711+ write_unlock_tree(tree);
71712+
71713+ iplug->s.internal.down_link(coord, NULL, &da);
71714+
71715+ if (flags & GN_NO_ALLOC) {
71716+ neighbor = zlook(tree, &da);
71717+ } else {
71718+ neighbor =
71719+ zget(tree, &da, side_parent, level,
71720+ reiser4_ctx_gfp_mask_get());
71721+ }
71722+
71723+ if (IS_ERR(neighbor)) {
71724+ ret = PTR_ERR(neighbor);
71725+ return ret;
71726+ }
71727+
71728+ if (neighbor)
71729+ /* update delimiting keys */
71730+ set_child_delimiting_keys(coord->node, coord, neighbor);
71731+
71732+ write_lock_tree(tree);
71733+ }
71734+
71735+ if (likely(neighbor == NULL ||
71736+ (znode_get_level(child) == znode_get_level(neighbor)
71737+ && child != neighbor)))
71738+ link_znodes(child, neighbor, to_left);
71739+ else {
71740+ warning("nikita-3532",
71741+ "Sibling nodes on the different levels: %i != %i\n",
71742+ znode_get_level(child), znode_get_level(neighbor));
71743+ ret = RETERR(-EIO);
71744+ }
71745+
71746+ write_unlock_tree(tree);
71747+
71748+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
71749+ if (neighbor != NULL && (flags & GN_NO_ALLOC))
71750+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */
71751+ zput(neighbor);
71752+
71753+ return ret;
71754+}
71755+
71756+/* This function is for establishing a one-side relation. */
71757+/* Audited by: umka (2002.06.14) */
71758+static int connect_one_side(coord_t * coord, znode * node, int flags)
71759+{
71760+ coord_t local;
71761+ lock_handle handle;
71762+ int nr_locked;
71763+ int ret;
71764+
71765+ assert("umka-248", coord != NULL);
71766+ assert("umka-249", node != NULL);
71767+
71768+ coord_dup_nocheck(&local, coord);
71769+
71770+ init_lh(&handle);
71771+
71772+ ret =
71773+ renew_sibling_link(&local, &handle, node, znode_get_level(node),
71774+ flags | GN_NO_ALLOC, &nr_locked);
71775+
71776+ if (handle.node != NULL) {
71777+ /* complementary operations for zload() and lock() in far_next_coord() */
71778+ zrelse(handle.node);
71779+ longterm_unlock_znode(&handle);
71780+ }
71781+
71782+ /* we catch error codes which are not interesting for us because we
71783+ run renew_sibling_link() only for znode connection. */
71784+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
71785+ return 0;
71786+
71787+ return ret;
71788+}
71789+
71790+/* if @child is not in `connected' state, performs hash searches for left and
71791+ right neighbor nodes and establishes horizontal sibling links */
71792+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
71793+int connect_znode(coord_t * parent_coord, znode * child)
71794+{
71795+ reiser4_tree *tree = znode_get_tree(child);
71796+ int ret = 0;
71797+
71798+ assert("zam-330", parent_coord != NULL);
71799+ assert("zam-331", child != NULL);
71800+ assert("zam-332", parent_coord->node != NULL);
71801+ assert("umka-305", tree != NULL);
71802+
71803+ /* it is trivial to `connect' root znode because it can't have
71804+ neighbors */
71805+ if (znode_above_root(parent_coord->node)) {
71806+ child->left = NULL;
71807+ child->right = NULL;
71808+ ZF_SET(child, JNODE_LEFT_CONNECTED);
71809+ ZF_SET(child, JNODE_RIGHT_CONNECTED);
71810+
71811+ ON_DEBUG(child->left_version =
71812+ atomic_inc_return(&delim_key_version);
71813+ child->right_version =
71814+ atomic_inc_return(&delim_key_version););
71815+
71816+ return 0;
71817+ }
71818+
71819+ /* load parent node */
71820+ coord_clear_iplug(parent_coord);
71821+ ret = zload(parent_coord->node);
71822+
71823+ if (ret != 0)
71824+ return ret;
71825+
71826+ /* protect `connected' state check by tree_lock */
71827+ read_lock_tree(tree);
71828+
71829+ if (!znode_is_right_connected(child)) {
71830+ read_unlock_tree(tree);
71831+ /* connect right (default is right) */
71832+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
71833+ if (ret)
71834+ goto zrelse_and_ret;
71835+
71836+ read_lock_tree(tree);
71837+ }
71838+
71839+ ret = znode_is_left_connected(child);
71840+
71841+ read_unlock_tree(tree);
71842+
71843+ if (!ret) {
71844+ ret =
71845+ connect_one_side(parent_coord, child,
71846+ GN_NO_ALLOC | GN_GO_LEFT);
71847+ } else
71848+ ret = 0;
71849+
71850+ zrelse_and_ret:
71851+ zrelse(parent_coord->node);
71852+
71853+ return ret;
71854+}
71855+
71856+/* this function is like renew_sibling_link() but allocates the neighbor node
71857+ if it doesn't exist and `connects' it. It may require making two steps in
71858+ the horizontal direction: the first one for neighbor node finding/allocation,
71859+ the second one for finding the neighbor of the neighbor, to connect the
71860+ freshly allocated znode. */
71861+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
71862+static int
71863+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
71864+{
71865+ coord_t local;
71866+ lock_handle empty[2];
71867+ reiser4_tree *tree = znode_get_tree(node);
71868+ znode *neighbor = NULL;
71869+ int nr_locked = 0;
71870+ int ret;
71871+
71872+ assert("umka-250", coord != NULL);
71873+ assert("umka-251", node != NULL);
71874+ assert("umka-307", tree != NULL);
71875+ assert("umka-308", level <= tree->height);
71876+
71877+ /* umka (2002.06.14)
71878+ There should probably be a check here for the validity of the given "level".
71879+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
71880+ */
71881+
71882+ coord_dup(&local, coord);
71883+
71884+ ret =
71885+ renew_sibling_link(&local, &empty[0], node, level,
71886+ flags & ~GN_NO_ALLOC, &nr_locked);
71887+ if (ret)
71888+ goto out;
71889+
71890+ /* the tree lock is not needed here because we keep the parent node(s)
71891+ locked and the reference to the neighbor znode incremented */
71892+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
71893+
71894+ read_lock_tree(tree);
71895+ ret = znode_is_connected(neighbor);
71896+ read_unlock_tree(tree);
71897+ if (ret) {
71898+ ret = 0;
71899+ goto out;
71900+ }
71901+
71902+ ret =
71903+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
71904+ flags | GN_NO_ALLOC, &nr_locked);
71905+ /* second renew_sibling_link() call is used for znode connection only,
71906+ so we can live with these errors */
71907+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
71908+ ret = 0;
71909+
71910+ out:
71911+
71912+ for (--nr_locked; nr_locked >= 0; --nr_locked) {
71913+ zrelse(empty[nr_locked].node);
71914+ longterm_unlock_znode(&empty[nr_locked]);
71915+ }
71916+
71917+ if (neighbor != NULL)
71918+ /* decrement znode reference counter without actually
71919+ releasing it. */
71920+ atomic_dec(&ZJNODE(neighbor)->x_count);
71921+
71922+ return ret;
71923+}
71924+
71925+/*
71926+ reiser4_get_neighbor() -- lock node's neighbor.
71927+
71928+ reiser4_get_neighbor() locks the node's neighbor (left or right one, depending
71929+ on the given parameter) using the sibling link to it. If the sibling link is
71930+ not available (i.e. the neighbor znode is not in cache) and the flags allow
71931+ reading blocks, we go one level up for information about the neighbor's disk
71932+ address. We lock the node's parent; if it is the common parent for both 'node'
71933+ and its neighbor, the neighbor's disk address is in the next (to the left or to
71934+ the right) down link from the link that points to the original node. If not, we
71935+ need to lock the parent's neighbor, read its content and take the first (last)
71936+ downlink with the neighbor's disk address. That locking could be done by using
71937+ the sibling link and the lock_neighbor() function, if the sibling link exists.
71938+ Otherwise we have to go a level up again until we find a common parent or a
71939+ valid sibling link. Then go down, allocating/connecting/locking/reading nodes
71940+ until the neighbor of the first one is locked.
71941+
71942+ @neighbor: result lock handle,
71943+ @node: a node which we lock neighbor of,
71944+ @lock_mode: lock mode {LM_READ, LM_WRITE},
71945+ @flags: logical OR of {GN_*} (see description above) subset.
71946+
71947+ @return: 0 on success; a negative value if the lock was impossible due to an
71948+ error or the lack of a neighbor node.
71949+*/
71950+
71951+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
71952+int
71953+reiser4_get_neighbor(lock_handle * neighbor, znode * node,
71954+ znode_lock_mode lock_mode, int flags)
71955+{
71956+ reiser4_tree *tree = znode_get_tree(node);
71957+ lock_handle path[REAL_MAX_ZTREE_HEIGHT];
71958+
71959+ coord_t coord;
71960+
71961+ tree_level base_level;
71962+ tree_level h = 0;
71963+ int ret;
71964+
71965+ assert("umka-252", tree != NULL);
71966+ assert("umka-253", neighbor != NULL);
71967+ assert("umka-254", node != NULL);
71968+
71969+ base_level = znode_get_level(node);
71970+
71971+ assert("umka-310", base_level <= tree->height);
71972+
71973+ coord_init_zero(&coord);
71974+
71975+ again:
71976+ /* first, we try to use simple lock_neighbor() which requires sibling
71977+ link existence */
71978+ read_lock_tree(tree);
71979+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
71980+ read_unlock_tree(tree);
71981+ if (!ret) {
71982+ /* load znode content if it was specified */
71983+ if (flags & GN_LOAD_NEIGHBOR) {
71984+ ret = zload(node);
71985+ if (ret)
71986+ longterm_unlock_znode(neighbor);
71987+ }
71988+ return ret;
71989+ }
71990+
71991+ /* only -ENOENT means we may look upward and try to connect
71992+ @node with its neighbor (if @flags allow us to do it) */
71993+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
71994+ return ret;
71995+
71996+ /* before establishing the sibling link we lock the parent node;
71997+ renew_neighbor() requires this to work. */
71998+ init_lh(&path[0]);
71999+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72000+ if (ret)
72001+ return ret;
72002+ if (znode_above_root(path[0].node)) {
72003+ longterm_unlock_znode(&path[0]);
72004+ return RETERR(-E_NO_NEIGHBOR);
72005+ }
72006+
72007+ while (1) {
72008+ znode *child = (h == 0) ? node : path[h - 1].node;
72009+ znode *parent = path[h].node;
72010+
72011+ ret = zload(parent);
72012+ if (ret)
72013+ break;
72014+
72015+ ret = find_child_ptr(parent, child, &coord);
72016+
72017+ if (ret) {
72018+ zrelse(parent);
72019+ break;
72020+ }
72021+
72022+ /* try to establish missing sibling link */
72023+ ret = renew_neighbor(&coord, child, h + base_level, flags);
72024+
72025+ zrelse(parent);
72026+
72027+ switch (ret) {
72028+ case 0:
72029+ /* unlocking of parent znode prevents simple
72030+ deadlock situation */
72031+ done_lh(&path[h]);
72032+
72033+ /* depending on the tree level we are at, we repeat the
72034+ first locking attempt ... */
72035+ if (h == 0)
72036+ goto again;
72037+
72038+ /* ... or repeat establishing of sibling link at
72039+ one level below. */
72040+ --h;
72041+ break;
72042+
72043+ case -ENOENT:
72044+ /* sibling link is not available -- we go
72045+ upward. */
72046+ init_lh(&path[h + 1]);
72047+ ret =
72048+ reiser4_get_parent(&path[h + 1], parent,
72049+ ZNODE_READ_LOCK);
72050+ if (ret)
72051+ goto fail;
72052+ ++h;
72053+ if (znode_above_root(path[h].node)) {
72054+ ret = RETERR(-E_NO_NEIGHBOR);
72055+ goto fail;
72056+ }
72057+ break;
72058+
72059+ case -E_DEADLOCK:
72060+ /* there was a lock request from a hi-pri locker.
72061+ If possible, we unlock the last parent node and
72062+ then re-lock it. */
72063+ for (; reiser4_check_deadlock(); h--) {
72064+ done_lh(&path[h]);
72065+ if (h == 0)
72066+ goto fail;
72067+ }
72068+
72069+ break;
72070+
72071+ default: /* other errors. */
72072+ goto fail;
72073+ }
72074+ }
72075+ fail:
72076+ ON_DEBUG(check_lock_node_data(node));
72077+ ON_DEBUG(check_lock_data());
72078+
72079+ /* unlock path */
72080+ do {
72081+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72082+ fail; path[0] is already done_lh-ed, therefore
72083+ longterm_unlock_znode(&path[h]); is not applicable */
72084+ done_lh(&path[h]);
72085+ --h;
72086+ } while (h + 1 != 0);
72087+
72088+ return ret;
72089+}
72090+
72091+/* remove node from sibling list */
72092+/* Audited by: umka (2002.06.14) */
72093+void sibling_list_remove(znode * node)
72094+{
72095+ reiser4_tree *tree;
72096+
72097+ tree = znode_get_tree(node);
72098+ assert("umka-255", node != NULL);
72099+ assert_rw_write_locked(&(tree->tree_lock));
72100+ assert("nikita-3275", check_sibling_list(node));
72101+
72102+ write_lock_dk(tree);
72103+ if (znode_is_right_connected(node) && node->right != NULL &&
72104+ znode_is_left_connected(node) && node->left != NULL) {
72105+ assert("zam-32245",
72106+ keyeq(znode_get_rd_key(node),
72107+ znode_get_ld_key(node->right)));
72108+ znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72109+ }
72110+ write_unlock_dk(tree);
72111+
72112+ if (znode_is_right_connected(node) && node->right != NULL) {
72113+ assert("zam-322", znode_is_left_connected(node->right));
72114+ node->right->left = node->left;
72115+ ON_DEBUG(node->right->left_version =
72116+ atomic_inc_return(&delim_key_version);
72117+ );
72118+ }
72119+ if (znode_is_left_connected(node) && node->left != NULL) {
72120+ assert("zam-323", znode_is_right_connected(node->left));
72121+ node->left->right = node->right;
72122+ ON_DEBUG(node->left->right_version =
72123+ atomic_inc_return(&delim_key_version);
72124+ );
72125+ }
72126+
72127+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72128+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72129+ ON_DEBUG(node->left = node->right = NULL;
72130+ node->left_version = atomic_inc_return(&delim_key_version);
72131+ node->right_version = atomic_inc_return(&delim_key_version););
72132+ assert("nikita-3276", check_sibling_list(node));
72133+}
72134+
72135+/* disconnect node from sibling list */
72136+void sibling_list_drop(znode * node)
72137+{
72138+ znode *right;
72139+ znode *left;
72140+
72141+ assert("nikita-2464", node != NULL);
72142+ assert("nikita-3277", check_sibling_list(node));
72143+
72144+ right = node->right;
72145+ if (right != NULL) {
72146+ assert("nikita-2465", znode_is_left_connected(right));
72147+ right->left = NULL;
72148+ ON_DEBUG(right->left_version =
72149+ atomic_inc_return(&delim_key_version);
72150+ );
72151+ }
72152+ left = node->left;
72153+ if (left != NULL) {
72154+ assert("zam-323", znode_is_right_connected(left));
72155+ left->right = NULL;
72156+ ON_DEBUG(left->right_version =
72157+ atomic_inc_return(&delim_key_version);
72158+ );
72159+ }
72160+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72161+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72162+ ON_DEBUG(node->left = node->right = NULL;
72163+ node->left_version = atomic_inc_return(&delim_key_version);
72164+ node->right_version = atomic_inc_return(&delim_key_version););
72165+}
72166+
72167+/* Insert a new node into the sibling list. Regular balancing inserts the new
72168+ node after (at the right side of) an existing and locked node (@before),
72169+ except in one case: adding a new tree root node. @before should be NULL then. */
72170+void sibling_list_insert_nolock(znode * new, znode * before)
72171+{
72172+ assert("zam-334", new != NULL);
72173+ assert("nikita-3298", !znode_is_left_connected(new));
72174+ assert("nikita-3299", !znode_is_right_connected(new));
72175+ assert("nikita-3300", new->left == NULL);
72176+ assert("nikita-3301", new->right == NULL);
72177+ assert("nikita-3278", check_sibling_list(new));
72178+ assert("nikita-3279", check_sibling_list(before));
72179+
72180+ if (before != NULL) {
72181+ assert("zam-333", znode_is_connected(before));
72182+ new->right = before->right;
72183+ new->left = before;
72184+ ON_DEBUG(new->right_version =
72185+ atomic_inc_return(&delim_key_version);
72186+ new->left_version =
72187+ atomic_inc_return(&delim_key_version););
72188+ if (before->right != NULL) {
72189+ before->right->left = new;
72190+ ON_DEBUG(before->right->left_version =
72191+ atomic_inc_return(&delim_key_version);
72192+ );
72193+ }
72194+ before->right = new;
72195+ ON_DEBUG(before->right_version =
72196+ atomic_inc_return(&delim_key_version);
72197+ );
72198+ } else {
72199+ new->right = NULL;
72200+ new->left = NULL;
72201+ ON_DEBUG(new->right_version =
72202+ atomic_inc_return(&delim_key_version);
72203+ new->left_version =
72204+ atomic_inc_return(&delim_key_version););
72205+ }
72206+ ZF_SET(new, JNODE_LEFT_CONNECTED);
72207+ ZF_SET(new, JNODE_RIGHT_CONNECTED);
72208+ assert("nikita-3280", check_sibling_list(new));
72209+ assert("nikita-3281", check_sibling_list(before));
72210+}
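/* Editor's note: stripped of the CONNECTED bits and debug versioning,
   sibling_list_insert_nolock() above is the textbook doubly-linked-list
   "insert after" operation; a hypothetical minimal form: */
#include <stddef.h>

struct demo_link {
	struct demo_link *left;
	struct demo_link *right;
};

static void demo_insert_after(struct demo_link *new, struct demo_link *before)
{
	new->right = before->right;
	new->left = before;
	if (before->right != NULL)
		before->right->left = new;
	before->right = new;
}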
72211+
72212+/*
72213+ Local variables:
72214+ c-indentation-style: "K&R"
72215+ mode-name: "LC"
72216+ c-basic-offset: 8
72217+ tab-width: 8
72218+ fill-column: 80
72219+ End:
72220+*/
72221diff -urN linux-2.6.22.orig/fs/reiser4/tree_walk.h linux-2.6.22/fs/reiser4/tree_walk.h
72222--- linux-2.6.22.orig/fs/reiser4/tree_walk.h 1970-01-01 03:00:00.000000000 +0300
72223+++ linux-2.6.22/fs/reiser4/tree_walk.h 2007-07-29 00:25:35.032736855 +0400
72224@@ -0,0 +1,125 @@
72225+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
72226+
72227+/* definitions of reiser4 tree walk functions */
72228+
72229+#ifndef __FS_REISER4_TREE_WALK_H__
72230+#define __FS_REISER4_TREE_WALK_H__
72231+
72232+#include "debug.h"
72233+#include "forward.h"
72234+
72235+/* establishes horizontal links between cached znodes */
72236+int connect_znode(coord_t * coord, znode * node);
72237+
72238+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
72239+ have the following common arguments:
72240+
72241+ return codes:
72242+
72243+ @return : 0 - OK,
72244+
72245+ZAM-FIXME-HANS: wrong return code name. Change them all.
72246+ -ENOENT - neighbor is not in cache, which is detected by sibling
72247+ link absence.
72248+
72249+ -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
72250+ found (because we are left-/right- most node of the
72251+ tree, for example). Also, this return code is for
72252+ reiser4_get_parent() when we see no parent link -- it
72253+ means that our node is root node.
72254+
72255+ -E_DEADLOCK - deadlock detected (request from high-priority process
72256+ received); other error codes conform to
72257+ /usr/include/asm/errno.h .
72258+*/
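/* Editor's note: a hypothetical caller-side sketch of handling the return
   codes documented above; "lh" and "node" are illustrative locals and the
   error handling is schematic, not taken from reiser4 itself. */
#if 0
	ret = reiser4_get_parent(&lh, node, ZNODE_READ_LOCK);
	if (ret == -E_NO_NEIGHBOR) {
		/* node is the root: there is no parent to lock */
	} else if (ret == -E_DEADLOCK) {
		/* a high-priority locker needs our locks: release and retry */
	} else if (ret == 0) {
		/* lh now holds a longterm lock on the parent */
	}
#endif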
72259+
72260+int
72261+reiser4_get_parent_flags(lock_handle * result, znode * node,
72262+ znode_lock_mode mode, int flags);
72263+
72264+/* bits definition for reiser4_get_neighbor function `flags' arg. */
72265+typedef enum {
72266+ /* If the sibling pointer is NULL, this flag allows get_neighbor() to try
72267+ * to find a not-allocated, not-connected neighbor by going through upper
72268+ * levels */
72269+ GN_CAN_USE_UPPER_LEVELS = 0x1,
72270+ /* locking left neighbor instead of right one */
72271+ GN_GO_LEFT = 0x2,
72272+ /* automatically load neighbor node content */
72273+ GN_LOAD_NEIGHBOR = 0x4,
72274+ /* return -E_REPEAT if can't lock */
72275+ GN_TRY_LOCK = 0x8,
72276+ /* used internally in tree_walk.c, causes renew_sibling to not
72277+ allocate neighbor znode, but only search for it in znode cache */
72278+ GN_NO_ALLOC = 0x10,
72279+ /* do not go across atom boundaries */
72280+ GN_SAME_ATOM = 0x20,
72281+ /* allow to lock not connected nodes */
72282+ GN_ALLOW_NOT_CONNECTED = 0x40,
72283+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
72284+ GN_ASYNC = 0x80
72285+} znode_get_neigbor_flags;
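/* Editor's note: a hypothetical example of OR-ing the GN_* bits above; it
   mirrors the wrappers below rather than introducing new API. E.g., to
   lock the left neighbor, load its content, and fail rather than block: */
#if 0
	ret = reiser4_get_neighbor(&lh, node, ZNODE_READ_LOCK,
				   GN_GO_LEFT | GN_LOAD_NEIGHBOR | GN_TRY_LOCK);
#endif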
72286+
72287+/* A commonly used wrapper for reiser4_get_parent_flags(). */
72288+static inline int reiser4_get_parent(lock_handle * result, znode * node,
72289+ znode_lock_mode mode)
72290+{
72291+ return reiser4_get_parent_flags(result, node, mode,
72292+ GN_ALLOW_NOT_CONNECTED);
72293+}
72294+
72295+int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72296+ znode_lock_mode lock_mode, int flags);
72297+
72298+/* there are wrappers for the most common usages of reiser4_get_neighbor() */
72299+static inline int
72300+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
72301+ int flags)
72302+{
72303+ return reiser4_get_neighbor(result, node, lock_mode,
72304+ flags | GN_GO_LEFT);
72305+}
72306+
72307+static inline int
72308+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
72309+ int flags)
72310+{
72311+ ON_DEBUG(check_lock_node_data(node));
72312+ ON_DEBUG(check_lock_data());
72313+ return reiser4_get_neighbor(result, node, lock_mode,
72314+ flags & (~GN_GO_LEFT));
72315+}
72316+
72317+extern void sibling_list_remove(znode * node);
72318+extern void sibling_list_drop(znode * node);
72319+extern void sibling_list_insert_nolock(znode * new, znode * before);
72320+extern void link_left_and_right(znode * left, znode * right);
72321+
72322+/* Functions called by tree_walk() when tree_walk() ... */
72323+struct tree_walk_actor {
72324+ /* ... meets a formatted node, */
72325+ int (*process_znode) (tap_t *, void *);
72326+ /* ... meets an extent, */
72327+ int (*process_extent) (tap_t *, void *);
72328+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
72329+ * node or extent processing functions. */
72330+ int (*before) (void *);
72331+};
72332+
72333+#if REISER4_DEBUG
72334+int check_sibling_list(znode * node);
72335+#else
72336+#define check_sibling_list(n) (1)
72337+#endif
72338+
72339+#endif /* __FS_REISER4_TREE_WALK_H__ */
72340+
72341+/*
72342+ Local variables:
72343+ c-indentation-style: "K&R"
72344+ mode-name: "LC"
72345+ c-basic-offset: 8
72346+ tab-width: 8
72347+ fill-column: 120
72348+ End:
72349+*/
72350diff -urN linux-2.6.22.orig/fs/reiser4/txnmgr.c linux-2.6.22/fs/reiser4/txnmgr.c
72351--- linux-2.6.22.orig/fs/reiser4/txnmgr.c 1970-01-01 03:00:00.000000000 +0300
72352+++ linux-2.6.22/fs/reiser4/txnmgr.c 2007-07-29 00:25:35.040738926 +0400
72353@@ -0,0 +1,3164 @@
72354+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72355+ * reiser4/README */
72356+
72357+/* Joshua MacDonald wrote the first draft of this code. */
72358+
72359+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
72360+filesystem scales only as well as its worst locking design. You need to
72361+substantially restructure this code. Josh was not as experienced a programmer
72362+as you. Particularly review how the locking style differs from what you did
72363+for znodes usingt hi-lo priority locking, and present to me an opinion on
72364+whether the differences are well founded. */
72365+
72366+/* I cannot help but to disagree with the sentiment above. Locking of
72367+ * transaction manager is _not_ badly designed, and, at the very least, is not
72368+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
72369+ * locking on znodes, especially on the root node of the tree. --nikita,
72370+ * 2003.10.13 */
72371+
72372+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
72373+ txnmgr processes capture_block requests and manages the relationship between jnodes and
72374+ atoms through the various stages of a transcrash, and it also oversees the fusion and
72375+ capture-on-copy processes. The main difficulty with this task is maintaining a
72376+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
72377+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
72378+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you
72379+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
72380+ that any time you check the atom-pointer of a jnode or handle and then try to lock that
72381+ atom, you must use trylock() and possibly reverse the order.
72382+
72383+ This code implements the design documented at:
72384+
72385+ http://namesys.com/txn-doc.html
72386+
72387+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
72388+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
72389+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
72390+year old --- define all technical terms used.
72391+
72392+*/
72393+
72394+/* Thoughts on the external transaction interface:
72395+
72396+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
72397+ creates state that lasts for the duration of a system call and is called at the start
72398+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
72399+ occupying the scope of a single system call. We wish to give certain applications an
72400+ interface to begin and close (commit) transactions. Since our implementation of
72401+ transactions does not yet support isolation, allowing an application to open a
72402+ transaction implies trusting it to later close the transaction. Part of the
72403+ transaction interface will be aimed at enabling that trust, but the interface for
72404+ actually using transactions is fairly narrow.
72405+
72406+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
72407+ this identifier into a string that a shell-script could use, allowing you to start a
72408+ transaction by issuing a command. Once open, the transcrash should be set in the task
72409+ structure, and there should be options (I suppose) to allow it to be carried across
72410+ fork/exec. A transcrash has several options:
72411+
72412+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
72413+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
72414+ capture on reads as well, it should set READ_FUSING.
72415+
72416+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
72417+ eventually close (or else the machine must crash). If the application dies an
72418+ unexpected death with an open transcrash, for example, or if it hangs for a long
72419+ duration, one solution (to avoid crashing the machine) is to simply close it anyway.
72420+ This is a dangerous option, but it is one way to solve the problem until isolated
72421+ transcrashes are available for untrusted applications.
72422+
72423+ It seems to be what databases do, though it is unclear how one avoids a DoS attack
72424+ creating a vulnerability based on resource starvation. Guaranteeing that some
72425+ minimum amount of computational resources are made available would seem more correct
72426+ than guaranteeing some amount of time. When we again have someone to code the work,
72427+ this issue should be considered carefully. -Hans
72428+
72429+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
72430+ many dirty blocks it expects. The reserve_blocks interface should be called at a point
72431+ where it is safe for the application to fail, because the system may not be able to
72432+ grant the allocation and the application must be able to back-out. For this reason,
72433+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
72434+ the application may also wish to extend the allocation after beginning its transcrash.
72435+
72436+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
72437+ modifications that require transaction protection. When isolated transactions are
72438+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
72439+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling
72440+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
72441+ why, for safety, the application should call RESERVE_BLOCKS before making any changes).
72442+
72443+ For actually implementing these out-of-system-call-scoped transcrashes, the
72444+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open
72445+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
72446+ "struct kmem_cache *_txnh_slab" created for that purpose in this file.
72447+*/
72448+
72449+/* Extending the other system call interfaces for future transaction features:
72450+
72451+ Specialized applications may benefit from passing flags to the ordinary system call
72452+ interface such as read(), write(), or stat(). For example, the application specifies
72453+ WRITE_FUSING by default but wishes to add that a certain read() command should be
72454+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
72455+ read, or the file-data read? These issues are straightforward, but there are a lot of
72456+ them and adding the necessary flags-passing code will be tedious.
72457+
72458+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
72459+ flag, which specifies that although it is a read operation being requested, a
72460+ write-lock should be taken. The reason is that read-locks are shared while write-locks
72461+ are exclusive, so taking a read-lock when a later write is known in advance will often
72462+ lead to deadlock. If a reader knows it will write later, it should issue read
72463+ requests with the RMW flag set.
72464+*/
72465+
72466+/*
72467+ The znode/atom deadlock avoidance.
72468+
72469+ FIXME(Zam): writing of this comment is in progress.
72470+
72471+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's
72472+ long-term locking, which makes the reiser4 locking scheme more complex. It
72473+ had deadlocks until we implemented deadlock avoidance algorithms. Those
72474+ deadlocks looked like the following: one stopped thread waits for a long-term
72475+ lock on a znode, while the thread who owns that lock waits until fusion with
72476+ another atom is allowed.
72477+
72478+ The source of the deadlocks is an optimization of not capturing index nodes
72479+ for read. Let's prove it. Suppose we have a dumb node-capturing scheme which
72480+ unconditionally captures each block before locking it.
72481+
72482+ That scheme has no deadlocks. Let's begin with the thread whose stage is
72483+ ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait
72484+ for a capture because its stage allows fusion with any atom except those
72485+ currently being committed. A process of atom commit can't deadlock because
72486+ the atom commit procedure does not acquire locks and does not fuse with other
72487+ atoms. Reiser4 does capturing right before going to sleep inside the
72488+ longterm_lock_znode() function; this means the znode which we want to lock is
72489+ already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage. If we
72490+ continue the analysis we understand that no process in the sequence may
72491+ wait for atom fusion. Thereby there are no deadlocks of the described kind.
72492+
72493+ The capturing optimization makes the deadlocks possible. A thread can wait
72494+ for a lock whose owner did not capture that node. The lock owner's current
72495+ atom is not fused with the first atom and does not get the
72496+ ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
72497+ another one which is already in ASTAGE_CAPTURE_WAIT.
72498+
72499+ The deadlock avoidance scheme includes two algorithms:
72500+
72501+ The first algorithm is used when a thread captures a node which is locked but
72502+ not captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at
72503+ the moment we skip their capturing. If such a node (marked MISSED_IN_CAPTURE)
72504+ is being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT,
72505+ the routine which forces all lock owners to join the current atom is executed.
72506+
72507+ The second algorithm forbids skipping the capture of already captured nodes.
72508+
72509+ Both algorithms together prevent waiting for a longterm lock without atom
72510+ fusion with the atoms of all lock owners, which is the key ingredient of the
72511+ atom/znode locking deadlocks.
72512+*/
72513+
72514+/*
72515+ * Transactions and mmap(2).
72516+ *
72517+ * 1. Transactions are not supported for accesses through mmap(2), because
72518+ * this would effectively amount to user-level transactions whose duration
72519+ * is beyond control of the kernel.
72520+ *
72521+ * 2. That said, we still want to preserve some decency with regard to
72522+ * mmap(2). During normal write(2) call, following sequence of events
72523+ * happens:
72524+ *
72525+ * 1. page is created;
72526+ *
72527+ * 2. jnode is created, dirtied and captured into current atom.
72528+ *
72529+ * 3. extent is inserted and modified.
72530+ *
72531+ * Steps (2) and (3) take place under long term lock on the twig node.
72532+ *
72533+ * When file is accessed through mmap(2) page is always created during
72534+ * page fault.
72535+ * After this (in reiser4_readpage()->reiser4_readpage_extent()):
72536+ *
72537+ * 1. if access is made to non-hole page new jnode is created, (if
72538+ * necessary)
72539+ *
72540+ * 2. if access is made to the hole page, jnode is not created (XXX
72541+ * not clear why).
72542+ *
72543+ * Also, even if page is created by write page fault it is not marked
72544+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races
72545+ * with page write-out.
72546+ *
72547+ * Dirty bit installed by hardware is only transferred to the struct page
72548+ * later, when page is unmapped (in zap_pte_range(), or
72549+ * try_to_unmap_one()).
72550+ *
72551+ * So, with mmap(2) we have to handle the following irksome situations:
72552+ *
72553+ * 1. there exists modified page (clean or dirty) without jnode
72554+ *
72555+ * 2. there exists modified page (clean or dirty) with clean jnode
72556+ *
72557+ * 3. clean page which is a part of atom can be transparently modified
72558+ * at any moment through mapping without becoming dirty.
72559+ *
72560+ * (1) and (2) can lead to the out-of-memory situation: ->writepage()
72561+ * doesn't know what to do with such pages and ->sync_sb()/->writepages()
72562+ * don't see them, because these methods operate on atoms.
72563+ *
72564+ * (3) can lead to the loss of data: suppose we have dirty page with dirty
72565+ * captured jnode captured by some atom. As part of early flush (for
72566+ * example) page was written out. Dirty bit was cleared on both page and
72567+ * jnode. After this page is modified through mapping, but kernel doesn't
72568+ * notice and just discards page and jnode as part of commit. (XXX
72569+ * actually it doesn't, because to reclaim page ->releasepage() has to be
72570+ * called and before this dirty bit will be transferred to the struct
72571+ * page).
72572+ *
72573+ */
72574+
72575+#include "debug.h"
72576+#include "txnmgr.h"
72577+#include "jnode.h"
72578+#include "znode.h"
72579+#include "block_alloc.h"
72580+#include "tree.h"
72581+#include "wander.h"
72582+#include "ktxnmgrd.h"
72583+#include "super.h"
72584+#include "page_cache.h"
72585+#include "reiser4.h"
72586+#include "vfs_ops.h"
72587+#include "inode.h"
72588+#include "flush.h"
72589+
72590+#include <asm/atomic.h>
72591+#include <linux/types.h>
72592+#include <linux/fs.h>
72593+#include <linux/mm.h>
72594+#include <linux/slab.h>
72595+#include <linux/pagemap.h>
72596+#include <linux/writeback.h>
72597+#include <linux/swap.h> /* for totalram_pages */
72598+
72599+static void atom_free(txn_atom * atom);
72600+
72601+static int commit_txnh(txn_handle * txnh);
72602+
72603+static void wakeup_atom_waitfor_list(txn_atom * atom);
72604+static void wakeup_atom_waiting_list(txn_atom * atom);
72605+
72606+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
72607+
72608+static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
72609+
72610+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
72611+
72612+static int capture_init_fusion(jnode * node, txn_handle * txnh,
72613+ txn_capture mode);
72614+
72615+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
72616+
72617+static void capture_fuse_into(txn_atom * small, txn_atom * large);
72618+
72619+void reiser4_invalidate_list(struct list_head *);
72620+
72621+/* GENERIC STRUCTURES */
72622+
72623+typedef struct _txn_wait_links txn_wait_links;
72624+
72625+struct _txn_wait_links {
72626+ lock_stack *_lock_stack;
72627+ struct list_head _fwaitfor_link;
72628+ struct list_head _fwaiting_link;
72629+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72630+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
72631+};
72632+
72633+/* FIXME: In theory, we should be using the slab cache init & destructor
72634+ methods instead of, e.g., jnode_init, etc. */
72635+static struct kmem_cache *_atom_slab = NULL;
72636+/* this is for user-visible, cross system-call transactions. */
72637+static struct kmem_cache *_txnh_slab = NULL;
72638+
72639+/**
72640+ * init_txnmgr_static - create transaction manager slab caches
72641+ *
72642+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
72643+ * initialization.
72644+ */
72645+int init_txnmgr_static(void)
72646+{
72647+ assert("jmacd-600", _atom_slab == NULL);
72648+ assert("jmacd-601", _txnh_slab == NULL);
72649+
72650+ ON_DEBUG(atomic_set(&flush_cnt, 0));
72651+
72652+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
72653+ SLAB_HWCACHE_ALIGN |
72654+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
72655+ if (_atom_slab == NULL)
72656+ return RETERR(-ENOMEM);
72657+
72658+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
72659+ SLAB_HWCACHE_ALIGN, NULL, NULL);
72660+ if (_txnh_slab == NULL) {
72661+ kmem_cache_destroy(_atom_slab);
72662+ _atom_slab = NULL;
72663+ return RETERR(-ENOMEM);
72664+ }
72665+
72666+ return 0;
72667+}
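/* Editor's note: a self-contained user-space model of the unwinding in
   init_txnmgr_static() above -- when creating the second cache fails, the
   first one must be destroyed before reporting failure. malloc()/free()
   stand in for kmem_cache_create()/kmem_cache_destroy(). */
#include <stdlib.h>

static void *demo_cache_a, *demo_cache_b;

static int demo_init(void)
{
	demo_cache_a = malloc(64);
	if (demo_cache_a == NULL)
		return -1;

	demo_cache_b = malloc(64);
	if (demo_cache_b == NULL) {
		free(demo_cache_a);	/* unwind the earlier creation */
		demo_cache_a = NULL;
		return -1;
	}
	return 0;
}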
72668+
72669+/**
72670+ * done_txnmgr_static - delete txn_atom and txn_handle caches
72671+ *
72672+ * This is called on reiser4 module unloading or system shutdown.
72673+ */
72674+void done_txnmgr_static(void)
72675+{
72676+ destroy_reiser4_cache(&_atom_slab);
72677+ destroy_reiser4_cache(&_txnh_slab);
72678+}
72679+
72680+/**
72681+ * init_txnmgr - initialize a new transaction manager
72682+ * @mgr: pointer to transaction manager embedded in reiser4 super block
72683+ *
72684+ * This is called on mount. Makes necessary initializations.
72685+ */
72686+void reiser4_init_txnmgr(txn_mgr *mgr)
72687+{
72688+ assert("umka-169", mgr != NULL);
72689+
72690+ mgr->atom_count = 0;
72691+ mgr->id_count = 1;
72692+ INIT_LIST_HEAD(&mgr->atoms_list);
72693+ spin_lock_init(&mgr->tmgr_lock);
72694+ mutex_init(&mgr->commit_mutex);
72695+}
72696+
72697+/**
72698+ * reiser4_done_txnmgr - stop transaction manager
72699+ * @mgr: pointer to transaction manager embedded in reiser4 super block
72700+ *
72701+ * This is called on umount. Does sanity checks.
72702+ */
72703+void reiser4_done_txnmgr(txn_mgr *mgr)
72704+{
72705+ assert("umka-170", mgr != NULL);
72706+ assert("umka-1701", list_empty_careful(&mgr->atoms_list));
72707+ assert("umka-1702", mgr->atom_count == 0);
72708+}
72709+
72710+/* Initialize a transaction handle. */
72711+/* Audited by: umka (2002.06.13) */
72712+static void txnh_init(txn_handle * txnh, txn_mode mode)
72713+{
72714+ assert("umka-171", txnh != NULL);
72715+
72716+ txnh->mode = mode;
72717+ txnh->atom = NULL;
72718+ reiser4_ctx_gfp_mask_set();
44254afd
MT
72719+ txnh->flags = 0;
72720+ spin_lock_init(&txnh->hlock);
72721+ INIT_LIST_HEAD(&txnh->txnh_link);
72722+}
72723+
72724+#if REISER4_DEBUG
72725+/* Check if a transaction handle is clean. */
72726+static int txnh_isclean(txn_handle * txnh)
72727+{
72728+ assert("umka-172", txnh != NULL);
72729+ return txnh->atom == NULL &&
72730+ LOCK_CNT_NIL(spin_locked_txnh);
72731+}
72732+#endif
72733+
72734+/* Initialize an atom. */
72735+static void atom_init(txn_atom * atom)
72736+{
72737+ int level;
72738+
72739+ assert("umka-173", atom != NULL);
72740+
72741+ memset(atom, 0, sizeof(txn_atom));
72742+
72743+ atom->stage = ASTAGE_FREE;
72744+ atom->start_time = jiffies;
72745+
72746+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
72747+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
72748+
72749+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
72750+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
72751+ INIT_LIST_HEAD(ATOM_WB_LIST(atom));
72752+ INIT_LIST_HEAD(&atom->inodes);
72753+ spin_lock_init(&(atom->alock));
44254afd
MT
72754+ /* list of transaction handles */
72755+ INIT_LIST_HEAD(&atom->txnh_list);
72756+ /* link to transaction manager's list of atoms */
72757+ INIT_LIST_HEAD(&atom->atom_link);
72758+ INIT_LIST_HEAD(&atom->fwaitfor_list);
72759+ INIT_LIST_HEAD(&atom->fwaiting_list);
72760+ blocknr_set_init(&atom->delete_set);
72761+ blocknr_set_init(&atom->wandered_map);
72762+
72763+ init_atom_fq_parts(atom);
72764+}
72765+
72766+#if REISER4_DEBUG
72767+/* Check if an atom is clean. */
72768+static int atom_isclean(txn_atom * atom)
72769+{
72770+ int level;
72771+
72772+ assert("umka-174", atom != NULL);
72773+
72774+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
72775+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
72776+ return 0;
72777+ }
72778+ }
72779+
72780+ return atom->stage == ASTAGE_FREE &&
72781+ atom->txnh_count == 0 &&
72782+ atom->capture_count == 0 &&
72783+ atomic_read(&atom->refcount) == 0 &&
72784+ (&atom->atom_link == atom->atom_link.next &&
72785+ &atom->atom_link == atom->atom_link.prev) &&
72786+ list_empty_careful(&atom->txnh_list) &&
72787+ list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
72788+ list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
72789+ list_empty_careful(ATOM_WB_LIST(atom)) &&
72790+ list_empty_careful(&atom->fwaitfor_list) &&
72791+ list_empty_careful(&atom->fwaiting_list) &&
72792+ atom_fq_parts_are_clean(atom);
72793+}
72794+#endif
72795+
72796+/* Begin a transaction in this context. Currently this uses the reiser4_context's
72797+ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
72798+ this will be extended to allow transaction handles to span several contexts. */
72799+/* Audited by: umka (2002.06.13) */
72800+void reiser4_txn_begin(reiser4_context * context)
72801+{
72802+ assert("jmacd-544", context->trans == NULL);
72803+
72804+ context->trans = &context->trans_in_ctx;
72805+
72806+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
72807+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
72808+ stack allocated right now, but we would like to allow for dynamically allocated
72809+ transcrashes that span multiple system calls.
72810+ */
72811+ txnh_init(context->trans, TXN_WRITE_FUSING);
72812+}
72813+
72814+/* Finish a transaction handle context. */
72815+int reiser4_txn_end(reiser4_context * context)
72816+{
72817+ long ret = 0;
72818+ txn_handle *txnh;
72819+
72820+ assert("umka-283", context != NULL);
72821+ assert("nikita-3012", reiser4_schedulable());
44254afd
MT
72822+ assert("vs-24", context == get_current_context());
72823+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
72824+
72825+ txnh = context->trans;
72826+ if (txnh != NULL) {
72827+ if (txnh->atom != NULL)
72828+ ret = commit_txnh(txnh);
72829+ assert("jmacd-633", txnh_isclean(txnh));
72830+ context->trans = NULL;
72831+ }
72832+ return ret;
72833+}
72834+
72835+void reiser4_txn_restart(reiser4_context * context)
72836+{
72837+ reiser4_txn_end(context);
72838+ reiser4_preempt_point();
72839+ reiser4_txn_begin(context);
72840+}
72841+
72842+void reiser4_txn_restart_current(void)
72843+{
72844+ reiser4_txn_restart(get_current_context());
72845+}
72846+
72847+/* TXN_ATOM */
72848+
72849+/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom
72850+ is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May
72851+ return NULL. */
72852+static txn_atom *txnh_get_atom(txn_handle * txnh)
72853+{
72854+ txn_atom *atom;
72855+
72856+ assert("umka-180", txnh != NULL);
72857+ assert_spin_not_locked(&(txnh->hlock));
72858+
72859+ while (1) {
72860+ spin_lock_txnh(txnh);
72861+ atom = txnh->atom;
72862+
72863+ if (atom == NULL)
72864+ break;
72865+
72866+ if (spin_trylock_atom(atom))
72867+ break;
72868+
72869+ atomic_inc(&atom->refcount);
72870+
72871+ spin_unlock_txnh(txnh);
72872+ spin_lock_atom(atom);
72873+ spin_lock_txnh(txnh);
72874+
72875+ if (txnh->atom == atom) {
72876+ atomic_dec(&atom->refcount);
72877+ break;
72878+ }
72879+
72880+ spin_unlock_txnh(txnh);
72881+ atom_dec_and_unlock(atom);
72882+ }
72883+
72884+ return atom;
72885+}
72886+
72887+/* Get the current atom and spinlock it if current atom present. May return NULL */
72888+txn_atom *get_current_atom_locked_nocheck(void)
72889+{
72890+ reiser4_context *cx;
72891+ txn_atom *atom;
72892+ txn_handle *txnh;
72893+
72894+ cx = get_current_context();
72895+ assert("zam-437", cx != NULL);
72896+
72897+ txnh = cx->trans;
72898+ assert("zam-435", txnh != NULL);
72899+
72900+ atom = txnh_get_atom(txnh);
72901+
72902+ spin_unlock_txnh(txnh);
72903+ return atom;
72904+}
72905+
72906+/* Get the atom belonging to a jnode, which is initially locked. Return with
72907+ both jnode and atom locked. This performs the necessary spin_trylock to
72908+ break the lock-ordering cycle. Assumes the jnode is already locked, and
72909+ returns NULL if atom is not set. */
72910+txn_atom *jnode_get_atom(jnode * node)
72911+{
72912+ txn_atom *atom;
72913+
72914+ assert("umka-181", node != NULL);
72915+
72916+ while (1) {
72917+ assert_spin_locked(&(node->guard));
72918+
72919+ atom = node->atom;
72920+ /* node is not in any atom */
72921+ if (atom == NULL)
72922+ break;
72923+
72924+ /* If atom is not locked, grab the lock and return */
72925+ if (spin_trylock_atom(atom))
72926+ break;
72927+
72928+ /* At least one jnode belongs to this atom it guarantees that
72929+ * atom->refcount > 0, we can safely increment refcount. */
72930+ atomic_inc(&atom->refcount);
72931+ spin_unlock_jnode(node);
72932+
72933+ /* re-acquire spin locks in the right order */
72934+ spin_lock_atom(atom);
72935+ spin_lock_jnode(node);
72936+
72937+ /* check if node still points to the same atom. */
72938+ if (node->atom == atom) {
72939+ atomic_dec(&atom->refcount);
72940+ break;
72941+ }
72942+
72943+ /* releasing of atom lock and reference requires not holding
72944+ * locks on jnodes. */
72945+ spin_unlock_jnode(node);
72946+
72947+ /* We are not sure that this atom has extra references except our
72948+ * own, so we should call the proper function, which may free the
72949+ * atom if the last reference is released. */
72950+ atom_dec_and_unlock(atom);
72951+
72952+ /* lock jnode again for getting valid node->atom pointer
72953+ * value. */
72954+ spin_lock_jnode(node);
72955+ }
72956+
72957+ return atom;
72958+}
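/* Editor's note: a user-space (pthreads) sketch of the trylock-and-retry
   pattern that txnh_get_atom() and jnode_get_atom() above use to take a
   second lock whose canonical order comes first. Lifetime management
   (the atom refcounting) is deliberately omitted; names are illustrative. */
#include <pthread.h>
#include <stddef.h>

struct demo_obj {
	pthread_mutex_t lock;
	struct demo_obj *peer;	/* like jnode->atom */
};

/* Call with a->lock held; returns with a->lock held and, if a peer
   exists, with peer->lock held as well. */
static struct demo_obj *demo_lock_peer(struct demo_obj *a)
{
	struct demo_obj *p;

	for (;;) {
		p = a->peer;
		if (p == NULL || pthread_mutex_trylock(&p->lock) == 0)
			return p;	/* fast path: no ordering violation */

		/* slow path: drop a, take both in canonical order */
		pthread_mutex_unlock(&a->lock);
		pthread_mutex_lock(&p->lock);
		pthread_mutex_lock(&a->lock);

		if (a->peer == p)
			return p;	/* link unchanged while we slept */

		pthread_mutex_unlock(&p->lock);	/* link moved: retry */
	}
}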
72959+
72960+/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
72961+ by flush code to indicate whether the next node (in some direction) is suitable for
72962+ flushing. */
72963+int
72964+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
72965+{
72966+ int compat;
72967+ txn_atom *atom;
72968+
72969+ assert("umka-182", node != NULL);
72970+ assert("umka-183", check != NULL);
72971+
72972+ /* Not sure what this function is supposed to do if supplied with @check that is
72973+ neither formatted nor unformatted (bitmap or so). */
72974+ assert("nikita-2373", jnode_is_znode(check)
72975+ || jnode_is_unformatted(check));
72976+
72977+ /* Need a lock on CHECK to get its atom and to check various state bits.
72978+ Don't need a lock on NODE once we get the atom lock. */
72979+ /* It is not enough to lock two nodes and check (node->atom ==
72980+ check->atom) because atom could be locked and being fused at that
72981+ moment, jnodes of the atom of that state (being fused) can point to
72982+ different objects, but the atom is the same. */
72983+ spin_lock_jnode(check);
72984+
72985+ atom = jnode_get_atom(check);
72986+
72987+ if (atom == NULL) {
72988+ compat = 0;
72989+ } else {
72990+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
72991+
72992+ if (compat && jnode_is_znode(check)) {
72993+ compat &= znode_is_connected(JZNODE(check));
72994+ }
72995+
72996+ if (compat && alloc_check) {
72997+ compat &= (alloc_value == jnode_is_flushprepped(check));
72998+ }
72999+
73000+ spin_unlock_atom(atom);
73001+ }
73002+
73003+ spin_unlock_jnode(check);
73004+
73005+ return compat;
73006+}
73007+
73008+/* Decrement the atom's reference count and if it falls to zero, free it. */
73009+void atom_dec_and_unlock(txn_atom * atom)
73010+{
73011+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73012+
73013+ assert("umka-186", atom != NULL);
73014+ assert_spin_locked(&(atom->alock));
73015+ assert("zam-1039", atomic_read(&atom->refcount) > 0);
73016+
73017+ if (atomic_dec_and_test(&atom->refcount)) {
73018+ /* take txnmgr lock and atom lock in proper order. */
73019+ if (!spin_trylock_txnmgr(mgr)) {
73020+ /* This atom should exist after we re-acquire its
73021+ * spinlock, so we increment its reference counter. */
73022+ atomic_inc(&atom->refcount);
73023+ spin_unlock_atom(atom);
73024+ spin_lock_txnmgr(mgr);
73025+ spin_lock_atom(atom);
73026+
73027+ if (!atomic_dec_and_test(&atom->refcount)) {
73028+ spin_unlock_atom(atom);
73029+ spin_unlock_txnmgr(mgr);
73030+ return;
73031+ }
73032+ }
73033+ assert_spin_locked(&(mgr->tmgr_lock));
73034+ atom_free(atom);
73035+ spin_unlock_txnmgr(mgr);
73036+ } else
73037+ spin_unlock_atom(atom);
73038+}
73039+
73040+/* Create new atom and connect it to given transaction handle. This adds the
73041+ atom to the transaction manager's list and sets its reference count to 1, an
73042+ artificial reference which is kept until it commits. We play strange games
73043+ to avoid allocation under jnode & txnh spinlocks.*/
73044+
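+/* Note that even on success this function returns -E_REPEAT, so that
+ * try_capture_block() retries with the freshly created atom already
+ * assigned to the transaction handle. */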
73045+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73046+{
73047+ txn_atom *atom;
73048+ txn_mgr *mgr;
73049+
73050+ if (REISER4_DEBUG && rofs_tree(current_tree)) {
73051+ warning("nikita-3366", "Creating atom on rofs");
73052+ dump_stack();
73053+ }
73054+
73055+ if (*atom_alloc == NULL) {
73056+ (*atom_alloc) = kmem_cache_alloc(_atom_slab,
73057+ reiser4_ctx_gfp_mask_get());
73058+
73059+ if (*atom_alloc == NULL)
73060+ return RETERR(-ENOMEM);
73061+ }
73062+
73063+ /* and, also, txnmgr spin lock should be taken before jnode and txnh
73064+ locks. */
73065+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73066+ spin_lock_txnmgr(mgr);
73067+ spin_lock_txnh(txnh);
73068+
73069+ /* Check whether new atom still needed */
73070+ if (txnh->atom != NULL) {
73071+ /* NOTE-NIKITA probably it is rather better to free
73072+ * atom_alloc here than thread it up to reiser4_try_capture() */
73073+
73074+ spin_unlock_txnh(txnh);
73075+ spin_unlock_txnmgr(mgr);
73076+
73077+ return -E_REPEAT;
73078+ }
73079+
73080+ atom = *atom_alloc;
73081+ *atom_alloc = NULL;
73082+
73083+ atom_init(atom);
73084+
73085+ assert("jmacd-17", atom_isclean(atom));
73086+
73087+ /*
73088+ * lock ordering is broken here. It is ok, as long as @atom is new
73089+ * and inaccessible for others. We can't use spin_lock_atom or
73090+ * spin_lock(&atom->alock) because they care about locking
73091+ * dependencies. spin_trylock_lock doesn't.
73092+ */
73093+ check_me("", spin_trylock_atom(atom));
73094+
73095+ /* add atom to the end of transaction manager's list of atoms */
73096+ list_add_tail(&atom->atom_link, &mgr->atoms_list);
73097+ atom->atom_id = mgr->id_count++;
73098+ mgr->atom_count += 1;
73099+
73100+ /* Release txnmgr lock */
73101+ spin_unlock_txnmgr(mgr);
73102+
73103+ /* One reference until it commits. */
73104+ atomic_inc(&atom->refcount);
73105+ atom->stage = ASTAGE_CAPTURE_FUSE;
73106+ atom->super = reiser4_get_current_sb();
73107+ capture_assign_txnh_nolock(atom, txnh);
73108+
73109+ spin_unlock_atom(atom);
73110+ spin_unlock_txnh(txnh);
73111+
73112+ return -E_REPEAT;
73113+}
73114+
73115+/* Return true if an atom is currently "open". */
73116+static int atom_isopen(const txn_atom * atom)
73117+{
73118+ assert("umka-185", atom != NULL);
73119+
73120+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73121+}
73122+
73123+/* Return the number of pointers to this atom that must be updated during fusion. This
73124+ approximates the amount of work to be done. Fusion chooses the atom with fewer
73125+ pointers to fuse into the atom with more pointers. */
73126+static int atom_pointer_count(const txn_atom * atom)
73127+{
73128+ assert("umka-187", atom != NULL);
73129+
73130+ /* This is a measure of the amount of work needed to fuse this atom
73131+ * into another. */
73132+ return atom->txnh_count + atom->capture_count;
73133+}
73134+
73135+/* Called holding the atom lock, this removes the atom from the transaction manager list
73136+ and frees it. */
73137+static void atom_free(txn_atom * atom)
73138+{
73139+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73140+
73141+ assert("umka-188", atom != NULL);
73142+ assert_spin_locked(&(atom->alock));
73143+
73144+ /* Remove from the txn_mgr's atom list */
73145+ assert_spin_locked(&(mgr->tmgr_lock));
73146+ mgr->atom_count -= 1;
73147+ list_del_init(&atom->atom_link);
73148+
73149+ /* Clean the atom */
73150+ assert("jmacd-16",
73151+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73152+ atom->stage = ASTAGE_FREE;
73153+
73154+ blocknr_set_destroy(&atom->delete_set);
73155+ blocknr_set_destroy(&atom->wandered_map);
73156+
73157+ assert("jmacd-16", atom_isclean(atom));
73158+
73159+ spin_unlock_atom(atom);
73160+
73161+ kmem_cache_free(_atom_slab, atom);
73162+}
73163+
73164+static int atom_is_dotard(const txn_atom * atom)
73165+{
73166+ return time_after(jiffies, atom->start_time +
73167+ get_current_super_private()->tmgr.atom_max_age);
73168+}
73169+
73170+static int atom_can_be_committed(txn_atom * atom)
73171+{
73172+ assert_spin_locked(&(atom->alock));
73173+ assert("zam-885", atom->txnh_count > atom->nr_waiters);
73174+ return atom->txnh_count == atom->nr_waiters + 1;
73175+}
73176+
73177+/* Return true if an atom should commit now. This is determined by aging, atom
73178+ size or atom flags. */
73179+static int atom_should_commit(const txn_atom * atom)
73180+{
73181+ assert("umka-189", atom != NULL);
73182+ return
73183+ (atom->flags & ATOM_FORCE_COMMIT) ||
73184+ ((unsigned)atom_pointer_count(atom) >
73185+ get_current_super_private()->tmgr.atom_max_size)
73186+ || atom_is_dotard(atom);
73187+}
73188+
73189+/* return 1 if current atom exists and requires commit. */
73190+int current_atom_should_commit(void)
73191+{
73192+ txn_atom *atom;
73193+ int result = 0;
73194+
73195+ atom = get_current_atom_locked_nocheck();
73196+ if (atom) {
73197+ result = atom_should_commit(atom);
73198+ spin_unlock_atom(atom);
73199+ }
73200+ return result;
73201+}
73202+
73203+static int atom_should_commit_asap(const txn_atom * atom)
73204+{
73205+ unsigned int captured;
73206+ unsigned int pinnedpages;
73207+
73208+ assert("nikita-3309", atom != NULL);
73209+
73210+ captured = (unsigned)atom->capture_count;
73211+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73212+
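+ /* commit early when the estimated memory pinned by captured nodes
+ * exceeds 1/8 of total RAM, or when atom->flushed exceeds 100 */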
73213+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
73214+}
73215+
73216+static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
73217+{
73218+ jnode *first_dirty;
73219+
73220+ list_for_each_entry(first_dirty, head, capture_link) {
73221+ if (!(flags & JNODE_FLUSH_COMMIT)) {
73222+ /*
73223+ * skip jnodes which "heard banshee" or have active
73224+ * I/O
73225+ */
73226+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
73227+ JF_ISSET(first_dirty, JNODE_WRITEBACK))
73228+ continue;
73229+ }
73230+ return first_dirty;
73231+ }
73232+ return NULL;
73233+}
73234+
73235+/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty
73236+ nodes on atom's lists */
73237+jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
73238+{
73239+ jnode *first_dirty;
73240+ tree_level level;
73241+
73242+ assert_spin_locked(&(atom->alock));
73243+
73244+ /* The flush starts from LEAF_LEVEL (=1). */
73245+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73246+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
73247+ continue;
73248+
73249+ first_dirty =
73250+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
73251+ flags);
73252+ if (first_dirty)
73253+ return first_dirty;
73254+ }
73255+
73256+ /* znode-above-root is on the list #0. */
73257+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
73258+}
73259+
73260+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
73261+{
73262+ jnode *cur;
73263+
73264+ assert("zam-905", atom_is_protected(atom));
73265+
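+ /* walk the writeback list by hand, fetching each successor before
+ * the current jnode can be moved to another list */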
73266+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
73267+ while (ATOM_WB_LIST(atom) != &cur->capture_link) {
73268+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
73269+
73270+ spin_lock_jnode(cur);
73271+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
73272+ if (JF_ISSET(cur, JNODE_DIRTY)) {
73273+ queue_jnode(fq, cur);
73274+ } else {
73275+ /* move jnode to atom's clean list */
73276+ list_move_tail(&cur->capture_link,
73277+ ATOM_CLEAN_LIST(atom));
73278+ }
73279+ }
73280+ spin_unlock_jnode(cur);
73281+
73282+ cur = next;
73283+ }
73284+}
73285+
73286+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
73287+ * jnodes to disk. */
73288+static int submit_wb_list(void)
73289+{
73290+ int ret;
73291+ flush_queue_t *fq;
73292+
73293+ fq = get_fq_for_current_atom();
73294+ if (IS_ERR(fq))
73295+ return PTR_ERR(fq);
73296+
73297+ dispatch_wb_list(fq->atom, fq);
73298+ spin_unlock_atom(fq->atom);
73299+
73300+ ret = reiser4_write_fq(fq, NULL, 1);
73301+ reiser4_fq_put(fq);
73302+
73303+ return ret;
73304+}
73305+
73306+/* Wait for completion of all writes; re-submit the atom's writeback list if needed. */
73307+static int current_atom_complete_writes(void)
73308+{
73309+ int ret;
73310+
73311+ /* Each jnode from that list was modified and dirtied when it had i/o
73312+ * request running already. After i/o completion we have to resubmit
73313+ * them to disk again.*/
73314+ ret = submit_wb_list();
73315+ if (ret < 0)
73316+ return ret;
73317+
73318+ /* Wait all i/o completion */
73319+ ret = current_atom_finish_all_fq();
73320+ if (ret)
73321+ return ret;
73322+
73323+ /* Scan wb list again; all i/o should be completed, we re-submit dirty
73324+ * nodes to disk */
73325+ ret = submit_wb_list();
73326+ if (ret < 0)
73327+ return ret;
73328+
73329+ /* Wait all nodes we just submitted */
73330+ return current_atom_finish_all_fq();
73331+}
73332+
73333+#if REISER4_DEBUG
73334+
73335+static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
73336+{
73337+ if (atom == NULL) {
73338+ printk("%s: no atom\n", prefix);
73339+ return;
73340+ }
73341+
73342+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
73343+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
73344+ atomic_read(&atom->refcount), atom->atom_id, atom->flags,
73345+ atom->txnh_count, atom->capture_count, atom->stage,
73346+ atom->start_time, atom->flushed);
73347+}
73348+
73349+#else /* REISER4_DEBUG */
73350+
73351+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
73352+
73353+#endif /* REISER4_DEBUG */
73354+
73355+#define TOOMANYFLUSHES (1 << 13)
73356+
73357+/* Called with the atom locked and no open "active" transaction handlers except
73358+ ours, this function calls flush_current_atom() until all dirty nodes are
73359+ processed. Then it initiates commit processing.
73360+
73361+ Called by the single remaining open "active" txnh, which is closing. Other
73362+ open txnhs belong to processes which wait atom commit in commit_txnh()
73363+ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
73364+ long as we hold the atom lock none of the jnodes can be captured and/or
73365+ locked.
73366+
73367+ Return value is an error code if commit fails.
73368+*/
73369+static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
73370+{
73371+ reiser4_super_info_data *sbinfo = get_current_super_private();
73372+ long ret = 0;
73373+ /* how many times jnode_flush() was called as a part of attempt to
73374+ * commit this atom. */
73375+ int flushiters;
73376+
73377+ assert("zam-888", atom != NULL && *atom != NULL);
73378+ assert_spin_locked(&((*atom)->alock));
73379+ assert("zam-887", get_current_context()->trans->atom == *atom);
73380+ assert("jmacd-151", atom_isopen(*atom));
73381+
73382+ assert("nikita-3184",
73383+ get_current_super_private()->delete_mutex_owner != current);
73384+
73385+ for (flushiters = 0;; ++flushiters) {
73386+ ret =
73387+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
73388+ JNODE_FLUSH_COMMIT,
73389+ LONG_MAX /* nr_to_write */ ,
73390+ nr_submitted, atom, NULL);
73391+ if (ret != -E_REPEAT)
73392+ break;
73393+
73394+ /* if atom's dirty list contains one znode which is
73395+ HEARD_BANSHEE and is locked we have to allow lock owner to
73396+ continue and uncapture that znode */
73397+ reiser4_preempt_point();
73398+
73399+ *atom = get_current_atom_locked();
73400+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
73401+ warning("nikita-3176",
73402+ "Flushing like mad: %i", flushiters);
73403+ reiser4_info_atom("atom", *atom);
73404+ DEBUGON(flushiters > (1 << 20));
73405+ }
73406+ }
73407+
73408+ if (ret)
73409+ return ret;
73410+
73411+ assert_spin_locked(&((*atom)->alock));
73412+
73413+ if (!atom_can_be_committed(*atom)) {
73414+ spin_unlock_atom(*atom);
73415+ return RETERR(-E_REPEAT);
73416+ }
73417+
73418+ if ((*atom)->capture_count == 0)
73419+ goto done;
73420+
73421+ /* Up to this point we have been flushing and after flush is called we
73422+ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
73423+ at this point, commit should be successful. */
73424+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
73425+ ON_DEBUG(((*atom)->committer = current));
73426+ spin_unlock_atom(*atom);
73427+
73428+ ret = current_atom_complete_writes();
73429+ if (ret)
73430+ return ret;
73431+
73432+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
73433+
73434+ /* isolate critical code path which should be executed by only one
73435+ * thread using tmgr mutex */
73436+ mutex_lock(&sbinfo->tmgr.commit_mutex);
73437+
73438+ ret = reiser4_write_logs(nr_submitted);
73439+ if (ret < 0)
73440+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
73441+
73442+ /* The atom->ovrwr_nodes list is processed under commit mutex held
73443+ because of bitmap nodes which are captured in a special way in
73444+ reiser4_pre_commit_hook_bitmap(), a way that does not include
73445+ capture_fuse_wait() as capturing of other nodes does -- the commit
73446+ mutex is used for transaction isolation instead. */
73447+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
73448+ mutex_unlock(&sbinfo->tmgr.commit_mutex);
73449+
73450+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
73451+ reiser4_invalidate_list(ATOM_WB_LIST(*atom));
73452+ assert("zam-927", list_empty(&(*atom)->inodes));
73453+
73454+ spin_lock_atom(*atom);
73455+ done:
73456+ reiser4_atom_set_stage(*atom, ASTAGE_DONE);
73457+ ON_DEBUG((*atom)->committer = NULL);
73458+
73459+ /* Atom's state changes, so wake up everybody waiting for this
73460+ event. */
73461+ wakeup_atom_waiting_list(*atom);
73462+
73463+ /* Decrement the "until commit" reference, at least one txnh (the caller) is
73464+ still open. */
73465+ atomic_dec(&(*atom)->refcount);
73466+
73467+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
73468+ assert("jmacd-1062", (*atom)->capture_count == 0);
73469+ BUG_ON((*atom)->capture_count != 0);
73470+ assert_spin_locked(&((*atom)->alock));
73471+
73472+ return ret;
73473+}
73474+
73475+/* TXN_TXNH */
73476+
73477+/**
73478+ * force_commit_atom - commit current atom and wait commit completion
73479+ * @txnh:
73480+ *
73481+ * Commits current atom and wait commit completion; current atom and @txnh have
73482+ * to be spinlocked before call, this function unlocks them on exit.
73483+ */
73484+int force_commit_atom(txn_handle *txnh)
73485+{
73486+ txn_atom *atom;
73487+
73488+ assert("zam-837", txnh != NULL);
73489+ assert_spin_locked(&(txnh->hlock));
73490+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
73491+
73492+ atom = txnh->atom;
73493+
73494+ assert("zam-834", atom != NULL);
73495+ assert_spin_locked(&(atom->alock));
73496+
73497+ /*
73498+ * Set flags for atom and txnh: forcing atom commit and waiting for
73499+ * commit completion
73500+ */
73501+ txnh->flags |= TXNH_WAIT_COMMIT;
73502+ atom->flags |= ATOM_FORCE_COMMIT;
73503+
73504+ spin_unlock_txnh(txnh);
73505+ spin_unlock_atom(atom);
73506+
73507+ /* commit is here */
73508+ reiser4_txn_restart_current();
73509+ return 0;
73510+}
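+/* A typical caller (e.g. txnmgr_force_commit_all() below) first attaches
+ * its handle to the atom with capture_assign_txnh_nolock() and then calls
+ * force_commit_atom() with both spinlocks held. */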
73511+
73512+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
73513+ * whether we commit all atoms, including new ones created after this
73514+ * function is called. */
73515+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
73516+{
73517+ int ret;
73518+ txn_atom *atom;
73519+ txn_mgr *mgr;
73520+ txn_handle *txnh;
73521+ unsigned long start_time = jiffies;
73522+ reiser4_context *ctx = get_current_context();
73523+
73524+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
73525+ assert("nikita-3058", reiser4_commit_check_locks());
73526+
73527+ reiser4_txn_restart_current();
73528+
73529+ mgr = &get_super_private(super)->tmgr;
73530+
73531+ txnh = ctx->trans;
73532+
73533+ again:
73534+
73535+ spin_lock_txnmgr(mgr);
73536+
73537+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
73538+ spin_lock_atom(atom);
73539+
73540+ /* Commit any atom which can be committed. If @commit_new_atoms
73541+ * is not set we commit only atoms which were created before
73542+ * this call is started. */
73543+ if (commit_all_atoms
73544+ || time_before_eq(atom->start_time, start_time)) {
73545+ if (atom->stage <= ASTAGE_POST_COMMIT) {
73546+ spin_unlock_txnmgr(mgr);
73547+
73548+ if (atom->stage < ASTAGE_PRE_COMMIT) {
73549+ spin_lock_txnh(txnh);
73550+ /* Add force-context txnh */
73551+ capture_assign_txnh_nolock(atom, txnh);
73552+ ret = force_commit_atom(txnh);
73553+ if (ret)
73554+ return ret;
73555+ } else
73556+ /* wait atom commit */
73557+ reiser4_atom_wait_event(atom);
73558+
73559+ goto again;
73560+ }
73561+ }
73562+
73563+ spin_unlock_atom(atom);
73564+ }
73565+
73566+#if REISER4_DEBUG
73567+ if (commit_all_atoms) {
73568+ reiser4_super_info_data *sbinfo = get_super_private(super);
73569+ spin_lock_reiser4_super(sbinfo);
73570+ assert("zam-813",
73571+ sbinfo->blocks_fake_allocated_unformatted == 0);
73572+ assert("zam-812", sbinfo->blocks_fake_allocated == 0);
73573+ spin_unlock_reiser4_super(sbinfo);
73574+ }
73575+#endif
73576+
73577+ spin_unlock_txnmgr(mgr);
73578+
73579+ return 0;
73580+}
73581+
73582+/* check whether commit_some_atoms() can commit @atom. Locking is up to the
73583+ * caller */
73584+static int atom_is_committable(txn_atom * atom)
73585+{
73586+ return
73587+ atom->stage < ASTAGE_PRE_COMMIT &&
73588+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
73589+}
73590+
73591+/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
73592+ * lock at exit */
73593+int commit_some_atoms(txn_mgr * mgr)
73594+{
73595+ int ret = 0;
73596+ txn_atom *atom;
73597+ txn_handle *txnh;
73598+ reiser4_context *ctx;
73599+ struct list_head *pos, *tmp;
73600+
73601+ ctx = get_current_context();
73602+ assert("nikita-2444", ctx != NULL);
73603+
73604+ txnh = ctx->trans;
73605+ spin_lock_txnmgr(mgr);
73606+
73607+ /*
73608+ * this is to avoid gcc complain that atom might be used
73609+ * uninitialized
73610+ */
73611+ atom = NULL;
73612+
73613+ /* look for atom to commit */
73614+ list_for_each_safe(pos, tmp, &mgr->atoms_list) {
73615+ atom = list_entry(pos, txn_atom, atom_link);
73616+ /*
73617+ * first test without taking atom spin lock, whether it is
73618+ * eligible for committing at all
73619+ */
73620+ if (atom_is_committable(atom)) {
73621+ /* now, take spin lock and re-check */
73622+ spin_lock_atom(atom);
73623+ if (atom_is_committable(atom))
73624+ break;
73625+ spin_unlock_atom(atom);
73626+ }
73627+ }
73628+
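+ /* if the loop ran to completion, @pos points back at the list head:
+ * no committable atom was found */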
73629+ ret = (&mgr->atoms_list == pos);
73630+ spin_unlock_txnmgr(mgr);
73631+
73632+ if (ret) {
73633+ /* nothing found */
73634+ spin_unlock(&mgr->daemon->guard);
73635+ return 0;
73636+ }
73637+
73638+ spin_lock_txnh(txnh);
73639+
73640+ BUG_ON(atom == NULL);
73641+ /* Set the atom to force committing */
73642+ atom->flags |= ATOM_FORCE_COMMIT;
73643+
73644+ /* Add force-context txnh */
73645+ capture_assign_txnh_nolock(atom, txnh);
73646+
73647+ spin_unlock_txnh(txnh);
73648+ spin_unlock_atom(atom);
73649+
73650+ /* we are about to release daemon spin lock, notify daemon it
73651+ has to rescan atoms */
73652+ mgr->daemon->rescan = 1;
73653+ spin_unlock(&mgr->daemon->guard);
73654+ reiser4_txn_restart_current();
73655+ return 0;
73656+}
73657+
73658+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
73659+{
73660+ int atom_stage;
73661+ txn_atom *atom_2;
73662+ int repeat;
73663+
73664+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
73665+
73666+ atom_stage = atom->stage;
73667+ repeat = 0;
73668+
73669+ if (!spin_trylock_txnmgr(tmgr)) {
73670+ atomic_inc(&atom->refcount);
73671+ spin_unlock_atom(atom);
73672+ spin_lock_txnmgr(tmgr);
73673+ spin_lock_atom(atom);
73674+ repeat = 1;
73675+ if (atom->stage != atom_stage) {
73676+ spin_unlock_txnmgr(tmgr);
73677+ atom_dec_and_unlock(atom);
73678+ return -E_REPEAT;
73679+ }
73680+ atomic_dec(&atom->refcount);
73681+ }
73682+
73683+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
73684+ if (atom == atom_2)
73685+ continue;
73686+ /*
73687+ * if trylock does not succeed we just do not fuse with that
73688+ * atom.
73689+ */
73690+ if (spin_trylock_atom(atom_2)) {
73691+ if (atom_2->stage < ASTAGE_PRE_COMMIT) {
73692+ spin_unlock_txnmgr(tmgr);
73693+ capture_fuse_into(atom_2, atom);
73694+ /* all locks are lost we can only repeat here */
73695+ return -E_REPEAT;
73696+ }
73697+ spin_unlock_atom(atom_2);
73698+ }
73699+ }
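+ /* nothing suitable to fuse with: mark the atom so that
+ * flush_some_atom() does not retry fusion for it again and again */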
73700+ atom->flags |= ATOM_CANCEL_FUSION;
73701+ spin_unlock_txnmgr(tmgr);
73702+ if (repeat) {
73703+ spin_unlock_atom(atom);
73704+ return -E_REPEAT;
73705+ }
73706+ return 0;
73707+}
73708+
73709+/* Calls jnode_flush() for the current atom if it exists; if not, just takes
73710+ another atom and calls jnode_flush() for it. If the current transaction
73711+ handle has an already assigned atom (the current atom), we have to close
73712+ the current transaction prior to switching to another atom, or else do
73713+ something with the current atom. This code tries to flush the current atom.
73714+
73715+ flush_some_atom() is called as part of memory clearing process. It is
73716+ invoked from balance_dirty_pages(), pdflushd, and entd.
73717+
73718+ If we can flush no nodes, atom is committed, because this frees memory.
73719+
73720+ If atom is too large or too old it is committed also.
73721+*/
73722+int
73723+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
73724+ int flags)
73725+{
73726+ reiser4_context *ctx = get_current_context();
73727+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
73728+ txn_handle *txnh = ctx->trans;
73729+ txn_atom *atom;
73730+ int ret;
73731+
73732+ BUG_ON(wbc->nr_to_write == 0);
73733+ BUG_ON(*nr_submitted != 0);
73734+ assert("zam-1042", txnh != NULL);
73735+ repeat:
73736+ if (txnh->atom == NULL) {
73737+ /* current atom is not available, take first from txnmgr */
73738+ spin_lock_txnmgr(tmgr);
73739+
73740+ /* traverse the list of all atoms */
73741+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73742+ /* lock atom before checking its state */
73743+ spin_lock_atom(atom);
73744+
73745+ /*
73746+ * we need an atom which is not being committed and
73747+ * which has no flushers (jnode_flush() add one flusher
73748+ * at the beginning and subtract one at the end).
73749+ */
73750+ if (atom->stage < ASTAGE_PRE_COMMIT &&
73751+ atom->nr_flushers == 0) {
73752+ spin_lock_txnh(txnh);
73753+ capture_assign_txnh_nolock(atom, txnh);
73754+ spin_unlock_txnh(txnh);
73755+
73756+ goto found;
73757+ }
73758+
73759+ spin_unlock_atom(atom);
73760+ }
73761+
73762+ /*
73763+ * Write throttling is the case when no atom can be
73764+ * flushed/committed.
73765+ */
73766+ if (!current_is_pdflush() && !wbc->nonblocking) {
73767+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73768+ spin_lock_atom(atom);
73769+ /* Repeat the check from the above. */
73770+ if (atom->stage < ASTAGE_PRE_COMMIT
73771+ && atom->nr_flushers == 0) {
73772+ spin_lock_txnh(txnh);
73773+ capture_assign_txnh_nolock(atom, txnh);
73774+ spin_unlock_txnh(txnh);
73775+
73776+ goto found;
73777+ }
73778+ if (atom->stage <= ASTAGE_POST_COMMIT) {
73779+ spin_unlock_txnmgr(tmgr);
73780+ /*
73781+ * we just wait until atom's flusher
73782+ * makes a progress in flushing or
73783+ * committing the atom
73784+ */
73785+ reiser4_atom_wait_event(atom);
73786+ goto repeat;
73787+ }
73788+ spin_unlock_atom(atom);
73789+ }
73790+ }
73791+ spin_unlock_txnmgr(tmgr);
73792+ return 0;
73793+ found:
73794+ spin_unlock_txnmgr(tmgr);
73795+ } else
73796+ atom = get_current_atom_locked();
73797+
73798+ BUG_ON(atom->super != ctx->super);
73799+ assert("vs-35", atom->super == ctx->super);
73800+ if (start) {
73801+ spin_lock_jnode(start);
73802+ ret = (atom == start->atom) ? 1 : 0;
73803+ spin_unlock_jnode(start);
73804+ if (ret == 0)
73805+ start = NULL;
73806+ }
73807+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
73808+ if (ret == 0) {
73809+ /* flush_current_atom returns 0 only if it submitted for write
73810+ nothing */
73811+ BUG_ON(*nr_submitted != 0);
73812+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
73813+ if (atom->capture_count < tmgr->atom_min_size &&
73814+ !(atom->flags & ATOM_CANCEL_FUSION)) {
73815+ ret = txn_try_to_fuse_small_atom(tmgr, atom);
73816+ if (ret == -E_REPEAT) {
73817+ reiser4_preempt_point();
73818+ goto repeat;
73819+ }
73820+ }
73821+ /* if early flushing could not make more nodes clean,
73822+ * or atom is too old/large,
73823+ * we force current atom to commit */
73824+ /* wait for commit completion but only if this
73825+ * wouldn't stall pdflushd and ent thread. */
73826+ if (!wbc->nonblocking && !ctx->entd)
73827+ txnh->flags |= TXNH_WAIT_COMMIT;
73828+ atom->flags |= ATOM_FORCE_COMMIT;
73829+ }
73830+ spin_unlock_atom(atom);
73831+ } else if (ret == -E_REPEAT) {
73832+ if (*nr_submitted == 0) {
73833+ /* let others who hamper flushing (hold long-term
73834+ locks, for instance) free the way for flush */
73835+ reiser4_preempt_point();
73836+ goto repeat;
73837+ }
73838+ ret = 0;
73839+ }
73840+/*
73841+ if (*nr_submitted > wbc->nr_to_write)
73842+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
73843+*/
73844+ reiser4_txn_restart(ctx);
73845+
73846+ return ret;
73847+}
73848+
73849+/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */
73850+void reiser4_invalidate_list(struct list_head *head)
73851+{
73852+ while (!list_empty(head)) {
73853+ jnode *node;
73854+
73855+ node = list_entry(head->next, jnode, capture_link);
73856+ spin_lock_jnode(node);
73857+ reiser4_uncapture_block(node);
73858+ jput(node);
73859+ }
73860+}
73861+
73862+static void init_wlinks(txn_wait_links * wlinks)
73863+{
73864+ wlinks->_lock_stack = get_current_lock_stack();
73865+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
73866+ INIT_LIST_HEAD(&wlinks->_fwaiting_link);
73867+ wlinks->waitfor_cb = NULL;
73868+ wlinks->waiting_cb = NULL;
73869+}
73870+
73871+/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */
73872+void reiser4_atom_wait_event(txn_atom * atom)
73873+{
73874+ txn_wait_links _wlinks;
73875+
73876+ assert_spin_locked(&(atom->alock));
73877+ assert("nikita-3156",
73878+ lock_stack_isclean(get_current_lock_stack()) ||
73879+ atom->nr_running_queues > 0);
73880+
73881+ init_wlinks(&_wlinks);
73882+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
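+ /* pin the atom so it survives while we sleep without its lock held */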
73883+ atomic_inc(&atom->refcount);
73884+ spin_unlock_atom(atom);
73885+
73886+ reiser4_prepare_to_sleep(_wlinks._lock_stack);
73887+ reiser4_go_to_sleep(_wlinks._lock_stack);
73888+
73889+ spin_lock_atom(atom);
73890+ list_del(&_wlinks._fwaitfor_link);
73891+ atom_dec_and_unlock(atom);
73892+}
73893+
73894+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
73895+{
73896+ assert("nikita-3535", atom != NULL);
73897+ assert_spin_locked(&(atom->alock));
73898+ assert("nikita-3536", stage <= ASTAGE_INVALID);
73899+ /* Excelsior! */
73900+ assert("nikita-3537", stage >= atom->stage);
73901+ if (atom->stage != stage) {
73902+ atom->stage = stage;
73903+ reiser4_atom_send_event(atom);
73904+ }
73905+}
73906+
73907+/* wake all threads which wait for an event */
73908+void reiser4_atom_send_event(txn_atom * atom)
73909+{
73910+ assert_spin_locked(&(atom->alock));
73911+ wakeup_atom_waitfor_list(atom);
73912+}
73913+
73914+/* Informs txn manager code that the owner of this txn_handle should wait for atom commit
73915+ completion (for example, because it does fsync(2)) */
73916+static int should_wait_commit(txn_handle * h)
73917+{
73918+ return h->flags & TXNH_WAIT_COMMIT;
73919+}
73920+
73921+typedef struct commit_data {
73922+ txn_atom *atom;
73923+ txn_handle *txnh;
73924+ long nr_written;
73925+ /* as an optimization we start committing atom by first trying to
73926+ * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This
73927+ * allows to reduce stalls due to other threads waiting for atom in
73928+ * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these
73929+ * preliminary flushes. */
73930+ int preflush;
73931+ /* have we waited on atom. */
73932+ int wait;
73933+ int failed;
73934+ int wake_ktxnmgrd_up;
73935+} commit_data;
73936+
73937+/*
73938+ * Called from commit_txnh() repeatedly, until either error happens, or atom
73939+ * commits successfully.
73940+ */
73941+static int try_commit_txnh(commit_data * cd)
73942+{
73943+ int result;
73944+
73945+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
73946+
73947+ /* Get the atom and txnh locked. */
73948+ cd->atom = txnh_get_atom(cd->txnh);
73949+ assert("jmacd-309", cd->atom != NULL);
73950+ spin_unlock_txnh(cd->txnh);
73951+
73952+ if (cd->wait) {
73953+ cd->atom->nr_waiters--;
73954+ cd->wait = 0;
73955+ }
73956+
73957+ if (cd->atom->stage == ASTAGE_DONE)
73958+ return 0;
73959+
73960+ if (cd->failed)
73961+ return 0;
73962+
73963+ if (atom_should_commit(cd->atom)) {
73964+ /* if atom is _very_ large schedule it for commit as soon as
73965+ * possible. */
73966+ if (atom_should_commit_asap(cd->atom)) {
73967+ /*
73968+ * When atom is in PRE_COMMIT or later stage following
73969+ * invariant (encoded in atom_can_be_committed())
73970+ * holds: there is exactly one non-waiter transaction
73971+ * handle opened on this atom. When thread wants to
73972+ * wait until atom commits (for example sync()) it
73973+ * waits on atom event after increasing
73974+ * atom->nr_waiters (see below in this function). It
73975+ * cannot be guaranteed that atom is already committed
73976+ * after receiving event, so loop has to be
73977+ * re-started. But if atom switched into PRE_COMMIT
73978+ * stage and became too large, we cannot change its
73979+ * state back to CAPTURE_WAIT (atom stage can only
73980+ * increase monotonically), hence this check.
73981+ */
73982+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
73983+ reiser4_atom_set_stage(cd->atom,
73984+ ASTAGE_CAPTURE_WAIT);
73985+ cd->atom->flags |= ATOM_FORCE_COMMIT;
73986+ }
73987+ if (cd->txnh->flags & TXNH_DONT_COMMIT) {
73988+ /*
73989+ * this thread (transaction handle that is) doesn't
73990+ * want to commit atom. Notify waiters that handle is
73991+ * closed. This can happen, for example, when we are
73992+ * under VFS directory lock and don't want to commit
73993+ * atom right now to avoid stalling other threads
73994+ * working in the same directory.
73995+ */
73996+
73997+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
73998+ * commit this atom: no atom waiters and only one
73999+ * (our) open transaction handle. */
74000+ cd->wake_ktxnmgrd_up =
74001+ cd->atom->txnh_count == 1 &&
74002+ cd->atom->nr_waiters == 0;
74003+ reiser4_atom_send_event(cd->atom);
74004+ result = 0;
74005+ } else if (!atom_can_be_committed(cd->atom)) {
74006+ if (should_wait_commit(cd->txnh)) {
74007+ /* sync(): wait for commit */
74008+ cd->atom->nr_waiters++;
74009+ cd->wait = 1;
74010+ reiser4_atom_wait_event(cd->atom);
74011+ result = RETERR(-E_REPEAT);
74012+ } else {
74013+ result = 0;
74014+ }
74015+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74016+ /*
74017+ * optimization: flush atom without switching it into
74018+ * ASTAGE_CAPTURE_WAIT.
74019+ *
74020+ * But don't do this for ktxnmgrd, because ktxnmgrd
74021+ * should never block on atom fusion.
74022+ */
74023+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74024+ LONG_MAX, &cd->nr_written,
74025+ &cd->atom, NULL);
74026+ if (result == 0) {
74027+ spin_unlock_atom(cd->atom);
74028+ cd->preflush = 0;
74029+ result = RETERR(-E_REPEAT);
74030+ } else /* Atom wasn't flushed
74031+ * completely. Rinse. Repeat. */
74032+ --cd->preflush;
74033+ } else {
74034+ /* We change atom state to ASTAGE_CAPTURE_WAIT to
74035+ prevent atom fusion and count ourself as an active
74036+ flusher */
74037+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74038+ cd->atom->flags |= ATOM_FORCE_COMMIT;
74039+
74040+ result =
74041+ commit_current_atom(&cd->nr_written, &cd->atom);
74042+ if (result != 0 && result != -E_REPEAT)
74043+ cd->failed = 1;
74044+ }
74045+ } else
74046+ result = 0;
74047+
74048+#if REISER4_DEBUG
74049+ if (result == 0)
74050+ assert_spin_locked(&(cd->atom->alock));
74051+#endif
74052+
74053+ /* perfectly valid assertion, except that when atom/txnh is not locked
74054+ * fusion can take place, and cd->atom points nowhere. */
74055+ /*
74056+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74057+ */
74058+ return result;
74059+}
74060+
74061+/* Called to commit a transaction handle. This decrements the atom's number of open
74062+ handles and if it is the last handle to commit and the atom should commit, initiates
74063+ atom commit. If commit does not fail, return the number of written blocks */
74064+static int commit_txnh(txn_handle * txnh)
74065+{
74066+ commit_data cd;
74067+ assert("umka-192", txnh != NULL);
74068+
74069+ memset(&cd, 0, sizeof cd);
74070+ cd.txnh = txnh;
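+ /* allow up to ten preliminary flushes before the atom is switched
+ * into ASTAGE_CAPTURE_WAIT (see the commit_data comment above) */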
74071+ cd.preflush = 10;
74072+
74073+ /* calls try_commit_txnh() until either atom commits, or error
74074+ * happens */
74075+ while (try_commit_txnh(&cd) != 0)
74076+ reiser4_preempt_point();
74077+
74078+ spin_lock_txnh(txnh);
74079+
74080+ cd.atom->txnh_count -= 1;
74081+ txnh->atom = NULL;
74082+ /* remove transaction handle from atom's list of transaction handles */
74083+ list_del_init(&txnh->txnh_link);
74084+
74085+ spin_unlock_txnh(txnh);
74086+ atom_dec_and_unlock(cd.atom);
74087+ /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
74088+ * because it takes time) by current thread, we do that work
74089+ * asynchronously by ktxnmgrd daemon. */
74090+ if (cd.wake_ktxnmgrd_up)
74091+ ktxnmgrd_kick(&get_current_super_private()->tmgr);
74092+
74093+ return 0;
74094+}
74095+
74096+/* TRY_CAPTURE */
74097+
74098+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74099+ condition indicates that the request should be retried, and it may block if the
74100+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74101+
74102+ This routine encodes the basic logic of block capturing described by:
74103+
74104+ http://namesys.com/v4/v4.html
74105+
74106+ Our goal here is to ensure that any two blocks that contain dependent modifications
74107+ should commit at the same time. This function enforces this discipline by initiating
74108+ fusion whenever a transaction handle belonging to one atom requests to read or write a
74109+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74110+
74111+ In addition, this routine handles the initial assignment of atoms to blocks and
74112+ transaction handles. These are possible outcomes of this function:
74113+
74114+ 1. The block and handle are already part of the same atom: return immediate success
74115+
74116+ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74117+ the handle to the block's atom.
74118+
74119+ 3. The handle is assigned but the block is not: call capture_assign_block to assign
74120+ the block to the handle's atom.
74121+
74122+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74123+ to fuse atoms.
74124+
74125+ 5. Neither block nor handle are assigned: create a new atom and assign them both.
74126+
74127+ 6. A read request for a non-captured block: return immediate success.
74128+
74129+ This function acquires and releases the handle's spinlock. This function is called
74130+ under the jnode lock and if the return value is 0, it returns with the jnode lock still
74131+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74132+ released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
74133+ lock in the failure case.
74134+*/
74135+static int try_capture_block(
74136+ txn_handle * txnh, jnode * node, txn_capture mode,
74137+ txn_atom ** atom_alloc)
74138+{
74139+ txn_atom *block_atom;
74140+ txn_atom *txnh_atom;
74141+
74142+ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
74143+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74144+
74145+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74146+ * node->tree somewhere. */
74147+ assert("umka-194", txnh != NULL);
74148+ assert("umka-195", node != NULL);
74149+
74150+ /* The jnode is already locked! Being called from reiser4_try_capture(). */
74151+ assert_spin_locked(&(node->guard));
74152+ block_atom = node->atom;
74153+
74154+ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
74155+ let us touch the atoms themselves. */
74156+ spin_lock_txnh(txnh);
74157+ txnh_atom = txnh->atom;
74158+ /* Process of capturing continues into one of four branches depends on
74159+ which atoms from (block atom (node->atom), current atom (txnh->atom))
74160+ exist. */
74161+ if (txnh_atom == NULL) {
74162+ if (block_atom == NULL) {
74163+ spin_unlock_txnh(txnh);
74164+ spin_unlock_jnode(node);
74165+ /* assign empty atom to the txnh and repeat */
74166+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74167+ } else {
74168+ atomic_inc(&block_atom->refcount);
74169+ /* node spin-lock isn't needed anymore */
74170+ spin_unlock_jnode(node);
74171+ if (!spin_trylock_atom(block_atom)) {
74172+ spin_unlock_txnh(txnh);
74173+ spin_lock_atom(block_atom);
74174+ spin_lock_txnh(txnh);
74175+ }
74176+ /* re-check state after getting txnh and the node
74177+ * atom spin-locked */
74178+ if (node->atom != block_atom || txnh->atom != NULL) {
74179+ spin_unlock_txnh(txnh);
74180+ atom_dec_and_unlock(block_atom);
74181+ return RETERR(-E_REPEAT);
74182+ }
74183+ atomic_dec(&block_atom->refcount);
74184+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74185+ (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74186+ block_atom->txnh_count != 0))
74187+ return capture_fuse_wait(txnh, block_atom, NULL, mode);
74188+ capture_assign_txnh_nolock(block_atom, txnh);
74189+ spin_unlock_txnh(txnh);
74190+ spin_unlock_atom(block_atom);
74191+ return RETERR(-E_REPEAT);
74192+ }
74193+ } else {
74194+ /* It is time to perform deadlock prevention check over the
74195+ node we want to capture. It is possible this node was locked
74196+ for read without capturing it. The optimization which allows
74197+ to do it helps us in keeping atoms independent as long as
74198+ possible but it may cause lock/fuse deadlock problems.
74199+
74200+ A number of similar deadlock situations with locked but not
74201+ captured nodes were found. In each situation there are two
74202+ or more threads: one of them does flushing while another one
74203+ does routine balancing or tree lookup. The flushing thread
74204+ (F) sleeps in long term locking request for node (N), another
74205+ thread (A) sleeps in trying to capture some node already
74206+ belonging the atom F, F has a state which prevents
74207+ immediately fusion .
74208+
74209+ Deadlocks of this kind cannot happen if node N was properly
74210+ captured by thread A. The F thread fuse atoms before locking
74211+ therefore current atom of thread F and current atom of thread
74212+ A became the same atom and thread A may proceed. This does
74213+ not work if node N was not captured because the fusion of
74214+ atom does not happens.
74215+
74216+ The following scheme solves the deadlock: If
74217+ longterm_lock_znode locks and does not capture a znode, that
74218+ znode is marked as MISSED_IN_CAPTURE. A node marked this way
74219+ is processed by the code below which restores the missed
74220+ capture and fuses current atoms of all the node lock owners
74221+ by calling the fuse_not_fused_lock_owners() function. */
74222+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
74223+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
74224+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
74225+ spin_unlock_txnh(txnh);
74226+ spin_unlock_jnode(node);
74227+ fuse_not_fused_lock_owners(txnh, JZNODE(node));
74228+ return RETERR(-E_REPEAT);
74229+ }
74230+ }
74231+ if (block_atom == NULL) {
74232+ atomic_inc(&txnh_atom->refcount);
74233+ spin_unlock_txnh(txnh);
74234+ if (!spin_trylock_atom(txnh_atom)) {
74235+ spin_unlock_jnode(node);
74236+ spin_lock_atom(txnh_atom);
74237+ spin_lock_jnode(node);
74238+ }
74239+ if (txnh->atom != txnh_atom || node->atom != NULL
74240+ || JF_ISSET(node, JNODE_IS_DYING)) {
74241+ spin_unlock_jnode(node);
74242+ atom_dec_and_unlock(txnh_atom);
74243+ return RETERR(-E_REPEAT);
74244+ }
74245+ atomic_dec(&txnh_atom->refcount);
74246+ capture_assign_block_nolock(txnh_atom, node);
74247+ spin_unlock_atom(txnh_atom);
74248+ } else {
74249+ if (txnh_atom != block_atom) {
74250+ if (mode & TXN_CAPTURE_DONT_FUSE) {
74251+ spin_unlock_txnh(txnh);
74252+ spin_unlock_jnode(node);
74253+ /* we are in a "no-fusion" mode and @node is
74254+ * already part of transaction. */
74255+ return RETERR(-E_NO_NEIGHBOR);
74256+ }
74257+ return capture_init_fusion(node, txnh, mode);
74258+ }
74259+ spin_unlock_txnh(txnh);
74260+ }
74261+ }
74262+ return 0;
74263+}
74264+
74265+static txn_capture
74266+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
74267+{
74268+ txn_capture cap_mode;
74269+
74270+ assert_spin_locked(&(node->guard));
74271+
74272+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
74273+
74274+ if (lock_mode == ZNODE_WRITE_LOCK) {
74275+ cap_mode = TXN_CAPTURE_WRITE;
74276+ } else if (node->atom != NULL) {
74277+ cap_mode = TXN_CAPTURE_WRITE;
74278+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
74279+ jnode_get_level(node) == LEAF_LEVEL) {
74280+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
74281+ /* We only need a READ_FUSING capture at the leaf level. This
74282+ is because the internal levels of the tree (twigs included)
74283+ are redundant from the point of view of the user that asked for a
74284+ read-fusing transcrash. The user only wants to read-fuse
74285+ atoms due to reading uncommitted data that another user has
74286+ written. It is the file system that reads/writes the
74287+ internal tree levels, the user only reads/writes leaves. */
74288+ cap_mode = TXN_CAPTURE_READ_ATOMIC;
74289+ } else {
74290+ /* In this case (read lock at a non-leaf) there's no reason to
74291+ * capture. */
74292+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
74293+ return 0;
74294+ }
74295+
74296+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
74297+ assert("nikita-3186", cap_mode != 0);
74298+ return cap_mode;
74299+}
74300+
74301+/* This is an external interface to try_capture_block(), it calls
74302+ try_capture_block() repeatedly as long as -E_REPEAT is returned.
74303+
74304+ @node: node to capture,
74305+ @lock_mode: read or write lock is used in capture mode calculation,
74306+ @flags: see txn_capture flags enumeration,
74307+ @can_coc : can copy-on-capture
74308+
74309+ @return: 0 - node was successfully captured, -E_REPEAT - capture request
74310+ cannot be processed immediately as it was requested in flags,
74311+ < 0 - other errors.
74312+*/
74313+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
74314+ txn_capture flags)
74315+{
74316+ txn_atom *atom_alloc = NULL;
74317+ txn_capture cap_mode;
74318+ txn_handle *txnh = get_current_context()->trans;
74319+ int ret;
74320+
74321+ assert_spin_locked(&(node->guard));
74322+
74323+ repeat:
74324+ if (JF_ISSET(node, JNODE_IS_DYING))
74325+ return RETERR(-EINVAL);
74326+ if (node->atom != NULL && txnh->atom == node->atom)
74327+ return 0;
74328+ cap_mode = build_capture_mode(node, lock_mode, flags);
74329+ if (cap_mode == 0 ||
74330+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
74331+ /* Mark this node as "MISSED". It helps in further deadlock
74332+ * analysis */
74333+ if (jnode_is_znode(node))
74334+ JF_SET(node, JNODE_MISSED_IN_CAPTURE);
74335+ return 0;
74336+ }
74337+ /* Repeat try_capture as long as -E_REPEAT is returned. */
74338+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
74339+ /* Regardless of non_blocking:
74340+
74341+ If ret == 0 then jnode is still locked.
74342+ If ret != 0 then jnode is unlocked.
74343+ */
74344+#if REISER4_DEBUG
74345+ if (ret == 0)
74346+ assert_spin_locked(&(node->guard));
74347+ else
74348+ assert_spin_not_locked(&(node->guard));
74349+#endif
74350+ assert_spin_not_locked(&(txnh->guard));
74351+
74352+ if (ret == -E_REPEAT) {
74353+ /* E_REPEAT implies all locks were released, therefore we need
74354+ to take the jnode's lock again. */
74355+ spin_lock_jnode(node);
74356+
74357+ /* Although this may appear to be a busy loop, it is not.
74358+ There are several conditions that cause E_REPEAT to be
74359+ returned by the call to try_capture_block, all cases
74360+ indicating some kind of state change that means you should
74361+ retry the request and will get a different result. In some
74362+ cases this could be avoided with some extra code, but
74363+ generally it is done because the necessary locks were
74364+ released as a result of the operation and repeating is the
74365+ simplest thing to do (less bug potential). The cases are:
74366+ atom fusion returns E_REPEAT after it completes (jnode and
74367+ txnh were unlocked); race conditions in assign_block,
74368+ assign_txnh, and init_fusion return E_REPEAT (trylock
74369+ failure); after going to sleep in capture_fuse_wait
74370+ (request was blocked but may now succeed). I'm not quite
74371+ sure how capture_copy works yet, but it may also return
74372+ E_REPEAT. When the request is legitimately blocked, the
74373+ requestor goes to sleep in fuse_wait, so this is not a busy
74374+ loop. */
74375+ /* NOTE-NIKITA: still don't understand:
74376+
74377+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
74378+
74379+ looks like busy loop?
74380+ */
74381+ goto repeat;
74382+ }
74383+
74384+ /* free extra atom object that was possibly allocated by
74385+ try_capture_block().
74386+
74387+ Do this before acquiring jnode spin lock to
74388+ minimize time spent under lock. --nikita */
74389+ if (atom_alloc != NULL) {
74390+ kmem_cache_free(_atom_slab, atom_alloc);
74391+ }
74392+
74393+ if (ret != 0) {
74394+ if (ret == -E_BLOCK) {
74395+ assert("nikita-3360",
74396+ cap_mode & TXN_CAPTURE_NONBLOCKING);
74397+ ret = -E_REPEAT;
74398+ }
74399+
74400+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May
74401+ want to fix the above code to avoid releasing the lock and
74402+ re-acquiring it, but there are cases were failure occurs
74403+ when the lock is not held, and those cases would need to be
74404+ modified to re-take the lock. */
74405+ spin_lock_jnode(node);
74406+ }
74407+
74408+ /* Jnode is still locked. */
74409+ assert_spin_locked(&(node->guard));
74410+ return ret;
74411+}
74412+
74413+static void release_two_atoms(txn_atom *one, txn_atom *two)
74414+{
74415+ spin_unlock_atom(one);
74416+ atom_dec_and_unlock(two);
74417+ spin_lock_atom(one);
74418+ atom_dec_and_unlock(one);
74419+}
74420+
74421+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
74422+ returned by that routine. The txn_capture request mode is computed here depending on
74423+ the transaction handle's type and the lock request. This is called from the depths of
74424+ the lock manager with the jnode lock held and it always returns with the jnode lock
74425+ held.
74426+*/
74427+
74428+/* fuse all 'active' atoms of lock owners of given node. */
74429+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
74430+{
74431+ lock_handle *lh;
74432+ int repeat;
74433+ txn_atom *atomh, *atomf;
74434+ reiser4_context *me = get_current_context();
74435+ reiser4_context *ctx = NULL;
74436+
74437+ assert_spin_not_locked(&(ZJNODE(node)->guard));
74438+ assert_spin_not_locked(&(txnh->hlock));
74439+
74440+ repeat:
74441+ repeat = 0;
74442+ atomh = txnh_get_atom(txnh);
74443+ spin_unlock_txnh(txnh);
74444+ assert("zam-692", atomh != NULL);
74445+
74446+ spin_lock_zlock(&node->lock);
74447+ /* inspect list of lock owners */
74448+ list_for_each_entry(lh, &node->lock.owners, owners_link) {
74449+ ctx = get_context_by_lock_stack(lh->owner);
74450+ if (ctx == me)
74451+ continue;
74452+ /* below we use two assumptions to avoid additional spin-locks
74453+ for checking the condition :
74454+
74455+ 1) if the lock stack has lock, the transaction should be
74456+ opened, i.e. ctx->trans != NULL;
74457+
74458+ 2) reading of well-aligned ctx->trans->atom is atomic, if it
74459+ equals to the address of spin-locked atomh, we take that
74460+ the atoms are the same, nothing has to be captured. */
74461+ if (atomh != ctx->trans->atom) {
74462+ reiser4_wake_up(lh->owner);
74463+ repeat = 1;
74464+ break;
74465+ }
74466+ }
74467+ if (repeat) {
74468+ if (!spin_trylock_txnh(ctx->trans)) {
74469+ spin_unlock_zlock(&node->lock);
74470+ spin_unlock_atom(atomh);
74471+ goto repeat;
74472+ }
74473+ atomf = ctx->trans->atom;
74474+ if (atomf == NULL) {
74475+ capture_assign_txnh_nolock(atomh, ctx->trans);
74476+ /* release zlock lock _after_ assigning the atom to the
74477+ * transaction handle, otherwise the lock owner thread
74478+ * may unlock all znodes, exit kernel context and here
74479+ * we would access an invalid transaction handle. */
74480+ spin_unlock_zlock(&node->lock);
74481+ spin_unlock_atom(atomh);
74482+ spin_unlock_txnh(ctx->trans);
74483+ goto repeat;
74484+ }
74485+ assert("zam-1059", atomf != atomh);
74486+ spin_unlock_zlock(&node->lock);
74487+ atomic_inc(&atomh->refcount);
74488+ atomic_inc(&atomf->refcount);
74489+ spin_unlock_txnh(ctx->trans);
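+ /* take the two atom locks in a fixed (address) order to
+ * avoid an AB-BA deadlock between concurrent fusers */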
74490+ if (atomf > atomh) {
74491+ spin_lock_atom_nested(atomf);
74492+ } else {
74493+ spin_unlock_atom(atomh);
74494+ spin_lock_atom(atomf);
74495+ spin_lock_atom_nested(atomh);
74496+ }
74497+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
74498+ release_two_atoms(atomf, atomh);
74499+ goto repeat;
74500+ }
74501+ atomic_dec(&atomh->refcount);
74502+ atomic_dec(&atomf->refcount);
74503+ capture_fuse_into(atomf, atomh);
74504+ goto repeat;
74505+ }
74506+ spin_unlock_zlock(&node->lock);
74507+ spin_unlock_atom(atomh);
74508+}
74509+
74510+/* This is the interface to capture unformatted nodes via their struct page
74511+ reference. Currently it is only used in reiser4_invalidatepage */
74512+int try_capture_page_to_invalidate(struct page *pg)
74513+{
74514+ int ret;
74515+ jnode *node;
74516+
74517+ assert("umka-292", pg != NULL);
74518+ assert("nikita-2597", PageLocked(pg));
74519+
74520+ if (IS_ERR(node = jnode_of_page(pg))) {
74521+ return PTR_ERR(node);
74522+ }
74523+
74524+ spin_lock_jnode(node);
74525+ unlock_page(pg);
74526+
74527+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
74528+ spin_unlock_jnode(node);
74529+ jput(node);
74530+ lock_page(pg);
74531+ return ret;
74532+}
74533+
74534+/* This informs the transaction manager when a node is deleted. Add the block to the
74535+ atom's delete set and uncapture the block.
74536+
74537+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
74538+explanations. find all the functions that use it, and unless there is some very
74539+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
74540+move the loop to inside the function.
74541+
74542+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
74543+ */
74544+void reiser4_uncapture_page(struct page *pg)
74545+{
74546+ jnode *node;
74547+ txn_atom *atom;
74548+
74549+ assert("umka-199", pg != NULL);
74550+ assert("nikita-3155", PageLocked(pg));
74551+
74552+ clear_page_dirty_for_io(pg);
74553+
74554+ reiser4_wait_page_writeback(pg);
74555+
74556+ node = jprivate(pg);
74557+ BUG_ON(node == NULL);
74558+
74559+ spin_lock_jnode(node);
74560+
74561+ atom = jnode_get_atom(node);
74562+ if (atom == NULL) {
74563+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74564+ spin_unlock_jnode(node);
74565+ return;
74566+ }
74567+
74568+ /* We can remove jnode from transaction even if it is on flush queue
74569+ * prepped list, we only need to be sure that flush queue is not being
74570+ * written by reiser4_write_fq(). reiser4_write_fq() does not use the
74571+ * atom spin lock to protect the prepped nodes list; instead, it
74572+ * increments the atom's nr_running_queues counter for the time when
74573+ * the prepped list is not protected by the spin lock. Here we check
74574+ * this counter if we want to remove the jnode from the flush queue
74575+ * and, if the counter is not zero, wait for all reiser4_write_fq()
74576+ * calls for this atom to complete. This is not significant overhead. */
74577+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
74578+ spin_unlock_jnode(node);
74579+ /*
74580+ * at this moment we want to wait for "atom event", viz. wait
74581+ * until @node can be removed from flush queue. But
74582+ * reiser4_atom_wait_event() cannot be called with page locked,
74583+ * because it deadlocks with jnode_extent_write(). Unlock page,
74584+ * after making sure (through page_cache_get()) that it cannot
74585+ * be released from memory.
74586+ */
74587+ page_cache_get(pg);
74588+ unlock_page(pg);
74589+ reiser4_atom_wait_event(atom);
74590+ lock_page(pg);
74591+ /*
74592+ * page may have been detached by ->writepage()->releasepage().
74593+ */
74594+ reiser4_wait_page_writeback(pg);
74595+ spin_lock_jnode(node);
74596+ page_cache_release(pg);
74597+ atom = jnode_get_atom(node);
74598+/* VS-FIXME-HANS: improve the commenting in this function */
74599+ if (atom == NULL) {
74600+ spin_unlock_jnode(node);
74601+ return;
74602+ }
74603+ }
74604+ reiser4_uncapture_block(node);
74605+ spin_unlock_atom(atom);
74606+ jput(node);
74607+}
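/*
 * The unlock/wait/relock dance above is a recurring pattern; a distilled
 * sketch with the actual event wait elided (the helper name below is
 * illustrative, not a reiser4 function):
 */
static void wait_with_page_unlocked(struct page *pg)
{
	/* pin the page so it cannot be freed while we sleep unlocked */
	page_cache_get(pg);
	unlock_page(pg);

	/* ... sleep here, e.g. reiser4_atom_wait_event(atom) ... */

	lock_page(pg);
	/* the page may have been detached from its jnode meanwhile, so
	 * callers must re-check their state after relocking */
	page_cache_release(pg);
}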
74608+
74609+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
74610+ * inode's tree of jnodes */
74611+void reiser4_uncapture_jnode(jnode * node)
74612+{
74613+ txn_atom *atom;
74614+
74615+ assert_spin_locked(&(node->guard));
74616+ assert("", node->pg == 0);
74617+
74618+ atom = jnode_get_atom(node);
74619+ if (atom == NULL) {
74620+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
74621+ spin_unlock_jnode(node);
74622+ return;
74623+ }
74624+
74625+ reiser4_uncapture_block(node);
74626+ spin_unlock_atom(atom);
74627+ jput(node);
74628+}
74629+
74630+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
74631+ increases atom refcount and txnh_count, adds to txnh_list. */
74632+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
74633+{
74634+ assert("umka-200", atom != NULL);
74635+ assert("umka-201", txnh != NULL);
74636+
74637+ assert_spin_locked(&(txnh->hlock));
74638+ assert_spin_locked(&(atom->alock));
74639+ assert("jmacd-824", txnh->atom == NULL);
74640+ assert("nikita-3540", atom_isopen(atom));
74641+ BUG_ON(txnh->atom != NULL);
74642+
74643+ atomic_inc(&atom->refcount);
74644+ txnh->atom = atom;
74645+ reiser4_ctx_gfp_mask_set();
74646+ list_add_tail(&txnh->txnh_link, &atom->txnh_list);
74647+ atom->txnh_count += 1;
74648+}
74649+
74650+/* No-locking version of assign_block. Sets the block's atom pointer, references the
74651+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
74652+static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
74653+{
74654+ assert("umka-202", atom != NULL);
74655+ assert("umka-203", node != NULL);
74656+ assert_spin_locked(&(node->guard));
74657+ assert_spin_locked(&(atom->alock));
74658+ assert("jmacd-323", node->atom == NULL);
74659+ BUG_ON(!list_empty_careful(&node->capture_link));
74660+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
74661+
74662+ /* Pointer from jnode to atom is not counted in atom->refcount. */
74663+ node->atom = atom;
74664+
74665+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
74666+ atom->capture_count += 1;
74667+ /* reference to jnode is acquired by atom. */
74668+ jref(node);
74669+
74670+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
74671+
74672+ LOCK_CNT_INC(t_refs);
74673+}
74674+
74675+/* common code for dirtying both unformatted jnodes and formatted znodes. */
74676+static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
74677+{
74678+ assert_spin_locked(&(node->guard));
74679+ assert_spin_locked(&(atom->alock));
74680+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
74681+
74682+ JF_SET(node, JNODE_DIRTY);
74683+
74684+ get_current_context()->nr_marked_dirty++;
74685+
74686+ /* We move one more block from grabbed to flush-reserved space
74687+ (grabbed2flush_reserved) only if the node was not CREATED and
74688+ jnode_flush sorted it into neither the relocate set nor the
74689+ overwrite set. If the node is in the overwrite or relocate set,
74690+ we assume the atom's flush reserved counter was already adjusted. */
74691+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
74692+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
74693+ && !jnode_is_cluster_page(node)) {
74694+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
74695+ assert("vs-1506", *jnode_get_block(node) != 0);
74696+ grabbed2flush_reserved_nolock(atom, (__u64) 1);
74697+ JF_SET(node, JNODE_FLUSH_RESERVED);
74698+ }
74699+
74700+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
74701+ /* If the node's atom is not set yet, the jnode will be added to
74702+ the appropriate list in capture_assign_block_nolock. The same
74703+ holds when a node is set dirty before being captured -- the
74704+ case for new jnodes. Another reason not to re-link the jnode
74705+ here is that it may be on a flush queue (see flush.c for
74706+ details). */
74707+
74708+ int level = jnode_get_level(node);
74709+
74710+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
74711+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
74712+ assert("nikita-2607", 0 <= level);
74713+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
74714+
74715+ /* move node to atom's dirty list */
74716+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
74717+ ON_DEBUG(count_jnode
74718+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
74719+ }
74720+}
74721+
74722+/* Set the dirty status for this (spin locked) jnode. */
74723+void jnode_make_dirty_locked(jnode * node)
74724+{
74725+ assert("umka-204", node != NULL);
74726+ assert_spin_locked(&(node->guard));
74727+
74728+ if (REISER4_DEBUG && rofs_jnode(node)) {
74729+ warning("nikita-3365", "Dirtying jnode on rofs");
74730+ dump_stack();
74731+ }
74732+
74733+ /* Fast check for already dirty node */
74734+ if (!JF_ISSET(node, JNODE_DIRTY)) {
74735+ txn_atom *atom;
74736+
74737+ atom = jnode_get_atom(node);
74738+ assert("vs-1094", atom);
74739+ /* Check jnode dirty status again because node spin lock might
74740+ * be released inside jnode_get_atom(). */
74741+ if (likely(!JF_ISSET(node, JNODE_DIRTY)))
74742+ do_jnode_make_dirty(node, atom);
74743+ spin_unlock_atom(atom);
74744+ }
74745+}
74746+
74747+/* Set the dirty status for this znode. */
74748+void znode_make_dirty(znode * z)
74749+{
74750+ jnode *node;
74751+ struct page *page;
74752+
74753+ assert("umka-204", z != NULL);
74754+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
74755+ assert("nikita-3560", znode_is_write_locked(z));
74756+
74757+ node = ZJNODE(z);
74758+ /* znode is longterm locked, we can check dirty bit without spinlock */
74759+ if (JF_ISSET(node, JNODE_DIRTY)) {
74760+ /* znode is dirty already. All we have to do is to bump the znode version */
74761+ z->version = znode_build_version(jnode_get_tree(node));
74762+ return;
74763+ }
74764+
74765+ spin_lock_jnode(node);
74766+ jnode_make_dirty_locked(node);
74767+ page = jnode_page(node);
74768+ if (page != NULL) {
74769+ /* this is a useful assertion (it allows one to check that no
74770+ * modifications are lost due to an update of an in-flight page),
74771+ * but it requires taking the page lock to check the PG_writeback
74772+ * bit. */
74773+ /* assert("nikita-3292",
74774+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
74775+ page_cache_get(page);
74776+
74777+ /* jnode lock is not needed for the rest of
74778+ * znode_make_dirty(). */
74779+ spin_unlock_jnode(node);
74780+ /* reiser4 file write code calls set_page_dirty for
74781+ * unformatted nodes; for formatted nodes we do it here. */
74782+ reiser4_set_page_dirty_internal(page);
74783+ page_cache_release(page);
74784+ /* bump version counter in znode */
74785+ z->version = znode_build_version(jnode_get_tree(node));
74786+ } else {
74787+ assert("zam-596", znode_above_root(JZNODE(node)));
74788+ spin_unlock_jnode(node);
74789+ }
74790+
74791+ assert("nikita-1900", znode_is_write_locked(z));
74792+ assert("jmacd-9777", node->atom != NULL);
74793+}
74794+
74795+int reiser4_sync_atom(txn_atom * atom)
74796+{
74797+ int result;
74798+ txn_handle *txnh;
74799+
74800+ txnh = get_current_context()->trans;
74801+
74802+ result = 0;
74803+ if (atom != NULL) {
74804+ if (atom->stage < ASTAGE_PRE_COMMIT) {
74805+ spin_lock_txnh(txnh);
74806+ capture_assign_txnh_nolock(atom, txnh);
74807+ result = force_commit_atom(txnh);
74808+ } else if (atom->stage < ASTAGE_POST_COMMIT) {
74809+ /* wait atom commit */
74810+ reiser4_atom_wait_event(atom);
74811+ /* try once more */
74812+ result = RETERR(-E_REPEAT);
74813+ } else
74814+ spin_unlock_atom(atom);
74815+ }
74816+ return result;
74817+}
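/*
 * A hypothetical caller sketch: reiser4_sync_atom() consumes the atom spin
 * lock and returns -E_REPEAT when the caller must look the atom up again
 * (it may have fused or advanced a stage), so callers typically loop:
 */
static int sync_current_atom(void)
{
	int ret;

	do {
		txn_atom *atom;

		atom = get_current_atom_locked_nocheck();
		ret = reiser4_sync_atom(atom);
	} while (ret == -E_REPEAT);
	return ret;
}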
74818+
74819+#if REISER4_DEBUG
74820+
74821+/* move jnode from one list to another;
74822+ call this after atom->capture_count is updated */
74823+void
74824+count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
74825+ atom_list new_list, int check_lists)
74826+{
74827+ struct list_head *pos;
74828+
74829+ assert("zam-1018", atom_is_protected(atom));
74830+ assert_spin_locked(&(node->guard));
74831+ assert("", NODE_LIST(node) == old_list);
74832+
74833+ switch (NODE_LIST(node)) {
74834+ case NOT_CAPTURED:
74835+ break;
74836+ case DIRTY_LIST:
74837+ assert("", atom->dirty > 0);
74838+ atom->dirty--;
74839+ break;
74840+ case CLEAN_LIST:
74841+ assert("", atom->clean > 0);
74842+ atom->clean--;
74843+ break;
74844+ case FQ_LIST:
74845+ assert("", atom->fq > 0);
74846+ atom->fq--;
74847+ break;
74848+ case WB_LIST:
74849+ assert("", atom->wb > 0);
74850+ atom->wb--;
74851+ break;
74852+ case OVRWR_LIST:
74853+ assert("", atom->ovrwr > 0);
74854+ atom->ovrwr--;
74855+ break;
74856+ default:
74857+ impossible("", "");
74858+ }
74859+
74860+ switch (new_list) {
74861+ case NOT_CAPTURED:
74862+ break;
74863+ case DIRTY_LIST:
74864+ atom->dirty++;
74865+ break;
74866+ case CLEAN_LIST:
74867+ atom->clean++;
74868+ break;
74869+ case FQ_LIST:
74870+ atom->fq++;
74871+ break;
74872+ case WB_LIST:
74873+ atom->wb++;
74874+ break;
74875+ case OVRWR_LIST:
74876+ atom->ovrwr++;
74877+ break;
74878+ default:
74879+ impossible("", "");
74880+ }
74881+ ASSIGN_NODE_LIST(node, new_list);
74882+ if (0 && check_lists) {
74883+ int count;
74884+ tree_level level;
74885+
74886+ count = 0;
74887+
74888+ /* flush queue list */
74889+ /* reiser4_check_fq(atom); */
74890+
74891+ /* dirty list */
74892+ count = 0;
74893+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
74894+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
74895+ count++;
74896+ }
74897+ if (count != atom->dirty)
74898+ warning("", "dirty counter %d, real %d\n", atom->dirty,
74899+ count);
74900+
74901+ /* clean list */
74902+ count = 0;
74903+ list_for_each(pos, ATOM_CLEAN_LIST(atom))
74904+ count++;
74905+ if (count != atom->clean)
74906+ warning("", "clean counter %d, real %d\n", atom->clean,
74907+ count);
74908+
74909+ /* wb list */
74910+ count = 0;
74911+ list_for_each(pos, ATOM_WB_LIST(atom))
74912+ count++;
74913+ if (count != atom->wb)
74914+ warning("", "wb counter %d, real %d\n", atom->wb,
74915+ count);
74916+
74917+ /* overwrite list */
74918+ count = 0;
74919+ list_for_each(pos, ATOM_OVRWR_LIST(atom))
74920+ count++;
74921+
74922+ if (count != atom->ovrwr)
74923+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
74924+ count);
74925+ }
74926+ assert("vs-1624", atom->num_queued == atom->fq);
74927+ if (atom->capture_count !=
74928+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
74929+ printk
74930+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
74931+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
74932+ atom->wb, atom->fq);
74933+ assert("vs-1622",
74934+ atom->capture_count ==
74935+ atom->dirty + atom->clean + atom->ovrwr + atom->wb +
74936+ atom->fq);
74937+ }
74938+}
74939+
74940+#endif
74941+
74942+/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode
74943+ * lock should be taken before calling this function. */
74944+void jnode_make_wander_nolock(jnode * node)
74945+{
74946+ txn_atom *atom;
74947+
74948+ assert("nikita-2431", node != NULL);
74949+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
74950+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
74951+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
74952+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
74953+
74954+ atom = node->atom;
74955+
74956+ assert("zam-895", atom != NULL);
74957+ assert("zam-894", atom_is_protected(atom));
74958+
74959+ JF_SET(node, JNODE_OVRWR);
74960+ /* move node to atom's overwrite list */
74961+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
74962+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
74963+}
74964+
74965+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
74966+ * this function. */
74967+void jnode_make_wander(jnode * node)
74968+{
74969+ txn_atom *atom;
74970+
74971+ spin_lock_jnode(node);
74972+ atom = jnode_get_atom(node);
74973+ assert("zam-913", atom != NULL);
74974+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
74975+
74976+ jnode_make_wander_nolock(node);
74977+ spin_unlock_atom(atom);
74978+ spin_unlock_jnode(node);
74979+}
74980+
74981+/* this just sets RELOC bit */
74982+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
74983+{
74984+ assert_spin_locked(&(node->guard));
74985+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
74986+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
74987+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
74988+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
74989+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
74990+ jnode_set_reloc(node);
74991+}
74992+
74993+/* Make znode RELOC and put it on flush queue */
74994+void znode_make_reloc(znode * z, flush_queue_t * fq)
74995+{
74996+ jnode *node;
74997+ txn_atom *atom;
74998+
74999+ node = ZJNODE(z);
75000+ spin_lock_jnode(node);
75001+
75002+ atom = jnode_get_atom(node);
75003+ assert("zam-919", atom != NULL);
75004+
75005+ jnode_make_reloc_nolock(fq, node);
75006+ queue_jnode(fq, node);
75007+
75008+ spin_unlock_atom(atom);
75009+ spin_unlock_jnode(node);
75010+
75011+}
75012+
75013+/* Make unformatted node RELOC and put it on flush queue */
75014+void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75015+{
75016+ assert("vs-1479", jnode_is_unformatted(node));
75017+
75018+ jnode_make_reloc_nolock(fq, node);
75019+ queue_jnode(fq, node);
75020+}
75021+
75022+int reiser4_capture_super_block(struct super_block *s)
75023+{
75024+ int result;
75025+ znode *uber;
75026+ lock_handle lh;
75027+
75028+ init_lh(&lh);
75029+ result = get_uber_znode(reiser4_get_tree(s),
75030+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75031+ if (result)
75032+ return result;
75033+
75034+ uber = lh.node;
75035+ /* Grabbing one block for superblock */
75036+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75037+ if (result != 0)
75038+ goto out; /* do not leak the uber znode lock */
75039+
75040+ znode_make_dirty(uber);
75041+ out:
75042+ done_lh(&lh);
75043+ return result;
75044+}
75045+
75046+/* Wakeup every handle on the atom's WAITFOR list */
75047+static void wakeup_atom_waitfor_list(txn_atom * atom)
75048+{
75049+ txn_wait_links *wlinks;
75050+
75051+ assert("umka-210", atom != NULL);
75052+
75053+ /* atom is locked */
75054+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75055+ if (wlinks->waitfor_cb == NULL ||
75056+ wlinks->waitfor_cb(atom, wlinks))
75057+ /* Wake up. */
75058+ reiser4_wake_up(wlinks->_lock_stack);
75059+ }
75060+}
75061+
75062+/* Wakeup every handle on the atom's WAITING list */
75063+static void wakeup_atom_waiting_list(txn_atom * atom)
75064+{
75065+ txn_wait_links *wlinks;
75066+
75067+ assert("umka-211", atom != NULL);
75068+
75069+ /* atom is locked */
75070+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75071+ if (wlinks->waiting_cb == NULL ||
75072+ wlinks->waiting_cb(atom, wlinks))
75073+ /* Wake up. */
75074+ reiser4_wake_up(wlinks->_lock_stack);
75075+ }
75076+}
75077+
75078+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75079+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75080+{
75081+ assert("nikita-3330", atom != NULL);
75082+ assert_spin_locked(&(atom->alock));
75083+
75084+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75085+ * last transaction handle. */
75086+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75087+}
75088+
75089+/* The general purpose of this function is to wait on the first of two possible events.
75090+ The situation is that a handle (and its atom atomh) is blocked trying to capture a
75091+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75092+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75093+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75094+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75095+ proceed and fuse the two atoms in the CAPTURE_WAIT state.
75096+
75097+ In other words, if either atomh or atomf change state, the handle will be awakened,
75098+ thus there are two lists per atom: WAITING and WAITFOR.
75099+
75100+ This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf
75101+ to close when the handle is not yet assigned to an atom of its own.
75102+
75103+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75104+ BOTH_ATOM_LOCKS. Result: all four locks are released.
75105+*/
75106+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75107+ txn_atom * atomh, txn_capture mode)
75108+{
75109+ int ret;
75110+ txn_wait_links wlinks;
75111+
75112+ assert("umka-213", txnh != NULL);
75113+ assert("umka-214", atomf != NULL);
75114+
75115+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75116+ spin_unlock_txnh(txnh);
75117+ spin_unlock_atom(atomf);
75118+
75119+ if (atomh) {
75120+ spin_unlock_atom(atomh);
75121+ }
75122+
75123+ return RETERR(-E_BLOCK);
75124+ }
75125+
75126+ /* Initialize the waiting list links. */
75127+ init_wlinks(&wlinks);
75128+
75129+ /* Add txnh to atomf's waitfor list, unlock atomf. */
75130+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75131+ wlinks.waitfor_cb = wait_for_fusion;
75132+ atomic_inc(&atomf->refcount);
75133+ spin_unlock_atom(atomf);
75134+
75135+ if (atomh) {
75136+ /* Add txnh to atomh's waiting list, unlock atomh. */
75137+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75138+ atomic_inc(&atomh->refcount);
75139+ spin_unlock_atom(atomh);
75140+ }
75141+
75142+ /* Go to sleep. */
75143+ spin_unlock_txnh(txnh);
75144+
75145+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
75146+ if (ret == 0) {
75147+ reiser4_go_to_sleep(wlinks._lock_stack);
75148+ ret = RETERR(-E_REPEAT);
75149+ }
75150+
75151+ /* Remove from the waitfor list. */
75152+ spin_lock_atom(atomf);
75153+
75154+ list_del(&wlinks._fwaitfor_link);
75155+ atom_dec_and_unlock(atomf);
75156+
75157+ if (atomh) {
75158+ /* Remove from the waiting list. */
75159+ spin_lock_atom(atomh);
75160+ list_del(&wlinks._fwaiting_link);
75161+ atom_dec_and_unlock(atomh);
75162+ }
75163+ return ret;
75164+}
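/*
 * Generic shape of the "wait on the first of two events" idea above,
 * expressed with plain waitqueues instead of reiser4 lock stacks and wait
 * links (illustrative only): registering on both queues before sleeping
 * guarantees a wakeup from either side is not lost.
 */
static void wait_two_events(wait_queue_head_t *first,
			    wait_queue_head_t *second, int (*done)(void))
{
	DEFINE_WAIT(w1);
	DEFINE_WAIT(w2);

	prepare_to_wait(first, &w1, TASK_UNINTERRUPTIBLE);
	prepare_to_wait(second, &w2, TASK_UNINTERRUPTIBLE);
	if (!done())
		schedule();
	finish_wait(first, &w1);
	finish_wait(second, &w2);
}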
75165+
75166+static void lock_two_atoms(txn_atom * one, txn_atom * two)
75167+{
75168+ assert("zam-1067", one != two);
75169+
75170+ /* lock the atom with lesser address first */
75171+ if (one < two) {
75172+ spin_lock_atom(one);
75173+ spin_lock_atom_nested(two);
75174+ } else {
75175+ spin_lock_atom(two);
75176+ spin_lock_atom_nested(one);
75177+ }
75178+}
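/*
 * The address comparison above is the classic deadlock-avoidance rule for
 * taking two locks of the same class. The same idea for an arbitrary pair
 * of objects (hypothetical struct, illustrative only):
 */
struct two_lock_demo {
	spinlock_t lock;
};

static void lock_pair(struct two_lock_demo *a, struct two_lock_demo *b)
{
	/* every thread takes the lower-addressed lock first, so no two
	 * threads can each hold one lock of the pair while waiting for
	 * the other */
	if (a < b) {
		spin_lock(&a->lock);
		spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&b->lock);
		spin_lock_nested(&a->lock, SINGLE_DEPTH_NESTING);
	}
}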
75179+
75180+/* Perform the necessary work to prepare for fusing two atoms, which involves
75181+ * acquiring two atom locks in the proper order. If the node's atom is
75182+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75183+ * atom is not, then the handle's request is put to sleep. If the node's atom
75184+ * is committing, then the node can be copy-on-captured. Otherwise, pick the
75185+ * atom with fewer pointers to be fused into the atom with more pointers and
75186+ * call capture_fuse_into.
75187+ */
75188+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75189+{
75190+ txn_atom * txnh_atom = txnh->atom;
75191+ txn_atom * block_atom = node->atom;
75192+
75193+ atomic_inc(&txnh_atom->refcount);
75194+ atomic_inc(&block_atom->refcount);
75195+
75196+ spin_unlock_txnh(txnh);
75197+ spin_unlock_jnode(node);
75198+
75199+ lock_two_atoms(txnh_atom, block_atom);
75200+
75201+ if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75202+ release_two_atoms(txnh_atom, block_atom);
75203+ return RETERR(-E_REPEAT);
75204+ }
75205+
75206+ atomic_dec(&txnh_atom->refcount);
75207+ atomic_dec(&block_atom->refcount);
75208+
75209+ assert ("zam-1066", atom_isopen(txnh_atom));
75210+
75211+ if (txnh_atom->stage >= block_atom->stage ||
75212+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
75213+ capture_fuse_into(txnh_atom, block_atom);
75214+ return RETERR(-E_REPEAT);
75215+ }
75216+ spin_lock_txnh(txnh);
75217+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
75218+}
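/*
 * The ->atom pointer re-checks above follow a general rule: state read
 * under a lock that was then dropped must be revalidated once the locks
 * are re-acquired. Distilled into a hypothetical helper (it assumes the
 * caller pinned @expected with a refcount, as capture_init_fusion does):
 */
static int relock_and_revalidate(jnode *node, txn_atom *expected)
{
	spin_lock_atom(expected);
	if (node->atom != expected) {
		/* raced with fusion; the caller must restart from scratch */
		spin_unlock_atom(expected);
		return RETERR(-E_REPEAT);
	}
	return 0;		/* still valid, atom lock held */
}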
75219+
75220+/* This function splices together two jnode lists (small and large) and sets all jnodes in
75221+ the small list to point to the large atom. Returns the length of the list. */
75222+static int
75223+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
75224+ struct list_head *small_head)
75225+{
75226+ int count = 0;
75227+ jnode *node;
75228+
75229+ assert("umka-218", large != NULL);
75230+ assert("umka-219", large_head != NULL);
75231+ assert("umka-220", small_head != NULL);
75232+ /* small atom should be locked also. */
75233+ assert_spin_locked(&(large->alock));
75234+
75235+ /* For every jnode on small's capture list... */
75236+ list_for_each_entry(node, small_head, capture_link) {
75237+ count += 1;
75238+
75239+ /* With the jnode lock held, update atom pointer. */
75240+ spin_lock_jnode(node);
75241+ node->atom = large;
75242+ spin_unlock_jnode(node);
75243+ }
75244+
75245+ /* Splice the lists. */
75246+ list_splice_init(small_head, large_head->prev);
75247+
75248+ return count;
75249+}
75250+
75251+/* This function splices together two txnh lists (small and large) and sets all txn handles in
75252+ the small list to point to the large atom. Returns the length of the list. */
75253+static int
75254+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
75255+ struct list_head *small_head)
75256+{
75257+ int count = 0;
75258+ txn_handle *txnh;
75259+
75260+ assert("umka-221", large != NULL);
75261+ assert("umka-222", large_head != NULL);
75262+ assert("umka-223", small_head != NULL);
75263+
75264+ /* Adjust every txnh to the new atom. */
75265+ list_for_each_entry(txnh, small_head, txnh_link) {
75266+ count += 1;
75267+
75268+ /* With the txnh lock held, update atom pointer. */
75269+ spin_lock_txnh(txnh);
75270+ txnh->atom = large;
75271+ spin_unlock_txnh(txnh);
75272+ }
75273+
75274+ /* Splice the txn_handle list. */
75275+ list_splice_init(small_head, large_head->prev);
75276+
75277+ return count;
75278+}
75279+
75280+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
75281+ added to LARGE and their ->atom pointers are all updated. The associated counts are
75282+ updated as well, and any waiting handles belonging to either are awakened. Finally the
75283+ smaller atom's refcount is decremented.
75284+*/
75285+static void capture_fuse_into(txn_atom * small, txn_atom * large)
75286+{
75287+ int level;
75288+ unsigned zcount = 0;
75289+ unsigned tcount = 0;
75290+
75291+ assert("umka-224", small != NULL);
75292+ assert("umka-225", small != NULL);
75293+
75294+ assert_spin_locked(&(large->alock));
75295+ assert_spin_locked(&(small->alock));
75296+
75297+ assert("jmacd-201", atom_isopen(small));
75298+ assert("jmacd-202", atom_isopen(large));
75299+
75300+ /* Splice and update the per-level dirty jnode lists */
75301+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75302+ zcount +=
75303+ capture_fuse_jnode_lists(large,
75304+ ATOM_DIRTY_LIST(large, level),
75305+ ATOM_DIRTY_LIST(small, level));
75306+ }
75307+
75308+ /* Splice and update the [clean,dirty] jnode and txnh lists */
75309+ zcount +=
75310+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
75311+ ATOM_CLEAN_LIST(small));
75312+ zcount +=
75313+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
75314+ ATOM_OVRWR_LIST(small));
75315+ zcount +=
75316+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
75317+ ATOM_WB_LIST(small));
75318+ zcount +=
75319+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
75320+ tcount +=
75321+ capture_fuse_txnh_lists(large, &large->txnh_list,
75322+ &small->txnh_list);
75323+
75324+ /* Check our accounting. */
75325+ assert("jmacd-1063",
75326+ zcount + small->num_queued == small->capture_count);
75327+ assert("jmacd-1065", tcount == small->txnh_count);
75328+
75329+ /* sum the numbers of waiting threads */
75330+ large->nr_waiters += small->nr_waiters;
75331+ small->nr_waiters = 0;
75332+
75333+ /* splice flush queues */
75334+ reiser4_fuse_fq(large, small);
75335+
75336+ /* update the per-list jnode counters of both atoms */
75337+ ON_DEBUG(large->dirty += small->dirty;
75338+ small->dirty = 0;
75339+ large->clean += small->clean;
75340+ small->clean = 0;
75341+ large->ovrwr += small->ovrwr;
75342+ small->ovrwr = 0;
75343+ large->wb += small->wb;
75344+ small->wb = 0;
75345+ large->fq += small->fq;
75346+ small->fq = 0;);
75347+
75348+ /* count flushers in result atom */
75349+ large->nr_flushers += small->nr_flushers;
75350+ small->nr_flushers = 0;
75351+
75352+ /* update counts of flushed nodes */
75353+ large->flushed += small->flushed;
75354+ small->flushed = 0;
75355+
75356+ /* Transfer list counts to large. */
75357+ large->txnh_count += small->txnh_count;
75358+ large->capture_count += small->capture_count;
75359+
75360+ /* Add all txnh references to large. */
75361+ atomic_add(small->txnh_count, &large->refcount);
75362+ atomic_sub(small->txnh_count, &small->refcount);
75363+
75364+ /* Reset small counts */
75365+ small->txnh_count = 0;
75366+ small->capture_count = 0;
75367+
75368+ /* Assign the oldest start_time, merge flags. */
75369+ large->start_time = min(large->start_time, small->start_time);
75370+ large->flags |= small->flags;
75371+
75372+ /* Merge blocknr sets. */
75373+ blocknr_set_merge(&small->delete_set, &large->delete_set);
75374+ blocknr_set_merge(&small->wandered_map, &large->wandered_map);
75375+
75376+ /* Merge allocated/deleted file counts */
75377+ large->nr_objects_deleted += small->nr_objects_deleted;
75378+ large->nr_objects_created += small->nr_objects_created;
75379+
75380+ small->nr_objects_deleted = 0;
75381+ small->nr_objects_created = 0;
75382+
75383+ /* Merge allocated blocks counts */
75384+ large->nr_blocks_allocated += small->nr_blocks_allocated;
75385+
75386+ large->nr_running_queues += small->nr_running_queues;
75387+ small->nr_running_queues = 0;
75388+
75389+ /* Merge blocks reserved for overwrite set. */
75390+ large->flush_reserved += small->flush_reserved;
75391+ small->flush_reserved = 0;
75392+
75393+ if (large->stage < small->stage) {
75394+ /* Large only needs to notify if it has changed state. */
75395+ reiser4_atom_set_stage(large, small->stage);
75396+ wakeup_atom_waiting_list(large);
75397+ }
75398+
75399+ reiser4_atom_set_stage(small, ASTAGE_INVALID);
75400+
75401+ /* Notify any waiters--small needs to unload its wait lists. Waiters
75402+ actually remove themselves from the list before returning from the
75403+ fuse_wait function. */
75404+ wakeup_atom_waiting_list(small);
75405+
75406+ /* Unlock atoms */
75407+ spin_unlock_atom(large);
75408+ atom_dec_and_unlock(small);
75409+}
75410+
75411+/* TXNMGR STUFF */
75412+
75413+/* Release a block from the atom, reversing the effects of being captured;
75414+ do not release the atom's reference to the jnode here, because spin-locks
75415+ are held. Currently this is only called when the atom commits.
75416+
75417+ NOTE: this function does not release a (journal) reference to the jnode,
75418+ due to locking optimizations; you should call jput() somewhere after
75419+ calling reiser4_uncapture_block(). */
75420+void reiser4_uncapture_block(jnode * node)
75421+{
75422+ txn_atom *atom;
75423+
75424+ assert("umka-226", node != NULL);
75425+ atom = node->atom;
75426+ assert("umka-228", atom != NULL);
75427+
75428+ assert("jmacd-1021", node->atom == atom);
75429+ assert_spin_locked(&(node->guard));
75430+ assert("jmacd-1023", atom_is_protected(atom));
75431+
75432+ JF_CLR(node, JNODE_DIRTY);
75433+ JF_CLR(node, JNODE_RELOC);
75434+ JF_CLR(node, JNODE_OVRWR);
75435+ JF_CLR(node, JNODE_CREATED);
75436+ JF_CLR(node, JNODE_WRITEBACK);
75437+ JF_CLR(node, JNODE_REPACK);
75438+
75439+ list_del_init(&node->capture_link);
75440+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75441+ assert("zam-925", atom_isopen(atom));
75442+ assert("vs-1623", NODE_LIST(node) == FQ_LIST);
75443+ ON_DEBUG(atom->num_queued--);
75444+ JF_CLR(node, JNODE_FLUSH_QUEUED);
75445+ }
75446+ atom->capture_count -= 1;
75447+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
75448+ node->atom = NULL;
75449+
75450+ spin_unlock_jnode(node);
75451+ LOCK_CNT_DEC(t_refs);
75452+}
75453+
75454+/* Unconditional insert of jnode into atom's overwrite list. Currently used in
75455+ bitmap-based allocator code for adding modified bitmap blocks to the
75456+ transaction. @atom and @node are spin locked */
75457+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
75458+{
75459+ assert("zam-538", atom_is_protected(atom));
75460+ assert_spin_locked(&(node->guard));
75461+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
75462+ assert("zam-543", node->atom == NULL);
75463+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
75464+
75465+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
75466+ jref(node);
75467+ node->atom = atom;
75468+ atom->capture_count++;
75469+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
75470+}
75471+
75472+static int count_deleted_blocks_actor(txn_atom * atom,
75473+ const reiser4_block_nr * a,
75474+ const reiser4_block_nr * b, void *data)
75475+{
75476+ reiser4_block_nr *counter = data;
75477+
75478+ assert("zam-995", data != NULL);
75479+ assert("zam-996", a != NULL);
75480+ if (b == NULL)
75481+ *counter += 1;
75482+ else
75483+ *counter += *b;
75484+ return 0;
75485+}
75486+
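/*
 * Another actor, to illustrate the iterator contract used just below
 * (illustrative only; the full calling convention is documented at
 * blocknrset.c:blocknr_set_add): @a is an extent start and @b is its
 * length, or NULL when the entry is a single block.
 */
static int print_extent_actor(txn_atom * atom, const reiser4_block_nr * a,
			      const reiser4_block_nr * b, void *data)
{
	printk("extent at %llu, len %llu\n", (unsigned long long)*a,
	       (unsigned long long)(b ? *b : 1));
	return 0;	/* keep iterating */
}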
75487+reiser4_block_nr txnmgr_count_deleted_blocks(void)
75488+{
75489+ reiser4_block_nr result;
75490+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
75491+ txn_atom *atom;
75492+
75493+ result = 0;
75494+
75495+ spin_lock_txnmgr(tmgr);
75496+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
75497+ spin_lock_atom(atom);
75498+ if (atom_isopen(atom))
75499+ blocknr_set_iterator(
75500+ atom, &atom->delete_set,
75501+ count_deleted_blocks_actor, &result, 0);
75502+ spin_unlock_atom(atom);
75503+ }
75504+ spin_unlock_txnmgr(tmgr);
75505+
75506+ return result;
75507+}
75508+
75509+/*
75510+ * Local variables:
75511+ * c-indentation-style: "K&R"
75512+ * mode-name: "LC"
75513+ * c-basic-offset: 8
75514+ * tab-width: 8
75515+ * fill-column: 79
75516+ * End:
75517+ */
75518diff -urN linux-2.6.22.orig/fs/reiser4/txnmgr.h linux-2.6.22/fs/reiser4/txnmgr.h
75519--- linux-2.6.22.orig/fs/reiser4/txnmgr.h 1970-01-01 03:00:00.000000000 +0300
75520+++ linux-2.6.22/fs/reiser4/txnmgr.h 2007-07-29 00:25:35.044739961 +0400
75521@@ -0,0 +1,701 @@
75522+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75523+ * reiser4/README */
75524+
75525+/* data-types and function declarations for transaction manager. See txnmgr.c
75526+ * for details. */
75527+
75528+#ifndef __REISER4_TXNMGR_H__
75529+#define __REISER4_TXNMGR_H__
75530+
75531+#include "forward.h"
75532+#include "dformat.h"
75533+
75534+#include <linux/fs.h>
75535+#include <linux/mm.h>
75536+#include <linux/types.h>
75537+#include <linux/spinlock.h>
75538+#include <asm/atomic.h>
75539+#include <linux/wait.h>
75540+
75541+/* TYPE DECLARATIONS */
75542+
75543+/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
75544+ A capture request dynamically assigns a block to the calling thread's transaction
75545+ handle. */
75546+typedef enum {
75547+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's
75548+ atom should fuse in order to ensure that the block commits atomically with the
75549+ caller. */
75550+ TXN_CAPTURE_READ_ATOMIC = (1 << 0),
75551+
75552+ /* A READ_NONCOM request indicates that a block will be read and that the caller is
75553+ willing to read a non-committed block without causing atoms to fuse. */
75554+ TXN_CAPTURE_READ_NONCOM = (1 << 1),
75555+
75556+ /* A READ_MODIFY request indicates that a block will be read but that the caller
75557+ wishes for the block to be captured as it will be written. This capture request
75558+ mode is not currently used, but eventually it will be useful for preventing
75559+ deadlock in read-modify-write cycles. */
75560+ TXN_CAPTURE_READ_MODIFY = (1 << 2),
75561+
75562+ /* A WRITE capture request indicates that a block will be modified and that atoms
75563+ should fuse to make the commit atomic. */
75564+ TXN_CAPTURE_WRITE = (1 << 3),
75565+
75566+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
75567+ exclusive type designation from extra bits that may be supplied -- see
75568+ below. */
75569+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
75570+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
75571+ TXN_CAPTURE_WRITE),
75572+
75573+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
75574+ indicate modification will occur. */
75575+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
75576+
75577+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
75578+ prefer not to sleep waiting for an aging atom to commit. */
75579+ TXN_CAPTURE_NONBLOCKING = (1 << 4),
75580+
75581+ /* An option to reiser4_try_capture to prevent atom fusion; only
75582+ simple capturing is allowed */
75583+ TXN_CAPTURE_DONT_FUSE = (1 << 5)
75584+
75585+ /* This macro selects only the exclusive capture request types, stripping out any
75586+ options that were supplied (i.e., NONBLOCKING). */
75587+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
75588+} txn_capture;
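/*
 * How these bits combine (illustrative): a request is one exclusive type
 * from TXN_CAPTURE_TYPES, possibly OR-ed with option bits, and
 * CAPTURE_TYPE() strips the options back off.
 */
static inline void txn_capture_bits_demo(void)
{
	txn_capture cap = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;

	assert("", CAPTURE_TYPE(cap) == TXN_CAPTURE_WRITE);
	assert("", (cap & TXN_CAPTURE_WTYPES) != 0);	/* modifying request */
}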
75589+
75590+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
75591+ difference is in the handling of read requests. A WRITE_FUSING transaction handle
75592+ defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSIONG
75593+ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
75594+typedef enum {
75595+ TXN_WRITE_FUSING = (1 << 0),
75596+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
75597+} txn_mode;
75598+
75599+/* Every atom has a stage, which is one of these exclusive values: */
75600+typedef enum {
75601+ /* Initially an atom is free. */
75602+ ASTAGE_FREE = 0,
75603+
75604+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
75605+ blocks and fuse with other atoms. */
75606+ ASTAGE_CAPTURE_FUSE = 1,
75607+
75608+ /* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
75609+
75610+ /* When an atom reaches a certain age it must do all it can to commit. An atom in
75611+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
75612+ atoms in the CAPTURE_FUSE stage. */
75613+ ASTAGE_CAPTURE_WAIT = 2,
75614+
75615+ /* Waiting for I/O before commit. Copy-on-capture (see
75616+ http://namesys.com/v4/v4.html). */
75617+ ASTAGE_PRE_COMMIT = 3,
75618+
75619+ /* Post-commit overwrite I/O. Steal-on-capture. */
75620+ ASTAGE_POST_COMMIT = 4,
75621+
75622+ /* Atom which waits for the last reference to it to be removed
75623+ * before being deleted from memory */
75624+ ASTAGE_DONE = 5,
75625+
75626+ /* invalid atom. */
75627+ ASTAGE_INVALID = 6,
75628+
75629+} txn_stage;
75630+
75631+/* Certain flags may be set in the txn_atom->flags field. */
75632+typedef enum {
75633+ /* Indicates that the atom should commit as soon as possible. */
75634+ ATOM_FORCE_COMMIT = (1 << 0),
75635+ /* to avoid endless loop, mark the atom (which was considered as too
75636+ * small) after failed attempt to fuse it. */
75637+ ATOM_CANCEL_FUSION = (1 << 1)
75638+} txn_flags;
75639+
75640+/* Flags for controlling commit_txnh */
75641+typedef enum {
75642+ /* Wait commit atom completion in commit_txnh */
75643+ TXNH_WAIT_COMMIT = 0x2,
75644+ /* Don't commit atom when this handle is closed */
75645+ TXNH_DONT_COMMIT = 0x4
75646+} txn_handle_flags_t;
75647+
75648+/* TYPE DEFINITIONS */
75649+
75650+/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
75651+ fields, so typically an operation on the atom through either of these objects must (1)
75652+ lock the object, (2) read the atom pointer, (3) lock the atom.
75653+
75654+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
75655+ through the list of handles and pages held by the smaller of the two atoms. For each
75656+ handle and page referencing the smaller atom, the fusing process must: (1) lock the
75657+ object, and (2) update the atom pointer.
75658+
75659+ You can see that there is a conflict of lock ordering here, so the more-complex
75660+ procedure should have priority, i.e., the fusing process has priority so that it is
75661+ guaranteed to make progress and to avoid restarts.
75662+
75663+ This decision, however, means additional complexity for acquiring the atom lock in the
75664+ first place.
75665+
75666+ The general original procedure followed in the code was:
75667+
75668+ TXN_OBJECT *obj = ...;
75669+ TXN_ATOM *atom;
75670+
75671+ spin_lock (& obj->_lock);
75672+
75673+ atom = obj->_atom;
75674+
75675+ if (! spin_trylock_atom (atom))
75676+ {
75677+ spin_unlock (& obj->_lock);
75678+ RESTART OPERATION, THERE WAS A RACE;
75679+ }
75680+
75681+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75682+
75683+ It has however been found that this wastes CPU a lot in a manner that is
75684+ hard to profile. So, proper refcounting was added to atoms, and new
75685+ standard locking sequence is like following:
75686+
75687+ TXN_OBJECT *obj = ...;
75688+ TXN_ATOM *atom;
75689+
75690+ spin_lock (& obj->_lock);
75691+
75692+ atom = obj->_atom;
75693+
75694+ if (! spin_trylock_atom (atom))
75695+ {
75696+ atomic_inc (& atom->refcount);
75697+ spin_unlock (& obj->_lock);
75698+ spin_lock (&atom->_lock);
75699+ atomic_dec (& atom->refcount);
75700+ // HERE atom is locked
75701+ spin_unlock (&atom->_lock);
75702+ RESTART OPERATION, THERE WAS A RACE;
75703+ }
75704+
75705+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
75706+
75707+ (core of this is implemented in trylock_throttle() function)
75708+
75709+ See the jnode_get_atom() function for a common case.
75710+
75711+ As an additional (and important) optimization allowing to avoid restarts,
75712+ it is possible to re-check required pre-conditions at the HERE point in
75713+ code above and proceed without restarting if they are still satisfied.
75714+*/
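/*
 * A compact sketch of the refcounting protocol described above, using the
 * raw spinlock calls from the pseudocode rather than the reiser4 wrappers
 * (see jnode_get_atom() for the real, fully-checked version):
 */
static inline txn_atom *lock_node_and_atom_sketch(jnode *node)
{
	txn_atom *atom;

	for (;;) {
		spin_lock(&node->guard);
		atom = node->atom;
		if (atom == NULL || spin_trylock(&atom->alock))
			return atom;	/* node locked; atom locked too if non-NULL */

		/* race with fusion: pin the atom, wait for its lock, retry */
		atomic_inc(&atom->refcount);
		spin_unlock(&node->guard);
		spin_lock(&atom->alock);
		/* HERE the atom is locked; a pre-condition re-check could
		 * let us proceed without restarting */
		atomic_dec(&atom->refcount);
		spin_unlock(&atom->alock);
	}
}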
75715+
75716+/* An atomic transaction: this is the underlying system representation
75717+ of a transaction, not the one seen by clients.
75718+
75719+ Invariants involving this data-type:
75720+
75721+ [sb-fake-allocated]
75722+*/
75723+struct txn_atom {
75724+ /* The spinlock protecting the atom, held during fusion and various other state
75725+ changes. */
75726+ spinlock_t alock;
75727+
75728+ /* The atom's reference counter, increasing (in case of a duplication
75729+ of an existing reference or when we are sure that some other
75730+ reference exists) may be done without taking spinlock, decrementing
75731+ of the ref. counter requires a spinlock to be held.
75732+
75733+ Each transaction handle counts in ->refcount. All jnodes count as
75734+ one reference acquired in atom_begin_andlock(), released in
75735+ commit_current_atom().
75736+ */
75737+ atomic_t refcount;
75738+
75739+ /* The atom_id identifies the atom in persistent records such as the log. */
75740+ __u32 atom_id;
75741+
75742+ /* Flags holding any of the txn_flags enumerated values (e.g.,
75743+ ATOM_FORCE_COMMIT). */
75744+ __u32 flags;
75745+
75746+ /* Number of open handles. */
75747+ __u32 txnh_count;
75748+
75749+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the
75750+ dirty_nodes[level] and clean_nodes lists. */
75751+ __u32 capture_count;
75752+
75753+#if REISER4_DEBUG
75754+ int clean;
75755+ int dirty;
75756+ int ovrwr;
75757+ int wb;
75758+ int fq;
75759+#endif
75760+
75761+ __u32 flushed;
75762+
75763+ /* Current transaction stage. */
75764+ txn_stage stage;
75765+
75766+ /* Start time. */
75767+ unsigned long start_time;
75768+
75769+ /* The atom's delete set. It collects block numbers of the nodes
75770+ which were deleted during the transaction. */
75771+ struct list_head delete_set;
75772+
75773+ /* The atom's wandered_block mapping. */
75774+ struct list_head wandered_map;
75775+
75776+ /* The transaction's list of dirty captured nodes--per level. Index
75777+ by (level). dirty_nodes[0] is for znode-above-root */
75778+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
75779+
75780+ /* The transaction's list of clean captured nodes. */
75781+ struct list_head clean_nodes;
75782+
75783+ /* The atom's overwrite set */
75784+ struct list_head ovrwr_nodes;
75785+
75786+ /* nodes which are being written to disk */
75787+ struct list_head writeback_nodes;
75788+
75789+ /* list of inodes */
75790+ struct list_head inodes;
75791+
75792+ /* List of handles associated with this atom. */
75793+ struct list_head txnh_list;
75794+
75795+ /* Transaction list link: list of atoms in the transaction manager. */
75796+ struct list_head atom_link;
75797+
75798+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
75799+ struct list_head fwaitfor_list;
75800+
75801+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
75802+ struct list_head fwaiting_list;
75803+
75804+ /* Numbers of objects which were deleted/created in this transaction
75805+ thereby numbers of objects IDs which were released/deallocated. */
75806+ int nr_objects_deleted;
75807+ int nr_objects_created;
75808+ /* number of blocks allocated during the transaction */
75809+ __u64 nr_blocks_allocated;
75810+ /* All atom's flush queue objects are on this list */
75811+ struct list_head flush_queues;
75812+#if REISER4_DEBUG
75813+ /* number of flush queues for this atom. */
75814+ int nr_flush_queues;
75815+ /* Number of jnodes which were removed from atom's lists and put
75816+ on flush_queue */
75817+ int num_queued;
75818+#endif
75819+ /* number of threads who wait for this atom to complete commit */
75820+ int nr_waiters;
75821+ /* number of threads which do jnode_flush() over this atom */
75822+ int nr_flushers;
75823+ /* number of flush queues which are IN_USE and jnodes from fq->prepped
75824+ are submitted to disk by the reiser4_write_fq() routine. */
75825+ int nr_running_queues;
75826+ /* A counter of grabbed unformatted nodes, see a description of the
75827+ * reiser4 space reservation scheme at block_alloc.c */
75828+ reiser4_block_nr flush_reserved;
75829+#if REISER4_DEBUG
75830+ void *committer;
75831+#endif
75832+ struct super_block *super;
75833+};
75834+
75835+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
75836+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
75837+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
75838+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
75839+#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
75840+
75841+#define NODE_LIST(node) (node)->list
75842+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
75843+ON_DEBUG(void
75844+ count_jnode(txn_atom *, jnode *, atom_list old_list,
75845+ atom_list new_list, int check_lists));
75846+
75847+/* A transaction handle: the client obtains and commits this handle which is assigned by
75848+ the system to a txn_atom. */
75849+struct txn_handle {
75850+ /* Spinlock protecting ->atom pointer */
75851+ spinlock_t hlock;
75852+
75853+ /* Flags for controlling commit_txnh() behavior */
75854+ /* from txn_handle_flags_t */
75855+ txn_handle_flags_t flags;
75856+
75857+ /* Whether it is READ_FUSING or WRITE_FUSING. */
75858+ txn_mode mode;
75859+
75860+ /* If assigned, the atom it is part of. */
75861+ txn_atom *atom;
75862+
75863+ /* Transaction list link. Head is in txn_atom. */
75864+ struct list_head txnh_link;
75865+};
75866+
75867+/* The transaction manager: one is contained in the reiser4_super_info_data */
75868+struct txn_mgr {
75869+ /* A spinlock protecting the atom list, id_count, flush_control */
75870+ spinlock_t tmgr_lock;
75871+
75872+ /* List of atoms. */
75873+ struct list_head atoms_list;
75874+
75875+ /* Number of atoms. */
75876+ int atom_count;
75877+
75878+ /* A counter used to assign atom->atom_id values. */
75879+ __u32 id_count;
75880+
75881+ /* a mutex object for commit serialization */
75882+ struct mutex commit_mutex;
75883+
75884+ /* a list of all txnmgrs served by a particular daemon. */
75885+ struct list_head linkage;
75886+
75887+ /* description of daemon for this txnmgr */
75888+ ktxnmgrd_context *daemon;
75889+
75890+ /* parameters. Adjustable through mount options. */
75891+ unsigned int atom_max_size;
75892+ unsigned int atom_max_age;
75893+ unsigned int atom_min_size;
75894+ /* max number of concurrent flushers for one atom, 0 - unlimited. */
75895+ unsigned int atom_max_flushers;
75896+ struct dentry *debugfs_atom_count;
75897+ struct dentry *debugfs_id_count;
75898+};
75899+
75900+/* FUNCTION DECLARATIONS */
75901+
75902+/* These are the externally (within Reiser4) visible transaction functions, therefore they
75903+ are prefixed with "txn_". For comments, see txnmgr.c. */
75904+
75905+extern int init_txnmgr_static(void);
75906+extern void done_txnmgr_static(void);
75907+
75908+extern void reiser4_init_txnmgr(txn_mgr *);
75909+extern void reiser4_done_txnmgr(txn_mgr *);
75910+
75911+extern int reiser4_txn_reserve(int reserved);
75912+
75913+extern void reiser4_txn_begin(reiser4_context * context);
75914+extern int reiser4_txn_end(reiser4_context * context);
75915+
75916+extern void reiser4_txn_restart(reiser4_context * context);
75917+extern void reiser4_txn_restart_current(void);
75918+
75919+extern int txnmgr_force_commit_all(struct super_block *, int);
75920+extern int current_atom_should_commit(void);
75921+
75922+extern jnode *find_first_dirty_jnode(txn_atom *, int);
75923+
75924+extern int commit_some_atoms(txn_mgr *);
75925+extern int force_commit_atom(txn_handle *);
75926+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
75927+
75928+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
75929+
75930+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
75931+
75932+extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
75933+ int alloc_value);
75934+extern void atom_dec_and_unlock(txn_atom * atom);
75935+
75936+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
75937+extern int try_capture_page_to_invalidate(struct page *pg);
75938+
75939+extern void reiser4_uncapture_page(struct page *pg);
75940+extern void reiser4_uncapture_block(jnode *);
75941+extern void reiser4_uncapture_jnode(jnode *);
75942+
75943+extern int reiser4_capture_inode(struct inode *);
75944+extern int reiser4_uncapture_inode(struct inode *);
75945+
75946+extern txn_atom *get_current_atom_locked_nocheck(void);
75947+
75948+#if REISER4_DEBUG
75949+
75950+/**
75951+ * atom_is_protected - make sure that nobody but us can do anything with atom
75952+ * @atom: atom to be checked
75953+ *
75954+ * This is used to assert that atom either entered commit stages or is spin
75955+ * locked.
75956+ */
75957+static inline int atom_is_protected(txn_atom *atom)
75958+{
75959+ if (atom->stage >= ASTAGE_PRE_COMMIT)
75960+ return 1;
75961+ assert_spin_locked(&(atom->alock));
75962+ return 1;
75963+}
75964+
75965+#endif
75966+
75967+/* Get the current atom and spin lock it. The current atom must be present, so this never returns NULL */
75968+static inline txn_atom *get_current_atom_locked(void)
75969+{
75970+ txn_atom *atom;
75971+
75972+ atom = get_current_atom_locked_nocheck();
75973+ assert("zam-761", atom != NULL);
75974+
75975+ return atom;
75976+}
75977+
75978+extern txn_atom *jnode_get_atom(jnode *);
75979+
75980+extern void reiser4_atom_wait_event(txn_atom *);
75981+extern void reiser4_atom_send_event(txn_atom *);
75982+
75983+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
75984+extern int reiser4_capture_super_block(struct super_block *s);
75985+int capture_bulk(jnode **, int count);
75986+
75987+/* See the comment on the function blocknrset.c:blocknr_set_add for the
75988+ calling convention of these three routines. */
75989+extern void blocknr_set_init(struct list_head * bset);
75990+extern void blocknr_set_destroy(struct list_head * bset);
75991+extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
75992+extern int blocknr_set_add_extent(txn_atom * atom,
75993+ struct list_head * bset,
75994+ blocknr_set_entry ** new_bsep,
75995+ const reiser4_block_nr * start,
75996+ const reiser4_block_nr * len);
75997+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
75998+ blocknr_set_entry ** new_bsep,
75999+ const reiser4_block_nr * a,
76000+ const reiser4_block_nr * b);
76001+
76002+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76003+ const reiser4_block_nr *, void *);
76004+
76005+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
76006+ blocknr_set_actor_f actor, void *data,
76007+ int delete);
76008+
76009+/* flush code takes care about how to fuse flush queues */
76010+extern void flush_init_atom(txn_atom * atom);
76011+extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76012+
76013+static inline void spin_lock_atom(txn_atom *atom)
76014+{
76015+ /* check that spinlocks of lower priorities are not held */
76016+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76017+ LOCK_CNT_NIL(spin_locked_atom) &&
76018+ LOCK_CNT_NIL(spin_locked_jnode) &&
76019+ LOCK_CNT_NIL(spin_locked_zlock) &&
76020+ LOCK_CNT_NIL(rw_locked_dk) &&
76021+ LOCK_CNT_NIL(rw_locked_tree)));
76022+
76023+ spin_lock(&(atom->alock));
76024+
76025+ LOCK_CNT_INC(spin_locked_atom);
76026+ LOCK_CNT_INC(spin_locked);
76027+}
76028+
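/*
 * The LOCK_CNT_* assertions above encode the documented lock ordering in
 * per-thread counters. A stripped-down model of the mechanism follows
 * (hypothetical names; the real counters live in the reiser4 per-thread
 * context and compile to nothing when REISER4_DEBUG is off):
 */
struct lock_cnt_model {
	int spin_locked_txnh;
	int spin_locked_atom;
};

static inline void lock_atom_checked(txn_atom *atom,
				     struct lock_cnt_model *cnt)
{
	/* taking an atom lock while holding a txnh lock would invert the
	 * atom-before-txnh ordering, so the debug build refuses it */
	assert("", cnt->spin_locked_txnh == 0);
	spin_lock(&atom->alock);
	cnt->spin_locked_atom++;
}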
76029+static inline void spin_lock_atom_nested(txn_atom *atom)
76030+{
76031+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76032+ LOCK_CNT_NIL(spin_locked_jnode) &&
76033+ LOCK_CNT_NIL(spin_locked_zlock) &&
76034+ LOCK_CNT_NIL(rw_locked_dk) &&
76035+ LOCK_CNT_NIL(rw_locked_tree)));
76036+
76037+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
76038+
76039+ LOCK_CNT_INC(spin_locked_atom);
76040+ LOCK_CNT_INC(spin_locked);
76041+}
76042+
76043+static inline int spin_trylock_atom(txn_atom *atom)
76044+{
76045+ if (spin_trylock(&(atom->alock))) {
76046+ LOCK_CNT_INC(spin_locked_atom);
76047+ LOCK_CNT_INC(spin_locked);
76048+ return 1;
76049+ }
76050+ return 0;
76051+}
76052+
76053+static inline void spin_unlock_atom(txn_atom *atom)
76054+{
76055+ assert_spin_locked(&(atom->alock));
76056+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76057+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76058+
76059+ LOCK_CNT_DEC(spin_locked_atom);
76060+ LOCK_CNT_DEC(spin_locked);
76061+
76062+ spin_unlock(&(atom->alock));
76063+}
76064+
76065+static inline void spin_lock_txnh(txn_handle *txnh)
76066+{
76067+ /* check that spinlocks of lower priorities are not held */
76068+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76069+ LOCK_CNT_NIL(spin_locked_zlock) &&
76070+ LOCK_CNT_NIL(rw_locked_tree)));
76071+
76072+ spin_lock(&(txnh->hlock));
76073+
76074+ LOCK_CNT_INC(spin_locked_txnh);
76075+ LOCK_CNT_INC(spin_locked);
76076+}
76077+
76078+static inline int spin_trylock_txnh(txn_handle *txnh)
76079+{
76080+ if (spin_trylock(&(txnh->hlock))) {
76081+ LOCK_CNT_INC(spin_locked_txnh);
76082+ LOCK_CNT_INC(spin_locked);
76083+ return 1;
76084+ }
76085+ return 0;
76086+}
76087+
76088+static inline void spin_unlock_txnh(txn_handle *txnh)
76089+{
76090+ assert_spin_locked(&(txnh->hlock));
76091+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76092+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76093+
76094+ LOCK_CNT_DEC(spin_locked_txnh);
76095+ LOCK_CNT_DEC(spin_locked);
76096+
76097+ spin_unlock(&(txnh->hlock));
76098+}
76099+
76100+#define spin_ordering_pred_txnmgr(tmgr) \
76101+ ( LOCK_CNT_NIL(spin_locked_atom) && \
76102+ LOCK_CNT_NIL(spin_locked_txnh) && \
76103+ LOCK_CNT_NIL(spin_locked_jnode) && \
76104+ LOCK_CNT_NIL(rw_locked_zlock) && \
76105+ LOCK_CNT_NIL(rw_locked_dk) && \
76106+ LOCK_CNT_NIL(rw_locked_tree) )
76107+
76108+static inline void spin_lock_txnmgr(txn_mgr *mgr)
76109+{
76110+ /* check that spinlocks of lower priorities are not held */
76111+ assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76112+ LOCK_CNT_NIL(spin_locked_txnh) &&
76113+ LOCK_CNT_NIL(spin_locked_jnode) &&
76114+ LOCK_CNT_NIL(spin_locked_zlock) &&
76115+ LOCK_CNT_NIL(rw_locked_dk) &&
76116+ LOCK_CNT_NIL(rw_locked_tree)));
76117+
76118+ spin_lock(&(mgr->tmgr_lock));
76119+
76120+ LOCK_CNT_INC(spin_locked_txnmgr);
76121+ LOCK_CNT_INC(spin_locked);
76122+}
76123+
76124+static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76125+{
76126+ if (spin_trylock(&(mgr->tmgr_lock))) {
76127+ LOCK_CNT_INC(spin_locked_txnmgr);
76128+ LOCK_CNT_INC(spin_locked);
76129+ return 1;
76130+ }
76131+ return 0;
76132+}
76133+
76134+static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76135+{
76136+ assert_spin_locked(&(mgr->tmgr_lock));
76137+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76138+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76139+
76140+ LOCK_CNT_DEC(spin_locked_txnmgr);
76141+ LOCK_CNT_DEC(spin_locked);
76142+
76143+ spin_unlock(&(mgr->tmgr_lock));
76144+}
76145+
76146+typedef enum {
76147+ FQ_IN_USE = 0x1
76148+} flush_queue_state_t;
76149+
76150+typedef struct flush_queue flush_queue_t;
76151+
76152+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76153+ is filled by the jnode_flush() routine, and written to disk under memory
76154+ pressure or at atom commit time. */
76155+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
76156+ field and fq->prepped list can be modified if atom is spin-locked and fq
76157+ object is "in-use" state. For read-only traversal of the fq->prepped list
76158+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
76159+ only have atom spin-locked. */
76160+struct flush_queue {
76161+ /* linkage element is the first in this structure to make debugging
76162+ easier. See field in atom struct for description of list. */
76163+ struct list_head alink;
76164+ /* A spinlock to protect changes of fq state and fq->atom pointer */
76165+ spinlock_t guard;
76166+ /* flush_queue state: [in_use | ready] */
76167+ flush_queue_state_t state;
76168+	/* A list which contains queued nodes; queued nodes are removed from
76169+	 * any atom's list and put on this ->prepped one. */
76170+ struct list_head prepped;
76171+ /* number of submitted i/o requests */
76172+ atomic_t nr_submitted;
76173+ /* number of i/o errors */
76174+ atomic_t nr_errors;
76175+ /* An atom this flush queue is attached to */
76176+ txn_atom *atom;
76177+ /* A wait queue head to wait on i/o completion */
76178+ wait_queue_head_t wait;
76179+#if REISER4_DEBUG
76180+ /* A thread which took this fq in exclusive use, NULL if fq is free,
76181+ * used for debugging. */
76182+ struct task_struct *owner;
76183+#endif
76184+};
76185+
76186+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
76187+extern void reiser4_fq_put_nolock(flush_queue_t *);
76188+extern void reiser4_fq_put(flush_queue_t *);
76189+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
76190+extern void queue_jnode(flush_queue_t *, jnode *);
76191+
76192+extern int reiser4_write_fq(flush_queue_t *, long *, int);
76193+extern int current_atom_finish_all_fq(void);
76194+extern void init_atom_fq_parts(txn_atom *);
76195+
76196+extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76197+
76198+extern void znode_make_dirty(znode * node);
76199+extern void jnode_make_dirty_locked(jnode * node);
76200+
76201+extern int reiser4_sync_atom(txn_atom * atom);
76202+
76203+#if REISER4_DEBUG
76204+extern int atom_fq_parts_are_clean(txn_atom *);
76205+#endif
76206+
76207+extern void add_fq_to_bio(flush_queue_t *, struct bio *);
76208+extern flush_queue_t *get_fq_for_current_atom(void);
76209+
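/* Editorial sketch of how the flush-queue declarations above fit together
 * (illustrative only; the real callers add locking and error handling, and
 * the exact locking preconditions - e.g. the atom spin lock around
 * reiser4_fq_by_atom() and queue_jnode() - are enforced by assertions
 * elsewhere in this patch):
 *
 *	flush_queue_t *fq;
 *	long nr_submitted = 0;
 *
 *	reiser4_fq_by_atom(atom, &fq);           take fq in exclusive use
 *	queue_jnode(fq, node);                   move jnode to fq->prepped
 *	reiser4_write_fq(fq, &nr_submitted, 0);  submit queued nodes
 *	reiser4_fq_put(fq);                      drop the "in-use" state
 */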
76210+void reiser4_invalidate_list(struct list_head * head);
76211+
76212+# endif /* __REISER4_TXNMGR_H__ */
76213+
76214+/* Make Linus happy.
76215+ Local variables:
76216+ c-indentation-style: "K&R"
76217+ mode-name: "LC"
76218+ c-basic-offset: 8
76219+ tab-width: 8
76220+ fill-column: 120
76221+ End:
76222+*/
76223diff -urN linux-2.6.22.orig/fs/reiser4/type_safe_hash.h linux-2.6.22/fs/reiser4/type_safe_hash.h
76224--- linux-2.6.22.orig/fs/reiser4/type_safe_hash.h 1970-01-01 03:00:00.000000000 +0300
76225+++ linux-2.6.22/fs/reiser4/type_safe_hash.h 2007-07-29 00:25:35.044739961 +0400
76226@@ -0,0 +1,320 @@
76227+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76228+ * reiser4/README */
76229+
76230+/* A hash table class that uses hash chains (singly-linked) and is
76231+ parametrized to provide type safety. */
76232+
76233+#ifndef __REISER4_TYPE_SAFE_HASH_H__
76234+#define __REISER4_TYPE_SAFE_HASH_H__
76235+
76236+#include "debug.h"
76237+
76238+#include <asm/errno.h>
76239+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
76240+ based on the object type. You need to declare the item type before
76241+ this definition, define it after this definition. */
76242+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
76243+ \
76244+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
76245+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
76246+ \
76247+struct PREFIX##_hash_table_ \
76248+{ \
76249+ ITEM_TYPE **_table; \
76250+ __u32 _buckets; \
76251+}; \
76252+ \
76253+struct PREFIX##_hash_link_ \
76254+{ \
76255+ ITEM_TYPE *_next; \
76256+}
76257+
76258+/* Step 2: Define the object type of the hash: give it a field of type
76259+ PREFIX_hash_link. */
76260+
76261+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
76262+   the type and field name used in step 2. The arguments are:
76263+
76264+ ITEM_TYPE The item type being hashed
76265+ KEY_TYPE The type of key being hashed
76266+ KEY_NAME The name of the key field within the item
76267+   LINK_NAME   The name of the link field within the item, which you must make of type PREFIX_hash_link
76268+   HASH_FUNC   The name of the hash function (or macro; takes the hash table and a const pointer to the key)
76269+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
76270+
76271+ It implements these functions:
76272+
76273+ prefix_hash_init Initialize the table given its size.
76274+ prefix_hash_insert Insert an item
76275+ prefix_hash_insert_index Insert an item w/ precomputed hash_index
76276+ prefix_hash_find Find an item by key
76277+ prefix_hash_find_index Find an item w/ precomputed hash_index
76278+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
76279+ prefix_hash_remove_index Remove an item w/ precomputed hash_index
76280+
76281+ If you'd like something to be done differently, feel free to ask me
76282+ for modifications. Additional features that could be added but
76283+ have not been:
76284+
76285+ prefix_hash_remove_key Find and remove an item by key
76286+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
76287+
76288+   The hash function receives both the hash table and a pointer to the
76289+   key, so it can read the number of buckets from the table itself
76290+   rather than having to know it some other way.
76291+
76292+ This hash table uses a single-linked hash chain. This means
76293+ insertion is fast but deletion requires searching the chain.
76294+
76295+ There is also the doubly-linked hash chain approach, under which
76296+ deletion requires no search but the code is longer and it takes two
76297+ pointers per item.
76298+
76299+ The circularly-linked approach has the shortest code but requires
76300+ two pointers per bucket, doubling the size of the bucket array (in
76301+ addition to two pointers per item).
76302+*/
76303+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
76304+ \
76305+static __inline__ void \
76306+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
76307+ __u32 hash UNUSED_ARG) \
76308+{ \
76309+ assert("nikita-2780", hash < table->_buckets); \
76310+} \
76311+ \
76312+static __inline__ int \
76313+PREFIX##_hash_init (PREFIX##_hash_table *hash, \
76314+ __u32 buckets) \
76315+{ \
76316+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
76317+ hash->_buckets = buckets; \
76318+ if (hash->_table == NULL) \
76319+ { \
76320+ return RETERR(-ENOMEM); \
76321+ } \
76322+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
76323+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
76324+ return 0; \
76325+} \
76326+ \
76327+static __inline__ void \
76328+PREFIX##_hash_done (PREFIX##_hash_table *hash) \
76329+{ \
76330+ if (REISER4_DEBUG && hash->_table != NULL) { \
76331+ __u32 i; \
76332+ for (i = 0 ; i < hash->_buckets ; ++ i) \
76333+ assert("nikita-2905", hash->_table[i] == NULL); \
76334+ } \
76335+ if (hash->_table != NULL) \
76336+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
76337+ hash->_table = NULL; \
76338+} \
76339+ \
76340+static __inline__ void \
76341+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
76342+{ \
76343+ prefetch(item->LINK_NAME._next); \
76344+} \
76345+ \
76346+static __inline__ void \
76347+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
76348+ __u32 index) \
76349+{ \
76350+ prefetch(hash->_table[index]); \
76351+} \
76352+ \
76353+static __inline__ ITEM_TYPE* \
76354+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
76355+ __u32 hash_index, \
76356+ KEY_TYPE const *find_key) \
76357+{ \
76358+ ITEM_TYPE *item; \
76359+ \
76360+ PREFIX##_check_hash(hash, hash_index); \
76361+ \
76362+ for (item = hash->_table[hash_index]; \
76363+ item != NULL; \
76364+ item = item->LINK_NAME._next) \
76365+ { \
76366+ prefetch(item->LINK_NAME._next); \
76367+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
76368+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \
76369+ { \
76370+ return item; \
76371+ } \
76372+ } \
76373+ \
76374+ return NULL; \
76375+} \
76376+ \
76377+static __inline__ ITEM_TYPE* \
76378+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
76379+ __u32 hash_index, \
76380+ KEY_TYPE const *find_key) \
76381+{ \
76382+ ITEM_TYPE ** item = &hash->_table[hash_index]; \
76383+ \
76384+ PREFIX##_check_hash(hash, hash_index); \
76385+ \
76386+ while (*item != NULL) { \
76387+ prefetch(&(*item)->LINK_NAME._next); \
76388+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
76389+ ITEM_TYPE *found; \
76390+ \
76391+ found = *item; \
76392+ *item = found->LINK_NAME._next; \
76393+ found->LINK_NAME._next = hash->_table[hash_index]; \
76394+ hash->_table[hash_index] = found; \
76395+ return found; \
76396+ } \
76397+ item = &(*item)->LINK_NAME._next; \
76398+ } \
76399+ return NULL; \
76400+} \
76401+ \
76402+static __inline__ int \
76403+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
76404+ __u32 hash_index, \
76405+ ITEM_TYPE *del_item) \
76406+{ \
76407+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
76408+ \
76409+ PREFIX##_check_hash(hash, hash_index); \
76410+ \
76411+ while (*hash_item_p != NULL) { \
76412+ prefetch(&(*hash_item_p)->LINK_NAME._next); \
76413+ if (*hash_item_p == del_item) { \
76414+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
76415+ return 1; \
76416+ } \
76417+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
76418+ } \
76419+ return 0; \
76420+} \
76421+ \
76422+static __inline__ void \
76423+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
76424+ __u32 hash_index, \
76425+ ITEM_TYPE *ins_item) \
76426+{ \
76427+ PREFIX##_check_hash(hash, hash_index); \
76428+ \
76429+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76430+ hash->_table[hash_index] = ins_item; \
76431+} \
76432+ \
76433+static __inline__ void \
76434+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
76435+ __u32 hash_index, \
76436+ ITEM_TYPE *ins_item) \
76437+{ \
76438+ PREFIX##_check_hash(hash, hash_index); \
76439+ \
76440+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
76441+ smp_wmb(); \
76442+ hash->_table[hash_index] = ins_item; \
76443+} \
76444+ \
76445+static __inline__ ITEM_TYPE* \
76446+PREFIX##_hash_find (PREFIX##_hash_table *hash, \
76447+ KEY_TYPE const *find_key) \
76448+{ \
76449+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
76450+} \
76451+ \
76452+static __inline__ ITEM_TYPE* \
76453+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
76454+ KEY_TYPE const *find_key) \
76455+{ \
76456+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
76457+} \
76458+ \
76459+static __inline__ int \
76460+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
76461+ ITEM_TYPE *del_item) \
76462+{ \
76463+ return PREFIX##_hash_remove_index (hash, \
76464+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
76465+} \
76466+ \
76467+static __inline__ int \
76468+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
76469+ ITEM_TYPE *del_item) \
76470+{ \
76471+ return PREFIX##_hash_remove (hash, del_item); \
76472+} \
76473+ \
76474+static __inline__ void \
76475+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
76476+ ITEM_TYPE *ins_item) \
76477+{ \
76478+ return PREFIX##_hash_insert_index (hash, \
76479+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
76480+} \
76481+ \
76482+static __inline__ void \
76483+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
76484+ ITEM_TYPE *ins_item) \
76485+{ \
76486+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
76487+ ins_item); \
76488+} \
76489+ \
76490+static __inline__ ITEM_TYPE * \
76491+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
76492+{ \
76493+ ITEM_TYPE *first; \
76494+ \
76495+ for (first = NULL; ind < hash->_buckets; ++ ind) { \
76496+ first = hash->_table[ind]; \
76497+ if (first != NULL) \
76498+ break; \
76499+ } \
76500+ return first; \
76501+} \
76502+ \
76503+static __inline__ ITEM_TYPE * \
76504+PREFIX##_hash_next (PREFIX##_hash_table *hash, \
76505+ ITEM_TYPE *item) \
76506+{ \
76507+ ITEM_TYPE *next; \
76508+ \
76509+ if (item == NULL) \
76510+ return NULL; \
76511+ next = item->LINK_NAME._next; \
76512+ if (next == NULL) \
76513+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
76514+ return next; \
76515+} \
76516+ \
76517+typedef struct {} PREFIX##_hash_dummy
76518+
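/* Editorial example of the three steps above (illustrative only: the
 * "foo" item, its key and the hash/equality helpers below are invented
 * for this sketch and do not appear in the patch):
 *
 *	typedef struct foo foo_t;
 *	TYPE_SAFE_HASH_DECLARE(foo, foo_t);              step 1
 *
 *	struct foo {
 *		__u32 id;                                the key
 *		foo_hash_link link;                      step 2
 *	};
 *
 *	#define foo_hash(table, key) (*(key) % (table)->_buckets)
 *	#define foo_eq(k1, k2) (*(k1) == *(k2))
 *
 *	TYPE_SAFE_HASH_DEFINE(foo, foo_t, __u32, id, link,
 *			      foo_hash, foo_eq);         step 3
 *
 * after which foo_hash_init(), foo_hash_insert(), foo_hash_find() etc.
 * are available.
 */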
76519+#define for_all_ht_buckets(table, head) \
76520+for ((head) = &(table) -> _table[ 0 ] ; \
76521+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
76522+
76523+#define for_all_in_bucket(bucket, item, next, field) \
76524+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
76525+ (item) != NULL ; \
76526+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
76527+
76528+#define for_all_in_htable(table, prefix, item, next) \
76529+for ((item) = prefix ## _hash_first ((table), 0), \
76530+ (next) = prefix ## _hash_next ((table), (item)) ; \
76531+ (item) != NULL ; \
76532+ (item) = (next), \
76533+ (next) = prefix ## _hash_next ((table), (item)))
76534+
76535+/* __REISER4_TYPE_SAFE_HASH_H__ */
76536+#endif
76537+
76538+/* Make Linus happy.
76539+ Local variables:
76540+ c-indentation-style: "K&R"
76541+ mode-name: "LC"
76542+ c-basic-offset: 8
76543+ tab-width: 8
76544+ fill-column: 120
76545+ End:
76546+*/
76547diff -urN linux-2.6.22.orig/fs/reiser4/vfs_ops.c linux-2.6.22/fs/reiser4/vfs_ops.c
76548--- linux-2.6.22.orig/fs/reiser4/vfs_ops.c 1970-01-01 03:00:00.000000000 +0300
76549+++ linux-2.6.22/fs/reiser4/vfs_ops.c 2007-07-29 00:25:35.044739961 +0400
76550@@ -0,0 +1,259 @@
76551+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76552+ * reiser4/README */
76553+
76554+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
76555+ here. */
76556+
76557+#include "forward.h"
76558+#include "debug.h"
76559+#include "dformat.h"
76560+#include "coord.h"
76561+#include "plugin/item/item.h"
76562+#include "plugin/file/file.h"
76563+#include "plugin/security/perm.h"
76564+#include "plugin/disk_format/disk_format.h"
76565+#include "plugin/plugin.h"
76566+#include "plugin/plugin_set.h"
76567+#include "plugin/object.h"
76568+#include "txnmgr.h"
76569+#include "jnode.h"
76570+#include "znode.h"
76571+#include "block_alloc.h"
76572+#include "tree.h"
76573+#include "vfs_ops.h"
76574+#include "inode.h"
76575+#include "page_cache.h"
76576+#include "ktxnmgrd.h"
76577+#include "super.h"
76578+#include "reiser4.h"
76579+#include "entd.h"
76580+#include "status_flags.h"
76581+#include "flush.h"
76582+#include "dscale.h"
76583+
76584+#include <linux/profile.h>
76585+#include <linux/types.h>
76586+#include <linux/mount.h>
76587+#include <linux/vfs.h>
76588+#include <linux/mm.h>
76589+#include <linux/buffer_head.h>
76590+#include <linux/dcache.h>
76591+#include <linux/list.h>
76592+#include <linux/pagemap.h>
76593+#include <linux/slab.h>
76594+#include <linux/seq_file.h>
76595+#include <linux/init.h>
76596+#include <linux/module.h>
76597+#include <linux/writeback.h>
76598+#include <linux/blkdev.h>
76599+#include <linux/quotaops.h>
76600+#include <linux/security.h>
76601+#include <linux/reboot.h>
76602+#include <linux/rcupdate.h>
76603+
76604+/* update inode stat-data by calling plugin */
76605+int reiser4_update_sd(struct inode *object)
76606+{
76607+ file_plugin *fplug;
76608+
76609+ assert("nikita-2338", object != NULL);
76610+ /* check for read-only file system. */
76611+ if (IS_RDONLY(object))
76612+ return 0;
76613+
76614+ fplug = inode_file_plugin(object);
76615+ assert("nikita-2339", fplug != NULL);
76616+ return fplug->write_sd_by_inode(object);
76617+}
76618+
76619+/* helper function: increase inode nlink count and call plugin method to save
76620+ updated stat-data.
76621+
76622+ Used by link/create and during creation of dot and dotdot in mkdir
76623+*/
76624+int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
76625+ struct inode *parent /* parent where new entry will be */
76626+ ,
76627+ int write_sd_p /* true if stat-data has to be
76628+ * updated */ )
76629+{
76630+ file_plugin *fplug;
76631+ int result;
76632+
76633+ assert("nikita-1351", object != NULL);
76634+
76635+ fplug = inode_file_plugin(object);
76636+ assert("nikita-1445", fplug != NULL);
76637+
76638+ /* ask plugin whether it can add yet another link to this
76639+ object */
76640+ if (!fplug->can_add_link(object))
76641+ return RETERR(-EMLINK);
76642+
76643+ assert("nikita-2211", fplug->add_link != NULL);
76644+ /* call plugin to do actual addition of link */
76645+ result = fplug->add_link(object, parent);
76646+
76647+ /* optionally update stat data */
76648+ if (result == 0 && write_sd_p)
76649+ result = fplug->write_sd_by_inode(object);
76650+ return result;
76651+}
76652+
76653+/* helper function: decrease inode nlink count and call plugin method to save
76654+ updated stat-data.
76655+
76656+ Used by unlink/create
76657+*/
76658+int reiser4_del_nlink(struct inode *object /* object from which link is
76659+ * removed */ ,
76660+ struct inode *parent /* parent where entry was */ ,
76661+		      int write_sd_p	/* true if stat-data has to be
76662+ * updated */ )
76663+{
76664+ file_plugin *fplug;
76665+ int result;
76666+
76667+ assert("nikita-1349", object != NULL);
76668+
76669+ fplug = inode_file_plugin(object);
76670+ assert("nikita-1350", fplug != NULL);
76671+ assert("nikita-1446", object->i_nlink > 0);
76672+ assert("nikita-2210", fplug->rem_link != NULL);
76673+
76674+ /* call plugin to do actual deletion of link */
76675+ result = fplug->rem_link(object, parent);
76676+
76677+ /* optionally update stat data */
76678+ if (result == 0 && write_sd_p)
76679+ result = fplug->write_sd_by_inode(object);
76680+ return result;
76681+}
76682+
76683+/* Release reiser4 dentry. This is d_op->d_release() method. */
76684+static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
76685+{
76686+ reiser4_free_dentry_fsdata(dentry);
76687+}
76688+
76689+/*
76690+ * Called by reiser4_sync_inodes(), during speculative write-back (through
76691+ * pdflush, or balance_dirty_pages()).
76692+ */
76693+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
76694+{
76695+ long written = 0;
76696+ int repeats = 0;
76697+ int result;
76698+ struct address_space *mapping;
76699+
76700+ /*
76701+ * Performs early flushing, trying to free some memory. If there is
76702+ * nothing to flush, commits some atoms.
76703+ */
76704+
76705+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
76706+ sys_fsync(). */
76707+ if (wbc->sync_mode != WB_SYNC_NONE) {
76708+ txnmgr_force_commit_all(sb, 0);
76709+ return;
76710+ }
76711+
76712+ BUG_ON(reiser4_get_super_fake(sb) == NULL);
76713+ mapping = reiser4_get_super_fake(sb)->i_mapping;
76714+ do {
76715+ long nr_submitted = 0;
76716+ jnode *node = NULL;
76717+
76718+ /* do not put more requests to overload write queue */
76719+ if (wbc->nonblocking &&
76720+ bdi_write_congested(mapping->backing_dev_info)) {
76721+ blk_run_address_space(mapping);
76722+ wbc->encountered_congestion = 1;
76723+ break;
76724+ }
76725+ repeats++;
76726+ BUG_ON(wbc->nr_to_write <= 0);
76727+
76728+ if (get_current_context()->entd) {
76729+ entd_context *ent = get_entd_context(sb);
76730+
76731+ if (ent->cur_request->node)
76732+ /*
76733+ * this is ent thread and it managed to capture
76734+ * requested page itself - start flush from
76735+ * that page
76736+ */
76737+ node = jref(ent->cur_request->node);
76738+ }
76739+
76740+ result = flush_some_atom(node, &nr_submitted, wbc,
76741+ JNODE_FLUSH_WRITE_BLOCKS);
76742+ if (result != 0)
76743+ warning("nikita-31001", "Flush failed: %i", result);
76744+ if (node)
76745+ jput(node);
76746+ if (!nr_submitted)
76747+ break;
76748+
76749+ wbc->nr_to_write -= nr_submitted;
76750+ written += nr_submitted;
76751+ } while (wbc->nr_to_write > 0);
76752+}
76753+
76754+void reiser4_throttle_write(struct inode *inode)
76755+{
76756+	reiser4_txn_restart_current();
76757+ balance_dirty_pages_ratelimited(inode->i_mapping);
76758+}
76759+
76760+const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
76761+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
76762+ * beginning of device */
76763+
76764+/*
76765+ * Reiser4 initialization/shutdown.
76766+ *
76767+ * Code below performs global reiser4 initialization that is done either as
76768+ * part of kernel initialization (when reiser4 is statically built-in), or
76769+ * during reiser4 module load (when compiled as module).
76770+ */
76771+
76772+void reiser4_handle_error(void)
76773+{
76774+ struct super_block *sb = reiser4_get_current_sb();
76775+
76776+ if (!sb)
76777+ return;
76778+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
76779+			     "Filesystem error occurred");
76780+ switch (get_super_private(sb)->onerror) {
76781+ case 0:
76782+		reiser4_panic("foobar-42", "Filesystem error occurred\n");
76783+ case 1:
76784+ default:
76785+ if (sb->s_flags & MS_RDONLY)
76786+ return;
76787+ sb->s_flags |= MS_RDONLY;
76788+ break;
76789+ }
76790+}
76791+
76792+struct dentry_operations reiser4_dentry_operations = {
76793+ .d_revalidate = NULL,
76794+ .d_hash = NULL,
76795+ .d_compare = NULL,
76796+ .d_delete = NULL,
76797+ .d_release = reiser4_d_release,
76798+ .d_iput = NULL,
76799+};
76800+
76801+/* Make Linus happy.
76802+ Local variables:
76803+ c-indentation-style: "K&R"
76804+ mode-name: "LC"
76805+ c-basic-offset: 8
76806+ tab-width: 8
76807+ fill-column: 120
76808+ End:
76809+*/
76810diff -urN linux-2.6.22.orig/fs/reiser4/vfs_ops.h linux-2.6.22/fs/reiser4/vfs_ops.h
76811--- linux-2.6.22.orig/fs/reiser4/vfs_ops.h 1970-01-01 03:00:00.000000000 +0300
76812+++ linux-2.6.22/fs/reiser4/vfs_ops.h 2007-07-29 00:25:35.044739961 +0400
76813@@ -0,0 +1,53 @@
76814+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76815+ * reiser4/README */
76816+
76817+/* vfs_ops.c's exported symbols */
76818+
76819+#if !defined( __FS_REISER4_VFS_OPS_H__ )
76820+#define __FS_REISER4_VFS_OPS_H__
76821+
76822+#include "forward.h"
76823+#include "coord.h"
76824+#include "seal.h"
76825+#include "plugin/file/file.h"
76826+#include "super.h"
76827+#include "readahead.h"
76828+
76829+#include <linux/types.h> /* for loff_t */
76830+#include <linux/fs.h> /* for struct address_space */
76831+#include <linux/dcache.h> /* for struct dentry */
76832+#include <linux/mm.h>
76833+#include <linux/backing-dev.h>
76834+
76835+/* address space operations */
76836+int reiser4_writepage(struct page *, struct writeback_control *);
76837+int reiser4_set_page_dirty(struct page *);
76838+void reiser4_invalidatepage(struct page *, unsigned long offset);
76839+int reiser4_releasepage(struct page *, gfp_t);
76840+
76841+extern int reiser4_update_sd(struct inode *);
76842+extern int reiser4_add_nlink(struct inode *, struct inode *, int);
76843+extern int reiser4_del_nlink(struct inode *, struct inode *, int);
76844+
76845+extern int reiser4_start_up_io(struct page *page);
76846+extern void reiser4_throttle_write(struct inode *);
76847+extern int jnode_is_releasable(jnode *);
76848+
76849+#define CAPTURE_APAGE_BURST (1024l)
76850+void reiser4_writeout(struct super_block *, struct writeback_control *);
76851+
76852+extern void reiser4_handle_error(void);
76853+
76854+/* __FS_REISER4_VFS_OPS_H__ */
76855+#endif
76856+
76857+/* Make Linus happy.
76858+ Local variables:
76859+ c-indentation-style: "K&R"
76860+ mode-name: "LC"
76861+ c-basic-offset: 8
76862+ tab-width: 8
76863+ fill-column: 120
76864+ scroll-step: 1
76865+ End:
76866+*/
76867diff -urN linux-2.6.22.orig/fs/reiser4/wander.c linux-2.6.22/fs/reiser4/wander.c
76868--- linux-2.6.22.orig/fs/reiser4/wander.c 1970-01-01 03:00:00.000000000 +0300
76869+++ linux-2.6.22/fs/reiser4/wander.c 2007-07-29 00:25:35.048740996 +0400
76870@@ -0,0 +1,1797 @@
76871+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76872+ * reiser4/README */
76873+
76874+/* Reiser4 Wandering Log */
76875+
76876+/* You should read http://www.namesys.com/txn-doc.html
76877+
76878+ That describes how filesystem operations are performed as atomic
76879+ transactions, and how we try to arrange it so that we can write most of the
76880+ data only once while performing the operation atomically.
76881+
76882+ For the purposes of this code, it is enough for it to understand that it
76883+ has been told a given block should be written either once, or twice (if
76884+ twice then once to the wandered location and once to the real location).
76885+
76886+ This code guarantees that those blocks that are defined to be part of an
76887+ atom either all take effect or none of them take effect.
76888+
76889+ Relocate set nodes are submitted to write by the jnode_flush() routine, and
76890+ the overwrite set is submitted by reiser4_write_log(). This is because with
76891+ the overwrite set we seek to optimize writes, and with the relocate set we
76892+   seek to cause disk order to correlate with the parent-first pre-order.
76893+
76894+ reiser4_write_log() allocates and writes wandered blocks and maintains
76895+ additional on-disk structures of the atom as wander records (each wander
76896+   record occupies one block) for storing the "wandered map" (a table which
76897+   maps wandered block numbers to real ones) and other
76898+ information which might be needed at transaction recovery time.
76899+
76900+ The wander records are unidirectionally linked into a circle: each wander
76901+ record contains a block number of the next wander record, the last wander
76902+ record points to the first one.
76903+
76904+ One wander record (named "tx head" in this file) has a format which is
76905+ different from the other wander records. The "tx head" has a reference to the
76906+ "tx head" block of the previously committed atom. Also, "tx head" contains
76907+ fs information (the free blocks counter, and the oid allocator state) which
76908+   is logged in a special way.
76909+
76910+ There are two journal control blocks, named journal header and journal
76911+ footer which have fixed on-disk locations. The journal header has a
76912+ reference to the "tx head" block of the last committed atom. The journal
76913+ footer points to the "tx head" of the last flushed atom. The atom is
76914+ "played" when all blocks from its overwrite set are written to disk the
76915+ second time (i.e. written to their real locations).
76916+
76917+   NOTE: People who know reiserfs internals and its journal structure might be
76918+   confused by the terms journal footer and journal header. The table below
76919+   matches terms of similar semantics in reiserfs (reiser3) and reiser4:
76920+
76921+ REISER3 TERM | REISER4 TERM | DESCRIPTION
76922+ --------------------+-----------------------+----------------------------
76923+ commit record | journal header | atomic write of this record
76924+ | | ends transaction commit
76925+ --------------------+-----------------------+----------------------------
76926+ journal header | journal footer | atomic write of this record
76927+ | | ends post-commit writes.
76928+                     |                       | After this record is
76929+                     |                       | successfully written, journal
76930+                     |                       | blocks (in reiser3) or
76931+                     |                       | wandered blocks/records are
76932+                     |                       | free for re-use.
76933+ --------------------+-----------------------+----------------------------
76934+
76935+ The atom commit process is the following:
76936+
76937+ 1. The overwrite set is taken from atom's clean list, and its size is
76938+ counted.
76939+
76940+ 2. The number of necessary wander records (including tx head) is calculated,
76941+ and the wander record blocks are allocated.
76942+
76943+   3. Allocate wandered blocks and populate wander records with the wandered map.
76944+
76945+   4. Submit write requests for wander records and wandered blocks.
76946+
76947+   5. Wait until submitted write requests complete.
76948+
76949+   6. Update the journal header: change the pointer to the block number of the
76950+      just written tx head, submit an i/o for the modified journal header block
76951+      and wait for i/o completion.
76952+
76953+ NOTE: The special logging for bitmap blocks and some reiser4 super block
76954+ fields makes processes of atom commit, flush and recovering a bit more
76955+ complex (see comments in the source code for details).
76956+
76957+ The atom playing process is the following:
76958+
76959+ 1. Write atom's overwrite set in-place.
76960+
76961+ 2. Wait on i/o.
76962+
76963+   3. Update the journal footer: change the pointer to the block number of the
76964+      tx head block of the atom we are currently flushing, submit an i/o, wait on i/o
76965+ completion.
76966+
76967+ 4. Free disk space which was used for wandered blocks and wander records.
76968+
76969+   After wandered blocks and wander records are freed, the journal footer
76970+   points to an on-disk structure which might soon be overwritten.
76971+   Neither the log writer nor the journal recovery procedure uses that pointer
76972+   for accessing the data. When the journal recovery procedure looks for the oldest
76973+   transaction, it compares the journal footer pointer value with the "prev_tx"
76974+   pointer value in a tx head; if the values are equal, the oldest unflushed
76975+   transaction has been found.
76976+
76977+   NOTE on disk space leakage: the information about which blocks, and how many,
76978+   are allocated for wandered blocks and wander records is not written to
76979+   the disk because of the special logging for bitmaps and some super block
76980+   counters. After a system crash reiser4 does not remember those
76981+   allocations, so there is no disk space leakage of this kind.
76982+*/
76983+
76984+/* Special logging of reiser4 super block fields. */
76985+
76986+/* There are some reiser4 super block fields (free block count and OID allocator
76987+ state (number of files and next free OID) which are logged separately from
76988+ super block to avoid unnecessary atom fusion.
76989+
76990+   So the reiser4 super block may not be captured by a transaction that
76991+   allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
76992+ the reiser4 on-disk super block is not touched when such a transaction is
76993+ committed and flushed. Those "counters logged specially" are logged in "tx
76994+ head" blocks and in the journal footer block.
76995+
76996+ A step-by-step description of special logging:
76997+
76998+ 0. The per-atom information about deleted or created files and allocated or
76999+ freed blocks is collected during the transaction. The atom's
77000+ ->nr_objects_created and ->nr_objects_deleted are for object
77001+ deletion/creation tracking, the numbers of allocated and freed blocks are
77002+ calculated using atom's delete set and atom's capture list -- all new and
77003+ relocated nodes should be on atom's clean list and should have JNODE_RELOC
77004+ bit set.
77005+
77006+ 1. The "logged specially" reiser4 super block fields have their "committed"
77007+ versions in the reiser4 in-memory super block. They get modified only at
77008+ atom commit time. The atom's commit thread has an exclusive access to those
77009+ "committed" fields because the log writer implementation supports only one
77010+   atom commit at a time (there is a per-fs "commit" mutex). At
77011+ that time "committed" counters are modified using per-atom information
77012+ collected during the transaction. These counters are stored on disk as a
77013+ part of tx head block when atom is committed.
77014+
77015+ 2. When the atom is flushed the value of the free block counter and the OID
77016+ allocator state get written to the journal footer block. A special journal
77017+ procedure (journal_recover_sb_data()) takes those values from the journal
77018+ footer and updates the reiser4 in-memory super block.
77019+
77020+ NOTE: That means free block count and OID allocator state are logged
77021+ separately from the reiser4 super block regardless of the fact that the
77022+ reiser4 super block has fields to store both the free block counter and the
77023+ OID allocator.
77024+
77025+   Writing the whole super block at commit time requires knowing the true values
77026+   of all its fields without the changes made by not yet committed transactions.
77027+   This would be possible by keeping a "committed" version of the super block,
77028+   the way the reiser4 bitmap blocks have "committed" and "working" versions.
77029+   However, another scheme was implemented which stores the specially logged
77030+   values in the unused free space inside the transaction head block. In my
77031+   opinion it has the advantage of not writing the whole super block when only
77032+   part of it was modified. */
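/* Editorial sketch of the commit path described above (illustrative
   pseudocode; these helpers are defined later in this file and the real
   code adds locking and error handling):

	struct commit_handle ch;

	init_commit_handle(&ch, atom);
	get_overwrite_set(&ch);               step 1
	get_tx_size(&ch);                     step 2
	alloc_wandered_blocks(&ch, fq);       steps 3-4, overwrite set
	alloc_tx(&ch, fq);                    wander records
	... wait for i/o completion ...       step 5
	update_journal_header(&ch, barrier);  step 6
	done_commit_handle(&ch);
*/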
77033+
77034+#include "debug.h"
77035+#include "dformat.h"
77036+#include "txnmgr.h"
77037+#include "jnode.h"
77038+#include "znode.h"
77039+#include "block_alloc.h"
77040+#include "page_cache.h"
77041+#include "wander.h"
77042+#include "reiser4.h"
77043+#include "super.h"
77044+#include "vfs_ops.h"
77045+#include "writeout.h"
77046+#include "inode.h"
77047+#include "entd.h"
77048+
77049+#include <linux/types.h>
77050+#include <linux/fs.h> /* for struct super_block */
77051+#include <linux/mm.h> /* for struct page */
77052+#include <linux/pagemap.h>
77053+#include <linux/bio.h> /* for struct bio */
77054+#include <linux/blkdev.h>
77055+
77056+static int write_jnodes_to_disk_extent(
77057+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77058+
77059+/* The commit_handle is a container for objects needed at atom commit time */
77060+struct commit_handle {
77061+ /* A pointer to atom's list of OVRWR nodes */
77062+ struct list_head *overwrite_set;
77063+ /* atom's overwrite set size */
77064+ int overwrite_set_size;
77065+ /* jnodes for wander record blocks */
77066+ struct list_head tx_list;
77067+ /* number of wander records */
77068+ __u32 tx_size;
77069+ /* 'committed' sb counters are saved here until atom is completely
77070+ flushed */
77071+ __u64 free_blocks;
77072+ __u64 nr_files;
77073+ __u64 next_oid;
77074+ /* A pointer to the atom which is being committed */
77075+ txn_atom *atom;
77076+ /* A pointer to current super block */
77077+ struct super_block *super;
77078+ /* The counter of modified bitmaps */
77079+ reiser4_block_nr nr_bitmap;
77080+};
77081+
77082+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77083+{
77084+ memset(ch, 0, sizeof(struct commit_handle));
77085+ INIT_LIST_HEAD(&ch->tx_list);
77086+
77087+ ch->atom = atom;
77088+ ch->super = reiser4_get_current_sb();
77089+}
77090+
77091+static void done_commit_handle(struct commit_handle *ch)
77092+{
77093+ assert("zam-690", list_empty(&ch->tx_list));
77094+}
77095+
77096+static inline int reiser4_use_write_barrier(struct super_block * s)
77097+{
77098+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77099+}
77100+
77101+static void disable_write_barrier(struct super_block * s)
77102+{
77103+ notice("zam-1055", "%s does not support write barriers,"
77104+ " using synchronous write instead.", s->s_id);
77105+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77106+}
77107+
77108+/* fill journal header block data */
77109+static void format_journal_header(struct commit_handle *ch)
77110+{
77111+ struct reiser4_super_info_data *sbinfo;
77112+ struct journal_header *header;
77113+ jnode *txhead;
77114+
77115+ sbinfo = get_super_private(ch->super);
77116+ assert("zam-479", sbinfo != NULL);
77117+ assert("zam-480", sbinfo->journal_header != NULL);
77118+
77119+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77120+
77121+ jload(sbinfo->journal_header);
77122+
77123+ header = (struct journal_header *)jdata(sbinfo->journal_header);
77124+ assert("zam-484", header != NULL);
77125+
77126+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77127+ &header->last_committed_tx);
77128+
77129+ jrelse(sbinfo->journal_header);
77130+}
77131+
77132+/* fill journal footer block data */
77133+static void format_journal_footer(struct commit_handle *ch)
77134+{
77135+ struct reiser4_super_info_data *sbinfo;
77136+ struct journal_footer *footer;
77137+ jnode *tx_head;
77138+
77139+ sbinfo = get_super_private(ch->super);
77140+
77141+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77142+
77143+ assert("zam-493", sbinfo != NULL);
77144+ assert("zam-494", sbinfo->journal_header != NULL);
77145+
77146+ check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77147+
77148+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77149+ assert("zam-495", footer != NULL);
77150+
77151+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77152+ &footer->last_flushed_tx);
77153+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77154+
77155+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77156+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77157+
77158+ jrelse(sbinfo->journal_footer);
77159+}
77160+
77161+/* wander record capacity depends on current block size */
77162+static int wander_record_capacity(const struct super_block *super)
77163+{
77164+ return (super->s_blocksize -
77165+ sizeof(struct wander_record_header)) /
77166+ sizeof(struct wander_entry);
77167+}
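/* Editorial example: a wander_entry holds two 64-bit block numbers (16
 * bytes, see store_entry() below). Assuming, for illustration only, a
 * 16-byte wander_record_header and a 4096-byte block, the capacity is
 * (4096 - 16) / 16 = 255 entries per wander record; the real struct
 * sizes come from wander.h, not from this sketch. */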
77168+
77169+/* Fill the first wander record (tx head) in accordance with the supplied data */
77170+static void format_tx_head(struct commit_handle *ch)
77171+{
77172+ jnode *tx_head;
77173+ jnode *next;
77174+ struct tx_header *header;
77175+
77176+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77177+ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77178+
77179+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77180+ if (&ch->tx_list == &next->capture_link)
77181+ next = tx_head;
77182+
77183+ header = (struct tx_header *)jdata(tx_head);
77184+
77185+ assert("zam-460", header != NULL);
77186+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77187+
77188+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77189+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77190+
77191+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77192+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77193+ &header->prev_tx);
77194+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77195+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
77196+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
77197+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
77198+}
77199+
77200+/* prepare ordinary wander record block (fill all service fields) */
77201+static void
77202+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
77203+{
77204+ struct wander_record_header *LRH;
77205+ jnode *next;
77206+
77207+ assert("zam-464", node != NULL);
77208+
77209+ LRH = (struct wander_record_header *)jdata(node);
77210+ next = list_entry(node->capture_link.next, jnode, capture_link);
77211+
77212+ if (&ch->tx_list == &next->capture_link)
77213+ next = list_entry(ch->tx_list.next, jnode, capture_link);
77214+
77215+ assert("zam-465", LRH != NULL);
77216+ assert("zam-463",
77217+ ch->super->s_blocksize > sizeof(struct wander_record_header));
77218+
77219+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
77220+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
77221+
77222+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
77223+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
77224+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
77225+}
77226+
77227+/* add one wandered map entry to formatted wander record */
77228+static void
77229+store_entry(jnode * node, int index, const reiser4_block_nr * a,
77230+ const reiser4_block_nr * b)
77231+{
77232+ char *data;
77233+ struct wander_entry *pairs;
77234+
77235+ data = jdata(node);
77236+ assert("zam-451", data != NULL);
77237+
77238+ pairs =
77239+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
77240+
77241+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
77242+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
77243+}
77244+
77245+/* currently, wander records contain only the wandered map, which depends on
77246+   the overwrite set size */
77247+static void get_tx_size(struct commit_handle *ch)
77248+{
77249+ assert("zam-440", ch->overwrite_set_size != 0);
77250+ assert("zam-695", ch->tx_size == 0);
77251+
77252+ /* count all ordinary wander records
77253+ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
77254+ for tx head block */
77255+ ch->tx_size =
77256+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
77257+ 2;
77258+}
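/* Editorial example: with a (hypothetical) wander record capacity of 255
 * entries, an overwrite set of 300 blocks gives (300 - 1) / 255 + 2 = 3
 * wander records: two ordinary records holding the 300 map entries
 * (255 + 45) plus the tx head. */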
77259+
77260+/* A special structure used in store_wmap_actor() to save its state
77261+   between calls */
77262+struct store_wmap_params {
77263+ jnode *cur; /* jnode of current wander record to fill */
77264+ int idx; /* free element index in wander record */
77265+ int capacity; /* capacity */
77266+
77267+#if REISER4_DEBUG
77268+ struct list_head *tx_list;
77269+#endif
77270+};
77271+
77272+/* an actor for use in the blocknr_set_iterator routine; it populates the list
77273+   of pre-formatted wander records with wandered map info */
77274+static int
77275+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
77276+ const reiser4_block_nr * b, void *data)
77277+{
77278+ struct store_wmap_params *params = data;
77279+
77280+ if (params->idx >= params->capacity) {
77281+ /* a new wander record should be taken from the tx_list */
77282+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
77283+ assert("zam-454",
77284+ params->tx_list != &params->cur->capture_link);
77285+
77286+ params->idx = 0;
77287+ }
77288+
77289+ store_entry(params->cur, params->idx, a, b);
77290+ params->idx++;
77291+
77292+ return 0;
77293+}
77294+
77295+/* This function is called after the relocate set has been written to disk, the
77296+   overwrite set has been written to wandered locations and all wander records
77297+   have been written as well. The updated journal header block contains a pointer
77298+   (block number) to the first wander record of the just written transaction */
77299+static int update_journal_header(struct commit_handle *ch, int use_barrier)
77300+{
77301+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77302+ jnode *jh = sbinfo->journal_header;
77303+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
77304+ int ret;
77305+
77306+ format_journal_header(ch);
77307+
77308+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
77309+ use_barrier ? WRITEOUT_BARRIER : 0);
77310+ if (ret)
77311+ return ret;
77312+
77313+ // blk_run_address_space(sbinfo->fake->i_mapping);
77314+ /*blk_run_queues(); */
77315+
77316+ ret = jwait_io(jh, WRITE);
77317+
77318+ if (ret)
77319+ return ret;
77320+
77321+ sbinfo->last_committed_tx = *jnode_get_block(head);
77322+
77323+ return 0;
77324+}
77325+
77326+/* This function is called after write-back is finished. We update the journal
77327+   footer block and free the blocks which were occupied by wandered blocks and
77328+   the transaction's wander records */
77329+static int update_journal_footer(struct commit_handle *ch, int use_barrier)
77330+{
77331+ reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77332+
77333+ jnode *jf = sbinfo->journal_footer;
77334+
77335+ int ret;
77336+
77337+ format_journal_footer(ch);
77338+
77339+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
77340+ use_barrier ? WRITEOUT_BARRIER : 0);
77341+ if (ret)
77342+ return ret;
77343+
77344+ // blk_run_address_space(sbinfo->fake->i_mapping);
77345+ /*blk_run_queue(); */
77346+
77347+ ret = jwait_io(jf, WRITE);
77348+ if (ret)
77349+ return ret;
77350+
77351+ return 0;
77352+}
77353+
77354+/* free the block numbers of wander records of a transaction already written in place */
77355+static void dealloc_tx_list(struct commit_handle *ch)
77356+{
77357+ while (!list_empty(&ch->tx_list)) {
77358+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
77359+ list_del(&cur->capture_link);
77360+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
77361+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
77362+ BA_FORMATTED);
77363+
77364+ unpin_jnode_data(cur);
77365+		reiser4_drop_io_head(cur);
77366+ }
77367+}
77368+
77369+/* An actor for use in the blocknr_set_iterator() routine; it frees the wandered
77370+   blocks of atom's overwrite set. */
77371+static int
77372+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
77373+ const reiser4_block_nr * a UNUSED_ARG,
77374+ const reiser4_block_nr * b, void *data UNUSED_ARG)
77375+{
77376+
77377+ assert("zam-499", b != NULL);
77378+ assert("zam-500", *b != 0);
77379+	assert("zam-501", !reiser4_blocknr_is_fake(b));
77380+
77381+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
77382+ return 0;
77383+}
77384+
77385+/* free the wandered block locations of a transaction already written in place */
77386+static void dealloc_wmap(struct commit_handle *ch)
77387+{
77388+ assert("zam-696", ch->atom != NULL);
77389+
77390+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
77391+ dealloc_wmap_actor, NULL, 1);
77392+}
77393+
77394+/* helper function for wandered block allocation, which refills the set of
77395+   block numbers needed for wandered blocks */
77396+static int
77397+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
77398+{
77399+ reiser4_blocknr_hint hint;
77400+ int ret;
77401+
77402+ reiser4_block_nr wide_len = count;
77403+
77404+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
77405+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
77406+ reserved allocation area so as to get the best qualities of fixed
77407+ journals? */
77408+	reiser4_blocknr_hint_init(&hint);
77409+ hint.block_stage = BLOCK_GRABBED;
77410+
77411+ ret = reiser4_alloc_blocks(&hint, start, &wide_len,
77412+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
77413+ *len = (int)wide_len;
77414+
77415+ return ret;
77416+}
77417+
77418+/*
77419+ * roll back changes made before issuing BIO in the case of IO error.
77420+ */
77421+static void undo_bio(struct bio *bio)
77422+{
77423+ int i;
77424+
77425+ for (i = 0; i < bio->bi_vcnt; ++i) {
77426+ struct page *pg;
77427+ jnode *node;
77428+
77429+ pg = bio->bi_io_vec[i].bv_page;
77430+		end_page_writeback(pg);
77431+ node = jprivate(pg);
77432+ spin_lock_jnode(node);
77433+ JF_CLR(node, JNODE_WRITEBACK);
77434+ JF_SET(node, JNODE_DIRTY);
77435+ spin_unlock_jnode(node);
77436+ }
77437+ bio_put(bio);
77438+}
77439+
77440+/* put overwrite set back to atom's clean list */
77441+static void put_overwrite_set(struct commit_handle *ch)
77442+{
77443+ jnode *cur;
77444+
77445+ list_for_each_entry(cur, ch->overwrite_set, capture_link)
77446+ jrelse_tail(cur);
77447+}
77448+
77449+/* Count the overwrite set size and grab disk space for wandered block
77450+   allocation. Since we have a separate list for the atom's overwrite set we
77451+   just scan the list and count the bitmap and other non-leaf nodes for whose
77452+   wandered block allocation we have to grab space. */
77453+static int get_overwrite_set(struct commit_handle *ch)
77454+{
77455+ int ret;
77456+ jnode *cur;
77457+ __u64 nr_not_leaves = 0;
77458+#if REISER4_DEBUG
77459+ __u64 nr_formatted_leaves = 0;
77460+ __u64 nr_unformatted_leaves = 0;
77461+#endif
77462+
77463+ assert("zam-697", ch->overwrite_set_size == 0);
77464+
77465+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
77466+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77467+
77468+ while (ch->overwrite_set != &cur->capture_link) {
77469+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
77470+
77471+		/* Count bitmap blocks to get correct statistics on how many
77472+		 * blocks were cleared by the transaction commit. */
77473+ if (jnode_get_type(cur) == JNODE_BITMAP)
77474+ ch->nr_bitmap++;
77475+
77476+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
77477+ || jnode_get_type(cur) == JNODE_BITMAP);
77478+
77479+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
77480+ /* we replace fake znode by another (real)
77481+ znode which is suggested by disk_layout
77482+ plugin */
77483+
77484+ /* FIXME: it looks like fake znode should be
77485+ replaced by jnode supplied by
77486+ disk_layout. */
77487+
77488+ struct super_block *s = reiser4_get_current_sb();
77489+ reiser4_super_info_data *sbinfo =
77490+ get_current_super_private();
77491+
77492+ if (sbinfo->df_plug->log_super) {
77493+ jnode *sj = sbinfo->df_plug->log_super(s);
77494+
77495+ assert("zam-593", sj != NULL);
77496+
77497+ if (IS_ERR(sj))
77498+ return PTR_ERR(sj);
77499+
77500+ spin_lock_jnode(sj);
77501+ JF_SET(sj, JNODE_OVRWR);
77502+ insert_into_atom_ovrwr_list(ch->atom, sj);
77503+ spin_unlock_jnode(sj);
77504+
77505+ /* jload it as the rest of overwrite set */
77506+				jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
77507+
77508+ ch->overwrite_set_size++;
77509+ }
77510+ spin_lock_jnode(cur);
77511+			reiser4_uncapture_block(cur);
77512+ jput(cur);
77513+
77514+ } else {
77515+ int ret;
77516+ ch->overwrite_set_size++;
77517+			ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
77518+ if (ret)
77519+ reiser4_panic("zam-783",
77520+ "cannot load e-flushed jnode back (ret = %d)\n",
77521+ ret);
77522+ }
77523+
77524+ /* Count not leaves here because we have to grab disk space
77525+ * for wandered blocks. They were not counted as "flush
77526+ * reserved". Counting should be done _after_ nodes are pinned
77527+ * into memory by jload(). */
77528+ if (!jnode_is_leaf(cur))
77529+ nr_not_leaves++;
77530+ else {
77531+#if REISER4_DEBUG
77532+ /* at this point @cur either has JNODE_FLUSH_RESERVED
77533+ * or is eflushed. Locking is not strong enough to
77534+ * write an assertion checking for this. */
77535+ if (jnode_is_znode(cur))
77536+ nr_formatted_leaves++;
77537+ else
77538+ nr_unformatted_leaves++;
77539+#endif
77540+ JF_CLR(cur, JNODE_FLUSH_RESERVED);
77541+ }
77542+
77543+ cur = next;
77544+ }
77545+
77546+ /* Grab space for writing (wandered blocks) of not leaves found in
77547+ * overwrite set. */
77548+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
77549+ if (ret)
77550+ return ret;
77551+
77552+ /* Disk space for allocation of wandered blocks of leaf nodes already
77553+ * reserved as "flush reserved", move it to grabbed space counter. */
77554+ spin_lock_atom(ch->atom);
77555+ assert("zam-940",
77556+ nr_formatted_leaves + nr_unformatted_leaves <=
77557+ ch->atom->flush_reserved);
77558+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
77559+ spin_unlock_atom(ch->atom);
77560+
77561+ return ch->overwrite_set_size;
77562+}
77563+
77564+/**
77565+ * write_jnodes_to_disk_extent - submit write request
77567+ * @first: first jnode of the list
77568+ * @nr: number of jnodes on the list
77569+ * @block_p:
77570+ * @fq:
77571+ * @flags: used to decide whether page is to get PG_reclaim flag
77572+ *
77573+ * Submits a write request for @nr jnodes beginning from the @first, other
77574+ * jnodes are after the @first on the double-linked "capture" list. All jnodes
77575+ * will be written to the disk region of @nr blocks starting with @block_p block
77576+ * number. If @fq is not NULL it means that waiting for i/o completion will be
77577+ * done more efficiently by using flush_queue_t objects.
77578+ * This function is the one which writes list of jnodes in batch mode. It does
77579+ * all low-level things as bio construction and page states manipulation.
77580+ *
77581+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
77582+ * aggregated in this function instead of being left to the layers below
77583+ *
77584+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
77585+ * Why that layer needed? Why BIOs cannot be constructed here?
77586+ */
77587+static int write_jnodes_to_disk_extent(
77588+ jnode *first, int nr, const reiser4_block_nr *block_p,
77589+ flush_queue_t *fq, int flags)
77590+{
77591+ struct super_block *super = reiser4_get_current_sb();
77592+ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
77593+ int max_blocks;
77594+ jnode *cur = first;
77595+ reiser4_block_nr block;
77596+
77597+ assert("zam-571", first != NULL);
77598+ assert("zam-572", block_p != NULL);
77599+ assert("zam-570", nr > 0);
77600+
77601+ block = *block_p;
77602+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
77603+
77604+ while (nr > 0) {
77605+ struct bio *bio;
77606+ int nr_blocks = min(nr, max_blocks);
77607+ int i;
77608+ int nr_used;
77609+
77610+ bio = bio_alloc(GFP_NOIO, nr_blocks);
77611+ if (!bio)
77612+ return RETERR(-ENOMEM);
77613+
77614+ bio->bi_bdev = super->s_bdev;
77615+ bio->bi_sector = block * (super->s_blocksize >> 9);
77616+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
77617+ struct page *pg;
77618+
77619+ pg = jnode_page(cur);
77620+ assert("zam-573", pg != NULL);
77621+
77622+ page_cache_get(pg);
77623+
77624+ lock_and_wait_page_writeback(pg);
77625+
77626+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
77627+ /*
77628+				 * underlying device is saturated. Stop adding
77629+				 * pages to the bio.
77630+ */
77631+ unlock_page(pg);
77632+ page_cache_release(pg);
77633+ break;
77634+ }
77635+
77636+ spin_lock_jnode(cur);
77637+ assert("nikita-3166",
77638+ pg->mapping == jnode_get_mapping(cur));
77639+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
77640+#if REISER4_DEBUG
77641+ spin_lock(&cur->load);
77642+ assert("nikita-3165", !jnode_is_releasable(cur));
77643+ spin_unlock(&cur->load);
77644+#endif
77645+ JF_SET(cur, JNODE_WRITEBACK);
77646+ JF_CLR(cur, JNODE_DIRTY);
77647+ ON_DEBUG(cur->written++);
77648+ spin_unlock_jnode(cur);
77649+
77650+ ClearPageError(pg);
77651+ set_page_writeback(pg);
77652+
77653+ if (get_current_context()->entd) {
77654+ /* this is ent thread */
77655+ entd_context *ent = get_entd_context(super);
77656+ struct wbq *rq, *next;
77657+
77658+ spin_lock(&ent->guard);
77659+
77660+ if (pg == ent->cur_request->page) {
77661+ /*
77662+ * entd is called for this page. This
77663+				 * request is not in the todo list
77664+ */
77665+ ent->cur_request->written = 1;
77666+ } else {
77667+				 * if we have written a page for which writepage
77668+				 * was called, move the request to another list.
77669+ * is called for - move request to another list.
77670+ */
77671+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
77672+ assert("", rq->magic == WBQ_MAGIC);
77673+ if (pg == rq->page) {
77674+ /*
77675+ * remove request from
77676+ * entd's queue, but do
77677+ * not wake up a thread
77678+ * which put this
77679+ * request
77680+ */
77681+ list_del_init(&rq->link);
77682+ ent->nr_todo_reqs --;
77683+ list_add_tail(&rq->link, &ent->done_list);
77684+ ent->nr_done_reqs ++;
77685+ rq->written = 1;
77686+ break;
77687+ }
77688+ }
77689+ }
77690+ spin_unlock(&ent->guard);
77691+ }
77692+
77693+ clear_page_dirty_for_io(pg);
77694+
77695+ unlock_page(pg);
77696+
77697+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77698+ nr_used++;
77699+ }
77700+ if (nr_used > 0) {
77701+ assert("nikita-3453",
77702+ bio->bi_size == super->s_blocksize * nr_used);
77703+ assert("nikita-3454", bio->bi_vcnt == nr_used);
77704+
77705+ /* Check if we are allowed to write at all */
77706+ if (super->s_flags & MS_RDONLY)
77707+ undo_bio(bio);
77708+ else {
77709+ int not_supported;
77710+
77711+ add_fq_to_bio(fq, bio);
77712+ bio_get(bio);
77713+ reiser4_submit_bio(write_op, bio);
77714+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
77715+ bio_put(bio);
77716+ if (not_supported)
77717+ return -EOPNOTSUPP;
77718+ }
77719+
77720+ block += nr_used - 1;
77721+ update_blocknr_hint_default(super, &block);
77722+ block += 1;
77723+ } else {
77724+ bio_put(bio);
77725+ }
77726+ nr -= nr_used;
77727+ }
77728+
77729+ return 0;
77730+}
77731+
77732+/* This is a procedure which recovers contiguous sequences of disk block
77733+   numbers in the given list of jnodes and submits write requests on this
77734+   per-sequence basis */
77735+int
77736+write_jnode_list(struct list_head *head, flush_queue_t *fq,
77737+ long *nr_submitted, int flags)
77738+{
77739+ int ret;
77740+ jnode *beg = list_entry(head->next, jnode, capture_link);
77741+
77742+ while (head != &beg->capture_link) {
77743+ int nr = 1;
77744+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
77745+
77746+ while (head != &cur->capture_link) {
77747+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
77748+ break;
77749+ ++nr;
77750+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77751+ }
77752+
77753+ ret = write_jnodes_to_disk_extent(
77754+ beg, nr, jnode_get_block(beg), fq, flags);
77755+ if (ret)
77756+ return ret;
77757+
77758+ if (nr_submitted)
77759+ *nr_submitted += nr;
77760+
77761+ beg = cur;
77762+ }
77763+
77764+ return 0;
77765+}
77766+
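write_jnode_list() above submits one request per maximal run of consecutive block numbers. A minimal user-space sketch of that coalescing idea (not part of the patch; the jnode capture list is reduced to a sorted array of block numbers and bio submission is replaced by a printf):

#include <stdio.h>

/* group consecutive block numbers and "submit" one request per run */
int main(void)
{
	unsigned long blocks[] = { 100, 101, 102, 200, 201, 300 };
	int n = sizeof(blocks) / sizeof(blocks[0]);
	int beg = 0;

	while (beg < n) {
		int nr = 1;

		while (beg + nr < n && blocks[beg + nr] == blocks[beg] + nr)
			++nr;
		printf("submit extent: start=%lu len=%d\n", blocks[beg], nr);
		beg += nr;	/* the next run starts where this one ended */
	}
	return 0;
}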
77767+/* add given wandered mapping to atom's wandered map */
77768+static int
77769+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
77770+{
77771+ int ret;
77772+ blocknr_set_entry *new_bsep = NULL;
77773+ reiser4_block_nr block;
77774+
77775+ txn_atom *atom;
77776+
77777+ assert("zam-568", block_p != NULL);
77778+ block = *block_p;
77779+ assert("zam-569", len > 0);
77780+
77781+ while ((len--) > 0) {
77782+ do {
77783+ atom = get_current_atom_locked();
77784+ assert("zam-536",
77785+ !reiser4_blocknr_is_fake(jnode_get_block(cur)));
77786+ ret =
77787+ blocknr_set_add_pair(atom, &atom->wandered_map,
77788+ &new_bsep,
77789+ jnode_get_block(cur), &block);
77790+ } while (ret == -E_REPEAT);
77791+
77792+ if (ret) {
77793+ /* deallocate blocks which were not added to wandered
77794+ map */
77795+ reiser4_block_nr wide_len = len;
77796+
77797+ reiser4_dealloc_blocks(&block, &wide_len,
77798+ BLOCK_NOT_COUNTED,
77799+ BA_FORMATTED
77800+ /* formatted, without defer */ );
77801+
77802+ return ret;
77803+ }
77804+
77805+ spin_unlock_atom(atom);
77806+
77807+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77808+ ++block;
77809+ }
77810+
77811+ return 0;
77812+}
77813+
77814+/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
77815+ submit IO for the allocated blocks. We assume the current atom is in a stage
77816+ where atom fusion is impossible, so it is safe to keep the atom unlocked. */
77817+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
77818+{
77819+ reiser4_block_nr block;
77820+
77821+ int rest;
77822+ int len;
77823+ int ret;
77824+
77825+ jnode *cur;
77826+
77827+ assert("zam-534", ch->overwrite_set_size > 0);
77828+
77829+ rest = ch->overwrite_set_size;
77830+
77831+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
77832+ while (ch->overwrite_set != &cur->capture_link) {
77833+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
77834+
77835+ ret = get_more_wandered_blocks(rest, &block, &len);
77836+ if (ret)
77837+ return ret;
77838+
77839+ rest -= len;
77840+
77841+ ret = add_region_to_wmap(cur, len, &block);
77842+ if (ret)
77843+ return ret;
77844+
77845+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
77846+ if (ret)
77847+ return ret;
77848+
77849+ while ((len--) > 0) {
77850+ assert("zam-604",
77851+ ch->overwrite_set != &cur->capture_link);
77852+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77853+ }
77854+ }
77855+
77856+ return 0;
77857+}
77858+
77859+/* allocate the given number of nodes over the journal area, link them into
77860+ the commit handle's tx_list, then format and submit the wander records */
77861+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
77862+{
77863+ reiser4_blocknr_hint hint;
77864+ reiser4_block_nr allocated = 0;
77865+ reiser4_block_nr first, len;
77866+ jnode *cur;
77867+ jnode *txhead;
77868+ int ret;
77869+ reiser4_context *ctx;
77870+ reiser4_super_info_data *sbinfo;
77871+
77872+ assert("zam-698", ch->tx_size > 0);
77873+ assert("zam-699", list_empty_careful(&ch->tx_list));
77874+
77875+ ctx = get_current_context();
77876+ sbinfo = get_super_private(ctx->super);
77877+
77878+ while (allocated < (unsigned)ch->tx_size) {
77879+ len = (ch->tx_size - allocated);
77880+
77881+ reiser4_blocknr_hint_init(&hint);
77882+
77883+ hint.block_stage = BLOCK_GRABBED;
77884+
77885+ /* FIXME: there should be some block allocation policy for
77886+ nodes which contain wander records */
77887+
77888+ /* We assume that disk space for wandered record blocks can be
77889+ * taken from reserved area. */
77890+ ret = reiser4_alloc_blocks(&hint, &first, &len,
77891+ BA_FORMATTED | BA_RESERVED |
77892+ BA_USE_DEFAULT_SEARCH_START);
77893+ reiser4_blocknr_hint_done(&hint);
77894+
77895+ if (ret)
77896+ return ret;
77897+
77898+ allocated += len;
77899+
77900+ /* create jnodes for all wander records */
77901+ while (len--) {
77902+ cur = reiser4_alloc_io_head(&first);
77903+
77904+ if (cur == NULL) {
77905+ ret = RETERR(-ENOMEM);
77906+ goto free_not_assigned;
77907+ }
77908+
77909+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
77910+
77911+ if (ret != 0) {
77912+ jfree(cur);
77913+ goto free_not_assigned;
77914+ }
77915+
77916+ pin_jnode_data(cur);
77917+
77918+ list_add_tail(&cur->capture_link, &ch->tx_list);
77919+
77920+ first++;
77921+ }
77922+ }
77923+
77924+ { /* format an on-disk linked list of wander records */
77925+ int serial = 1;
77926+
77927+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77928+ format_tx_head(ch);
77929+
77930+ cur = list_entry(txhead->capture_link.next, jnode, capture_link);
77931+ while (&ch->tx_list != &cur->capture_link) {
77932+ format_wander_record(ch, cur, serial++);
77933+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77934+ }
77935+ }
77936+
77937+ { /* Fill wander records with Wandered Set */
77938+ struct store_wmap_params params;
77939+ txn_atom *atom;
77940+
77941+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
77942+
77943+ params.idx = 0;
77944+ params.capacity =
77945+ wander_record_capacity(reiser4_get_current_sb());
77946+
77947+ atom = get_current_atom_locked();
77948+ blocknr_set_iterator(atom, &atom->wandered_map,
77949+ &store_wmap_actor, &params, 0);
77950+ spin_unlock_atom(atom);
77951+ }
77952+
77953+ { /* jrelse all jnodes from tx_list */
77954+ cur = list_entry(ch->tx_list.next, jnode, capture_link);
77955+ while (&ch->tx_list != &cur->capture_link) {
77956+ jrelse(cur);
77957+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
77958+ }
77959+ }
77960+
77961+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
77962+
77963+ return ret;
77964+
77965+ free_not_assigned:
77966+ /* We deallocate blocks not yet assigned to jnodes on tx_list. The
77967+ caller takes care of invalidating the tx list */
77968+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
77969+
77970+ return ret;
77971+}
77972+
77973+static int commit_tx(struct commit_handle *ch)
77974+{
77975+ flush_queue_t *fq;
77976+ int barrier;
77977+ int ret;
77978+
77979+ /* Grab more space for wandered records. */
77980+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
77981+ if (ret)
77982+ return ret;
77983+
77984+ fq = get_fq_for_current_atom();
77985+ if (IS_ERR(fq))
77986+ return PTR_ERR(fq);
77987+
77988+ spin_unlock_atom(fq->atom);
77989+ do {
77990+ ret = alloc_wandered_blocks(ch, fq);
77991+ if (ret)
77992+ break;
77993+ ret = alloc_tx(ch, fq);
77994+ if (ret)
77995+ break;
77996+ } while (0);
77997+
77998+ reiser4_fq_put(fq);
77999+ if (ret)
78000+ return ret;
78001+ repeat_wo_barrier:
78002+ barrier = reiser4_use_write_barrier(ch->super);
78003+ if (!barrier) {
78004+ ret = current_atom_finish_all_fq();
78005+ if (ret)
78006+ return ret;
78007+ }
78008+ ret = update_journal_header(ch, barrier);
78009+ if (barrier) {
78010+ if (ret) {
78011+ if (ret == -EOPNOTSUPP) {
78012+ disable_write_barrier(ch->super);
78013+ goto repeat_wo_barrier;
78014+ }
78015+ return ret;
78016+ }
78017+ ret = current_atom_finish_all_fq();
78018+ }
78019+ return ret;
78020+}
78021+
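commit_tx() above and write_tx_back() below share the same retry shape around write barriers: try the journal update with a barrier, and if the device answers -EOPNOTSUPP, disable barriers and redo the update after explicitly waiting for in-flight IO. A self-contained sketch of just that control flow; the stubs are hypothetical stand-ins for reiser4_use_write_barrier(), disable_write_barrier(), current_atom_finish_all_fq() and update_journal_header()/update_journal_footer(), and the one-shot EOPNOTSUPP "device" is made up to exercise the fallback:

#include <errno.h>
#include <stdio.h>

/* hypothetical stand-ins; the "device" rejects barriers once */
static int barriers_enabled = 1;
static int use_barrier(void) { return barriers_enabled; }
static void disable_barrier(void) { barriers_enabled = 0; }
static int wait_all_io(void) { return 0; }
static int do_update(int barrier) { return barrier ? -EOPNOTSUPP : 0; }

static int update_with_fallback(void)
{
	int barrier, ret;
repeat_wo_barrier:
	barrier = use_barrier();
	if (!barrier) {
		/* no barrier: wait for in-flight IO before the update */
		ret = wait_all_io();
		if (ret)
			return ret;
	}
	ret = do_update(barrier);
	if (barrier) {
		if (ret == -EOPNOTSUPP) {
			/* device cannot do barriers: turn them off, redo */
			disable_barrier();
			goto repeat_wo_barrier;
		}
		if (!ret)
			ret = wait_all_io();
	}
	return ret;
}

int main(void)
{
	printf("update returned %d\n", update_with_fallback());
	return 0;
}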
78022+static int write_tx_back(struct commit_handle * ch)
78023+{
78024+ flush_queue_t *fq;
78025+ int ret;
78026+ int barrier;
78027+
78028+ reiser4_post_commit_hook();
78029+ fq = get_fq_for_current_atom();
78030+ if (IS_ERR(fq))
78031+ return PTR_ERR(fq);
78032+ spin_unlock_atom(fq->atom);
78033+ ret = write_jnode_list(
78034+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78035+ reiser4_fq_put(fq);
78036+ if (ret)
78037+ return ret;
78038+ repeat_wo_barrier:
78039+ barrier = reiser4_use_write_barrier(ch->super);
78040+ if (!barrier) {
78041+ ret = current_atom_finish_all_fq();
78042+ if (ret)
78043+ return ret;
78044+ }
78045+ ret = update_journal_footer(ch, barrier);
78046+ if (barrier) {
78047+ if (ret) {
78048+ if (ret == -EOPNOTSUPP) {
78049+ disable_write_barrier(ch->super);
78050+ goto repeat_wo_barrier;
78051+ }
78052+ return ret;
78053+ }
78054+ ret = current_atom_finish_all_fq();
78055+ }
78056+ if (ret)
78057+ return ret;
78058+ reiser4_post_write_back_hook();
78059+ return 0;
78060+}
78061+
78062+/* We assume that at this moment all captured blocks are marked as RELOC or
78063+ WANDER (belong to the Relocate or Overwrite set), and that all nodes from
78064+ the Relocate set have already been submitted for write.
78065+*/
78066+
78067+int reiser4_write_logs(long *nr_submitted)
78068+{
78069+ txn_atom *atom;
78070+ struct super_block *super = reiser4_get_current_sb();
78071+ reiser4_super_info_data *sbinfo = get_super_private(super);
78072+ struct commit_handle ch;
78073+ int ret;
78074+
78075+ writeout_mode_enable();
78076+
78077+ /* block allocator may add j-nodes to the clean_list */
78078+ ret = reiser4_pre_commit_hook();
78079+ if (ret)
78080+ return ret;
78081+
78082+ /* No locks are required if we take an atom whose stage is >=
78083+ * ASTAGE_PRE_COMMIT */
78084+ atom = get_current_context()->trans->atom;
78085+ assert("zam-965", atom != NULL);
78086+
78087+ /* relocate set is on the atom->clean_nodes list after
78088+ * current_atom_complete_writes() finishes. It can be safely
78089+ * uncaptured after commit_mutex is locked, because any atom that
78090+ * captures these nodes is guaranteed to commit after current one.
78091+ *
78092+ * This can only be done after reiser4_pre_commit_hook(), because it is where
78093+ * early flushed jnodes with CREATED bit are transferred to the
78094+ * overwrite list. */
78095+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
78096+ spin_lock_atom(atom);
78097+ /* There might be waiters for the relocate nodes which we have
78098+ * released, wake them up. */
78099+ reiser4_atom_send_event(atom);
78100+ spin_unlock_atom(atom);
78101+
78102+ if (REISER4_DEBUG) {
78103+ int level;
78104+
78105+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78106+ assert("nikita-3352",
78107+ list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78108+ }
78109+
78110+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78111+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78112+
78113+ init_commit_handle(&ch, atom);
78114+
78115+ ch.free_blocks = sbinfo->blocks_free_committed;
78116+ ch.nr_files = sbinfo->nr_files_committed;
78117+ /* ZAM-FIXME-HANS: email me what the contention level is for the super
78118+ * lock. */
78119+ ch.next_oid = oid_next(super);
78120+
78121+ /* count overwrite set and place it in a separate list */
78122+ ret = get_overwrite_set(&ch);
78123+
78124+ if (ret <= 0) {
78125+ /* It is possible that the overwrite set is empty here; it means
78126+ all captured nodes are clean */
78127+ goto up_and_ret;
78128+ }
78129+
78130+ /* Inform the caller how many dirty pages will be
78131+ * submitted to disk. */
78132+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78133+
78134+ /* count all records needed for storing of the wandered set */
78135+ get_tx_size(&ch);
78136+
78137+ ret = commit_tx(&ch);
78138+ if (ret)
78139+ goto up_and_ret;
78140+
78141+ spin_lock_atom(atom);
78142+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
78143+ spin_unlock_atom(atom);
78144+
78145+ ret = write_tx_back(&ch);
78146+ reiser4_post_write_back_hook();
78147+
78148+ up_and_ret:
78149+ if (ret) {
78150+ /* there could be fq attached to current atom; the only way to
78151+ remove them is: */
78152+ current_atom_finish_all_fq();
78153+ }
78154+
78155+ /* free blocks of flushed transaction */
78156+ dealloc_tx_list(&ch);
78157+ dealloc_wmap(&ch);
78158+
78159+ put_overwrite_set(&ch);
78160+
78161+ done_commit_handle(&ch);
78162+
78163+ writeout_mode_disable();
78164+
78165+ return ret;
78166+}
78167+
78168+/* consistency checks for journal data/control blocks: header, footer, log
78169+ records, transactions head blocks. All functions return zero on success. */
78170+
78171+static int check_journal_header(const jnode * node UNUSED_ARG)
78172+{
78173+ /* FIXME: journal header has no magic field yet. */
78174+ return 0;
78175+}
78176+
78177+/* wait for write completion for all jnodes from given list */
78178+static int wait_on_jnode_list(struct list_head *head)
78179+{
78180+ jnode *scan;
78181+ int ret = 0;
78182+
78183+ list_for_each_entry(scan, head, capture_link) {
78184+ struct page *pg = jnode_page(scan);
78185+
78186+ if (pg) {
78187+ if (PageWriteback(pg))
78188+ wait_on_page_writeback(pg);
78189+
78190+ if (PageError(pg))
78191+ ret++;
78192+ }
78193+ }
78194+
78195+ return ret;
78196+}
78197+
78198+static int check_journal_footer(const jnode * node UNUSED_ARG)
78199+{
78200+ /* FIXME: journal footer has no magic field yet. */
78201+ return 0;
78202+}
78203+
78204+static int check_tx_head(const jnode * node)
78205+{
78206+ struct tx_header *header = (struct tx_header *)jdata(node);
78207+
78208+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
78209+ warning("zam-627", "tx head at block %s corrupted\n",
78210+ sprint_address(jnode_get_block(node)));
78211+ return RETERR(-EIO);
78212+ }
78213+
78214+ return 0;
78215+}
78216+
78217+static int check_wander_record(const jnode * node)
78218+{
78219+ struct wander_record_header *RH =
78220+ (struct wander_record_header *)jdata(node);
78221+
78222+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
78223+ 0) {
78224+ warning("zam-628", "wander record at block %s corrupted\n",
78225+ sprint_address(jnode_get_block(node)));
78226+ return RETERR(-EIO);
78227+ }
78228+
78229+ return 0;
78230+}
78231+
78232+/* fill the commit_handle structure with everything needed for update_journal_footer */
78233+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
78234+{
78235+ struct tx_header *TXH;
78236+ int ret;
78237+
78238+ ret = jload(tx_head);
78239+ if (ret)
78240+ return ret;
78241+
78242+ TXH = (struct tx_header *)jdata(tx_head);
78243+
78244+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
78245+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
78246+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
78247+
78248+ jrelse(tx_head);
78249+
78250+ list_add(&tx_head->capture_link, &ch->tx_list);
78251+
78252+ return 0;
78253+}
78254+
78255+/* replay one transaction: restore and write overwrite set in place */
78256+static int replay_transaction(const struct super_block *s,
78257+ jnode * tx_head,
78258+ const reiser4_block_nr * log_rec_block_p,
78259+ const reiser4_block_nr * end_block,
78260+ unsigned int nr_wander_records)
78261+{
78262+ reiser4_block_nr log_rec_block = *log_rec_block_p;
78263+ struct commit_handle ch;
78264+ LIST_HEAD(overwrite_set);
78265+ jnode *log;
78266+ int ret;
78267+
78268+ init_commit_handle(&ch, NULL);
78269+ ch.overwrite_set = &overwrite_set;
78270+
78271+ restore_commit_handle(&ch, tx_head);
78272+
78273+ while (log_rec_block != *end_block) {
78274+ struct wander_record_header *header;
78275+ struct wander_entry *entry;
78276+
78277+ int i;
78278+
78279+ if (nr_wander_records == 0) {
78280+ warning("zam-631",
78281+ "number of wander records in the linked list"
78282+ " is greater than the number stored in the tx head.\n");
78283+ ret = RETERR(-EIO);
78284+ goto free_ow_set;
78285+ }
78286+
78287+ log = reiser4_alloc_io_head(&log_rec_block);
78288+ if (log == NULL)
78289+ return RETERR(-ENOMEM);
78290+
78291+ ret = jload(log);
78292+ if (ret < 0) {
78293+ reiser4_drop_io_head(log);
78294+ return ret;
78295+ }
78296+
78297+ ret = check_wander_record(log);
78298+ if (ret) {
78299+ jrelse(log);
78300+ reiser4_drop_io_head(log);
78301+ return ret;
78302+ }
78303+
78304+ header = (struct wander_record_header *)jdata(log);
78305+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
78306+
78307+ entry = (struct wander_entry *)(header + 1);
78308+
78309+ /* restore overwrite set from wander record content */
78310+ for (i = 0; i < wander_record_capacity(s); i++) {
78311+ reiser4_block_nr block;
78312+ jnode *node;
78313+
78314+ block = le64_to_cpu(get_unaligned(&entry->wandered));
78315+ if (block == 0)
78316+ break;
78317+
78318+ node = reiser4_alloc_io_head(&block);
78319+ if (node == NULL) {
78320+ ret = RETERR(-ENOMEM);
78321+ /*
78322+ * FIXME-VS:???
78323+ */
78324+ jrelse(log);
78325+ reiser4_drop_io_head(log);
78326+ goto free_ow_set;
78327+ }
78328+
78329+ ret = jload(node);
78330+
78331+ if (ret < 0) {
78332+ reiser4_drop_io_head(node);
78333+ /*
78334+ * FIXME-VS:???
78335+ */
78336+ jrelse(log);
78337+ reiser4_drop_io_head(log);
78338+ goto free_ow_set;
78339+ }
78340+
78341+ block = le64_to_cpu(get_unaligned(&entry->original));
78342+
78343+ assert("zam-603", block != 0);
78344+
78345+ jnode_set_block(node, &block);
78346+
78347+ list_add_tail(&node->capture_link, ch.overwrite_set);
78348+
78349+ ++entry;
78350+ }
78351+
78352+ jrelse(log);
78353+ reiser4_drop_io_head(log);
78354+
78355+ --nr_wander_records;
78356+ }
78357+
78358+ if (nr_wander_records != 0) {
78359+ warning("zam-632", "number of wander records in the linked list"
78360+ " is less than the number stored in the tx head.\n");
78361+ ret = RETERR(-EIO);
78362+ goto free_ow_set;
78363+ }
78364+
78365+ { /* write wandered set in place */
78366+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
78367+ ret = wait_on_jnode_list(ch.overwrite_set);
78368+
78369+ if (ret) {
78370+ ret = RETERR(-EIO);
78371+ goto free_ow_set;
78372+ }
78373+ }
78374+
78375+ ret = update_journal_footer(&ch, 0);
78376+
78377+ free_ow_set:
78378+
78379+ while (!list_empty(ch.overwrite_set)) {
78380+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
78381+ list_del_init(&cur->capture_link);
78382+ jrelse(cur);
78383+ reiser4_drop_io_head(cur);
78384+ }
78385+
78386+ list_del_init(&tx_head->capture_link);
78387+
78388+ done_commit_handle(&ch);
78389+
78390+ return ret;
78391+}
78392+
78393+/* find the oldest committed but not yet replayed transaction and replay it.
78394+ * The transaction was committed and the journal header block was updated, but
78395+ * writing the atom's overwrite set in place and updating the journal footer
78396+ * block were not completed. This function completes the process by recovering
78397+ * the atom's overwrite set from its wandered locations, writing it in place
78398+ * and updating the journal footer. */
78399+static int replay_oldest_transaction(struct super_block *s)
78400+{
78401+ reiser4_super_info_data *sbinfo = get_super_private(s);
78402+ jnode *jf = sbinfo->journal_footer;
78403+ unsigned int total;
78404+ struct journal_footer *F;
78405+ struct tx_header *T;
78406+
78407+ reiser4_block_nr prev_tx;
78408+ reiser4_block_nr last_flushed_tx;
78409+ reiser4_block_nr log_rec_block = 0;
78410+
78411+ jnode *tx_head;
78412+
78413+ int ret;
78414+
78415+ if ((ret = jload(jf)) < 0)
78416+ return ret;
78417+
78418+ F = (struct journal_footer *)jdata(jf);
78419+
78420+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
78421+
78422+ jrelse(jf);
78423+
78424+ if (sbinfo->last_committed_tx == last_flushed_tx) {
78425+ /* all transactions are replayed */
78426+ return 0;
78427+ }
78428+
78429+ prev_tx = sbinfo->last_committed_tx;
78430+
78431+ /* search for the oldest not yet flushed transaction */
78432+ while (1) {
78433+ tx_head = reiser4_alloc_io_head(&prev_tx);
78434+ if (!tx_head)
78435+ return RETERR(-ENOMEM);
78436+
78437+ ret = jload(tx_head);
78438+ if (ret < 0) {
78439+ reiser4_drop_io_head(tx_head);
78440+ return ret;
78441+ }
78442+
78443+ ret = check_tx_head(tx_head);
78444+ if (ret) {
78445+ jrelse(tx_head);
78446+ reiser4_drop_io_head(tx_head);
78447+ return ret;
78448+ }
78449+
78450+ T = (struct tx_header *)jdata(tx_head);
78451+
78452+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
78453+
78454+ if (prev_tx == last_flushed_tx)
78455+ break;
78456+
78457+ jrelse(tx_head);
78458+ reiser4_drop_io_head(tx_head);
78459+ }
78460+
78461+ total = le32_to_cpu(get_unaligned(&T->total));
78462+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
78463+
78464+ pin_jnode_data(tx_head);
78465+ jrelse(tx_head);
78466+
78467+ ret =
78468+ replay_transaction(s, tx_head, &log_rec_block,
78469+ jnode_get_block(tx_head), total - 1);
78470+
78471+ unpin_jnode_data(tx_head);
78472+ reiser4_drop_io_head(tx_head);
78473+
78474+ if (ret)
78475+ return ret;
78476+ return -E_REPEAT;
78477+}
78478+
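replay_oldest_transaction() above locates its victim by walking the on-disk back-links (prev_tx) from the newest committed transaction toward the one the journal footer already names as flushed. A user-space sketch of the same walk, with tx heads reduced to {block, prev_tx} pairs and all block numbers made up:

#include <stdio.h>

struct tx { unsigned long block, prev_tx; };

int main(void)
{
	/* three committed but unflushed transactions: 50 <- 60 <- 70 */
	struct tx txs[] = { { 50, 40 }, { 60, 50 }, { 70, 60 } };
	unsigned long last_committed = 70, last_flushed = 40;
	unsigned long cur = last_committed;
	int i;

	for (;;) {
		struct tx *t = NULL;

		for (i = 0; i < 3; i++)	/* stands in for reading a tx head */
			if (txs[i].block == cur)
				t = &txs[i];
		if (!t || t->prev_tx == last_flushed)
			break;	/* cur is the oldest unflushed tx head */
		cur = t->prev_tx;
	}
	printf("replay first: tx head at block %lu\n", cur);
	return 0;
}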
78479+/* The current reiser4 journal implementation is optimized not to capture the
78480+ super block when only certain super block fields are modified. Currently the
78481+ set is (<free block count>, <OID allocator>). These fields are logged in a
78482+ special way: they are stored in each transaction head block at atom commit
78483+ time and written to the journal footer block at atom flush time. To carry
78484+ this info from the journal footer block over to the in-memory super block
78485+ there is a special function, reiser4_journal_recover_sb_data(), which should
78486+ be called after the disk format plugin re-reads the super block once the
78487+ journal has been replayed.
78488+*/
78489+
78490+/* get the information from the journal footer into the in-memory super block */
78491+int reiser4_journal_recover_sb_data(struct super_block *s)
78492+{
78493+ reiser4_super_info_data *sbinfo = get_super_private(s);
78494+ struct journal_footer *jf;
78495+ int ret;
78496+
78497+ assert("zam-673", sbinfo->journal_footer != NULL);
78498+
78499+ ret = jload(sbinfo->journal_footer);
78500+ if (ret != 0)
78501+ return ret;
78502+
78503+ ret = check_journal_footer(sbinfo->journal_footer);
78504+ if (ret != 0)
78505+ goto out;
78506+
78507+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
78508+
78509+ /* was there at least one flushed transaction? */
78510+ if (jf->last_flushed_tx) {
78511+
78512+ /* restore free block counter logged in this transaction */
78513+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
78514+
78515+ /* restore oid allocator state */
78516+ oid_init_allocator(s,
78517+ le64_to_cpu(get_unaligned(&jf->nr_files)),
78518+ le64_to_cpu(get_unaligned(&jf->next_oid)));
78519+ }
78520+ out:
78521+ jrelse(sbinfo->journal_footer);
78522+ return ret;
78523+}
78524+
78525+/* reiser4 replay journal procedure */
78526+int reiser4_journal_replay(struct super_block *s)
78527+{
78528+ reiser4_super_info_data *sbinfo = get_super_private(s);
78529+ jnode *jh, *jf;
78530+ struct journal_header *header;
78531+ int nr_tx_replayed = 0;
78532+ int ret;
78533+
78534+ assert("zam-582", sbinfo != NULL);
78535+
78536+ jh = sbinfo->journal_header;
78537+ jf = sbinfo->journal_footer;
78538+
78539+ if (!jh || !jf) {
78540+ /* it is possible that disk layout does not support journal
78541+ structures; we just warn about this */
78542+ warning("zam-583",
78543+ "journal control blocks were not loaded by disk layout plugin. "
78544+ "journal replaying is not possible.\n");
78545+ return 0;
78546+ }
78547+
78548+ /* Take free block count from journal footer block. The free block
78549+ counter value corresponds to the last flushed transaction state */
78550+ ret = jload(jf);
78551+ if (ret < 0)
78552+ return ret;
78553+
78554+ ret = check_journal_footer(jf);
78555+ if (ret) {
78556+ jrelse(jf);
78557+ return ret;
78558+ }
78559+
78560+ jrelse(jf);
78561+
78562+ /* store last committed transaction info in reiser4 in-memory super
78563+ block */
78564+ ret = jload(jh);
78565+ if (ret < 0)
78566+ return ret;
78567+
78568+ ret = check_journal_header(jh);
78569+ if (ret) {
78570+ jrelse(jh);
78571+ return ret;
78572+ }
78573+
78574+ header = (struct journal_header *)jdata(jh);
78575+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
78576+
78577+ jrelse(jh);
78578+
78579+ /* replay committed transactions */
78580+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
78581+ nr_tx_replayed++;
78582+
78583+ return ret;
78584+}
78585+
78586+/* load journal control block (either journal header or journal footer block) */
78587+static int
78588+load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
78589+{
78590+ int ret;
78591+
78592+ *node = reiser4_alloc_io_head(block);
78593+ if (!(*node))
78594+ return RETERR(-ENOMEM);
78595+
78596+ ret = jload(*node);
78597+
78598+ if (ret) {
78599+ reiser4_drop_io_head(*node);
78600+ *node = NULL;
78601+ return ret;
78602+ }
78603+
78604+ pin_jnode_data(*node);
78605+ jrelse(*node);
78606+
78607+ return 0;
78608+}
78609+
78610+/* unload journal header or footer and free jnode */
78611+static void unload_journal_control_block(jnode ** node)
78612+{
78613+ if (*node) {
78614+ unpin_jnode_data(*node);
78615+ reiser4_drop_io_head(*node);
78616+ *node = NULL;
78617+ }
78618+}
78619+
78620+/* release journal control blocks */
78621+void reiser4_done_journal_info(struct super_block *s)
78622+{
78623+ reiser4_super_info_data *sbinfo = get_super_private(s);
78624+
78625+ assert("zam-476", sbinfo != NULL);
78626+
78627+ unload_journal_control_block(&sbinfo->journal_header);
78628+ unload_journal_control_block(&sbinfo->journal_footer);
78629+ rcu_barrier();
78630+}
78631+
78632+/* load journal control blocks */
78633+int reiser4_init_journal_info(struct super_block *s)
78634+{
78635+ reiser4_super_info_data *sbinfo = get_super_private(s);
78636+ journal_location *loc;
78637+ int ret;
78638+
78639+ loc = &sbinfo->jloc;
78640+
78641+ assert("zam-651", loc != NULL);
78642+ assert("zam-652", loc->header != 0);
78643+ assert("zam-653", loc->footer != 0);
78644+
78645+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
78646+
78647+ if (ret)
78648+ return ret;
78649+
78650+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
78651+
78652+ if (ret) {
78653+ unload_journal_control_block(&sbinfo->journal_header);
78654+ }
78655+
78656+ return ret;
78657+}
78658+
78659+/* Make Linus happy.
78660+ Local variables:
78661+ c-indentation-style: "K&R"
78662+ mode-name: "LC"
78663+ c-basic-offset: 8
78664+ tab-width: 8
78665+ fill-column: 80
78666+ End:
78667+*/
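That ends wander.c. As a recap before its header file, a tiny sketch (not patch content) whose only point is the write ordering the code above enforces; journal recovery stays correct exactly because these steps reach stable storage in this order (barriers and flush queues, handled above, are elided):

#include <stdio.h>

int main(void)
{
	puts("1. write overwrite set to wandered (journal) locations");
	puts("2. write tx head + wander records (original<->wandered map)");
	puts("3. update journal header: last_committed_tx = this tx");
	puts("4. write overwrite set in place (original locations)");
	puts("5. update journal footer: last_flushed_tx = this tx");
	return 0;
}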
78668diff -urN linux-2.6.22.orig/fs/reiser4/wander.h linux-2.6.22/fs/reiser4/wander.h
78669--- linux-2.6.22.orig/fs/reiser4/wander.h 1970-01-01 03:00:00.000000000 +0300
78670+++ linux-2.6.22/fs/reiser4/wander.h 2007-07-29 00:25:35.048740996 +0400
78671@@ -0,0 +1,135 @@
78672+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
78673+
78674+#if !defined (__FS_REISER4_WANDER_H__)
78675+#define __FS_REISER4_WANDER_H__
78676+
78677+#include "dformat.h"
78678+
78679+#include <linux/fs.h> /* for struct super_block */
78680+
78681+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
78682+
78683+#define TX_HEADER_MAGIC "TxMagic4"
78684+#define WANDER_RECORD_MAGIC "LogMagc4"
78685+
78686+#define TX_HEADER_MAGIC_SIZE (8)
78687+#define WANDER_RECORD_MAGIC_SIZE (8)
78688+
78689+/* journal header block format */
78690+struct journal_header {
78691+ /* last written transaction head location */
78692+ d64 last_committed_tx;
78693+};
78694+
78695+typedef struct journal_location {
78696+ reiser4_block_nr footer;
78697+ reiser4_block_nr header;
78698+} journal_location;
78699+
78700+/* The wander.c head comment describes usage and semantics of all these structures */
78701+/* journal footer block format */
78702+struct journal_footer {
78703+ /* last flushed transaction location. */
78704+ /* This block number is no longer valid after the transaction it points
78705+ to gets flushed; it is used only at journal replay time to detect
78706+ the end of the on-disk list of committed transactions which were
78707+ not completely flushed */
78708+ d64 last_flushed_tx;
78709+
78710+ /* the free block counter is written to the journal footer at
78711+ transaction flush time, not to the super block, because it is
78712+ logged differently than super block fields (the root pointer,
78713+ for example). */
78714+ d64 free_blocks;
78715+
78716+ /* number of used OIDs and maximal used OID are logged separately from
78717+ super block */
78718+ d64 nr_files;
78719+ d64 next_oid;
78720+};
78721+
78722+/* Each wander record (except the first one) has a unified format: a wander
78723+ record header followed by an array of log entries */
78724+struct wander_record_header {
78725+ /* when there is no predefined location for wander records, this magic
78726+ string should help reiser4fsck. */
78727+ char magic[WANDER_RECORD_MAGIC_SIZE];
78728+
78729+ /* transaction id */
78730+ d64 id;
78731+
78732+ /* total number of wander records in current transaction */
78733+ d32 total;
78734+
78735+ /* this block number in transaction */
78736+ d32 serial;
78737+
78738+ /* location of the next wander record in the commit */
78739+ d64 next_block;
78740+};
78741+
78742+/* The first wander record (transaction head) of written transaction has the
78743+ special format */
78744+struct tx_header {
78745+ /* magic string makes first block in transaction different from other
78746+ logged blocks; it should help fsck. */
78747+ char magic[TX_HEADER_MAGIC_SIZE];
78748+
78749+ /* transaction id */
78750+ d64 id;
78751+
78752+ /* total number of records (including this first tx head) in the
78753+ transaction */
78754+ d32 total;
78755+
78756+ /* align next field to 8-byte boundary; this field always is zero */
78757+ d32 padding;
78758+
78759+ /* block number of previous transaction head */
78760+ d64 prev_tx;
78761+
78762+ /* next wander record location */
78763+ d64 next_block;
78764+
78765+ /* committed versions of free blocks counter */
78766+ d64 free_blocks;
78767+
78768+ /* number of used OIDs (nr_files) and maximal used OID are logged
78769+ separately from super block */
78770+ d64 nr_files;
78771+ d64 next_oid;
78772+};
78773+
78774+/* A transaction gets written to disk as a set of wander records (each wander
78775+ record is one fs block in size) */
78776+
78777+/* As was said above, the rest of a wander record after its header is filled
78778+ with these log entries; unused space is filled with zeroes */
78779+struct wander_entry {
78780+ d64 original; /* block original location */
78781+ d64 wandered; /* block wandered location */
78782+};
78783+
78784+/* REISER4 JOURNAL WRITER FUNCTIONS */
78785+
78786+extern int reiser4_write_logs(long *);
78787+extern int reiser4_journal_replay(struct super_block *);
78788+extern int reiser4_journal_recover_sb_data(struct super_block *);
78789+
78790+extern int reiser4_init_journal_info(struct super_block *);
78791+extern void reiser4_done_journal_info(struct super_block *);
78792+
78793+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
78794+
78795+#endif /* __FS_REISER4_WANDER_H__ */
78796+
78797+/* Make Linus happy.
78798+ Local variables:
78799+ c-indentation-style: "K&R"
78800+ mode-name: "LC"
78801+ c-basic-offset: 8
78802+ tab-width: 8
78803+ fill-column: 80
78804+ scroll-step: 1
78805+ End:
78806+*/
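For a feel for the numbers, a user-space sketch of how many wander entries fit into one record block. The 4096-byte block size is an assumption, the struct sizes mirror the d64/d32 layouts above, and the kernel's own wander_record_capacity() (used in wander.c, defined elsewhere) is assumed to compute essentially this:

#include <stdint.h>
#include <stdio.h>

/* same sizes as struct wander_record_header / struct wander_entry */
struct record_header { char magic[8]; uint64_t id; uint32_t total, serial; uint64_t next_block; };
struct record_entry { uint64_t original, wandered; };

int main(void)
{
	size_t blksize = 4096;	/* assumed fs block size */
	size_t capacity = (blksize - sizeof(struct record_header))
	    / sizeof(struct record_entry);

	/* (4096 - 32) / 16 = 254 entries per wander record */
	printf("entries per wander record: %zu\n", capacity);
	return 0;
}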
78807diff -urN linux-2.6.22.orig/fs/reiser4/writeout.h linux-2.6.22/fs/reiser4/writeout.h
78808--- linux-2.6.22.orig/fs/reiser4/writeout.h 1970-01-01 03:00:00.000000000 +0300
78809+++ linux-2.6.22/fs/reiser4/writeout.h 2007-07-29 00:25:35.052742032 +0400
78810@@ -0,0 +1,21 @@
78811+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
78812+
78813+#if !defined (__FS_REISER4_WRITEOUT_H__)
78814+
78815+#define WRITEOUT_SINGLE_STREAM (0x1)
78816+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
78817+#define WRITEOUT_BARRIER (0x4)
78818+
78819+extern int reiser4_get_writeout_flags(void);
78820+
78821+#endif /* __FS_REISER4_WRITEOUT_H__ */
78822+
78823+/* Make Linus happy.
78824+ Local variables:
78825+ c-indentation-style: "K&R"
78826+ mode-name: "LC"
78827+ c-basic-offset: 8
78828+ tab-width: 8
78829+ fill-column: 80
78830+ End:
78831+*/
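The WRITEOUT_* values above are independent bits meant to be or-combined by callers and tested individually in the writeout path; a trivial sketch (the macro values are copied from the header, everything else is illustrative):

#include <stdio.h>

#define WRITEOUT_SINGLE_STREAM (0x1)
#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
#define WRITEOUT_BARRIER (0x4)

int main(void)
{
	int flags = WRITEOUT_FOR_PAGE_RECLAIM | WRITEOUT_BARRIER;

	if (flags & WRITEOUT_FOR_PAGE_RECLAIM)
		puts("submitting on behalf of page reclaim");
	if (flags & WRITEOUT_BARRIER)
		puts("barrier requested");
	return 0;
}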
78832diff -urN linux-2.6.22.orig/fs/reiser4/znode.c linux-2.6.22/fs/reiser4/znode.c
78833--- linux-2.6.22.orig/fs/reiser4/znode.c 1970-01-01 03:00:00.000000000 +0300
78834+++ linux-2.6.22/fs/reiser4/znode.c 2007-07-29 00:25:35.052742032 +0400
78835@@ -0,0 +1,1029 @@
78836+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
78837+ * reiser4/README */
78838+/* Znode manipulation functions. */
78839+/* Znode is the in-memory header for a tree node. It is stored
78840+ separately from the node itself so that it does not get written to
78841+ disk. In this respect znode is like buffer head or page head. We
78842+ also use znodes for additional reiser4 specific purposes:
78843+
78844+ . they are organized into tree structure which is a part of whole
78845+ reiser4 tree.
78846+ . they are used to implement node grained locking
78847+ . they are used to keep additional state associated with a
78848+ node
78849+ . they contain links to lists used by the transaction manager
78850+
78851+ Znode is attached to some variable "block number" which is instance of
78852+ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
78853+ appropriate node being actually loaded in memory. Existence of znode itself
78854+ is regulated by reference count (->x_count) in it. Each time thread
78855+ acquires reference to znode through call to zget(), ->x_count is
78856+ incremented and decremented on call to zput(). Data (content of node) are
78857+ brought in memory through call to zload(), which also increments ->d_count
78858+ reference counter. zload can block waiting on IO. Call to zrelse()
78859+ decreases this counter. Also, ->c_count keeps track of number of child
78860+ znodes and prevents parent znode from being recycled until all of its
78861+ children are. ->c_count is decremented whenever child goes out of existence
78862+ (being actually recycled in zdestroy()) which can be some time after last
78863+ reference to this child dies if we support some form of LRU cache for
78864+ znodes.
78865+
78866+*/
78867+/* EVERY ZNODE'S STORY
78868+
78869+ 1. His infancy.
78870+
78871+ Once upon a time, the znode was born deep inside of zget() by call to
78872+ zalloc(). At the return from zget() znode had:
78873+
78874+ . reference counter (x_count) of 1
78875+ . assigned block number, marked as used in bitmap
78876+ . pointer to parent znode. Root znode parent pointer points
78877+ to its father: "fake" znode. This, in turn, has NULL parent pointer.
78878+ . hash table linkage
78879+ . no data loaded from disk
78880+ . no node plugin
78881+ . no sibling linkage
78882+
78883+ 2. His childhood
78884+
78885+ Each node is either brought into memory as a result of tree traversal, or
78886+ created afresh, creation of the root being a special case of the latter. In
78887+ either case it's inserted into sibling list. This will typically require
78888+ some ancillary tree traversing, but ultimately both sibling pointers will
78889+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
78890+ zjnode.state.
78891+
78892+ 3. His youth.
78893+
78894+ If znode is bound to already existing node in a tree, its content is read
78895+ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
78896+ in zjnode.state and zdata() function starts to return non null for this
78897+ znode. zload() further calls zparse() that determines which node layout
78898+ this node is rendered in, and sets ->nplug on success.
78899+
78900+ If znode is for new node just created, memory for it is allocated and
78901+ zinit_new() function is called to initialise data, according to selected
78902+ node layout.
78903+
78904+ 4. His maturity.
78905+
78906+ After this point, znode lingers in memory for some time. Threads can
78907+ acquire references to znode either by blocknr through call to zget(), or by
78908+ following a pointer to unallocated znode from internal item. Each time
78909+ reference to znode is obtained, x_count is increased. Thread can read/write
78910+ lock znode. Znode data can be loaded through calls to zload(), d_count will
78911+ be increased appropriately. If all references to znode are released
78912+ (x_count drops to 0), znode is not recycled immediately. Rather, it is
78913+ still cached in the hash table in the hope that it will be accessed
78914+ shortly.
78915+
78916+ There are two ways in which znode existence can be terminated:
78917+
78918+ . sudden death: node bound to this znode is removed from the tree
78919+ . overpopulation: znode is purged out of memory due to memory pressure
78920+
78921+ 5. His death.
78922+
78923+ Death is a complex process.
78924+
78925+ When we irrevocably commit ourselves to decision to remove node from the
78926+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
78927+ znode. This is done either in ->kill_hook() of internal item or in
78928+ reiser4_kill_root() function when tree root is removed.
78929+
78930+ At this moment znode still has:
78931+
78932+ . locks held on it, necessarily write ones
78933+ . references to it
78934+ . disk block assigned to it
78935+ . data loaded from the disk
78936+ . pending requests for lock
78937+
78938+ But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
78939+ deletion. Node deletion includes two phases. First all ways to get
78940+ references to that znode (sibling and parent links and hash lookup using
78941+ block number stored in parent node) should be deleted -- it is done through
78942+ sibling_list_remove(), also we assume that nobody uses down link from
78943+ parent node due to its nonexistence or proper parent node locking and
78944+ nobody uses parent pointers from children due to absence of them. Second we
78945+ invalidate all pending lock requests which still are on znode's lock
78946+ request queue, this is done by reiser4_invalidate_lock(). Another
78947+ JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
78948+ Once it is set, all requesters are forced to return -EINVAL from
78949+ longterm_lock_znode(). Future locking attempts are not possible because all
78950+ ways to get references to that znode are removed already. Last, node is
78951+ uncaptured from transaction.
78952+
78953+ When last reference to the dying znode is just about to be released,
78954+ block number for this lock is released and znode is removed from the
78955+ hash table.
78956+
78957+ Now znode can be recycled.
78958+
78959+ [it's possible to free bitmap block and remove znode from the hash
78960+ table when last lock is released. This will result in having
78961+ referenced but completely orphaned znode]
78962+
78963+ 6. Limbo
78964+
78965+ As have been mentioned above znodes with reference counter 0 are
78966+ still cached in a hash table. Once memory pressure increases they are
78967+ purged out of there [this requires something like LRU list for
78968+ efficient implementation. LRU list would also greatly simplify
78969+ implementation of coord cache that would in this case morph to just
78970+ scanning some initial segment of LRU list]. Data loaded into
78971+ unreferenced znode are flushed back to the durable storage if
78972+ necessary and memory is freed. Znodes themselves can be recycled at
78973+ this point too.
78974+
78975+*/
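A minimal sketch of the reference discipline this story describes, using the functions defined later in this file. This is a usage fragment for illustration only, not patch content or a standalone program; error handling is elided:

	node = zget(tree, &blocknr, parent, level, reiser4_ctx_gfp_mask_get());
	if (!IS_ERR(node)) {		/* ->x_count held from here */
		if (zload(node) == 0) {	/* ->d_count held, data in memory */
			/* ... use zdata(node) ... */
			zrelse(node);	/* drop ->d_count */
		}
		zput(node);		/* drop ->x_count */
	}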
78976+
78977+#include "debug.h"
78978+#include "dformat.h"
78979+#include "key.h"
78980+#include "coord.h"
78981+#include "plugin/plugin_header.h"
78982+#include "plugin/node/node.h"
78983+#include "plugin/plugin.h"
78984+#include "txnmgr.h"
78985+#include "jnode.h"
78986+#include "znode.h"
78987+#include "block_alloc.h"
78988+#include "tree.h"
78989+#include "tree_walk.h"
78990+#include "super.h"
78991+#include "reiser4.h"
78992+
78993+#include <linux/pagemap.h>
78994+#include <linux/spinlock.h>
78995+#include <linux/slab.h>
78996+#include <linux/err.h>
78997+
78998+static z_hash_table *get_htable(reiser4_tree *,
78999+ const reiser4_block_nr * const blocknr);
79000+static z_hash_table *znode_get_htable(const znode *);
79001+static void zdrop(znode *);
79002+
79003+/* hash table support */
79004+
79005+/* compare two block numbers for equality. Used by hash-table macros */
79006+static inline int
79007+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79008+{
79009+ assert("nikita-534", b1 != NULL);
79010+ assert("nikita-535", b2 != NULL);
79011+
79012+ return *b1 == *b2;
79013+}
79014+
79015+/* Hash znode by block number. Used by hash-table macros */
79016+/* Audited by: umka (2002.06.11) */
79017+static inline __u32
79018+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79019+{
79020+ assert("nikita-536", b != NULL);
79021+
79022+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79023+}
79024+
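blknrhashfn() above is just a power-of-two mask on the block number, so the bucket is the low bits. A quick user-space check of where a few blocks land; the table size of 512 is an assumption here (REISER4_ZNODE_HASH_TABLE_SIZE is defined elsewhere):

#include <stdio.h>

int main(void)
{
	unsigned long table_size = 512;	/* assumed table size */
	unsigned long blocks[] = { 1024, 1025, 1537, 519 };
	int i;

	for (i = 0; i < 4; i++)
		printf("block %lu -> bucket %lu\n",
		       blocks[i], blocks[i] & (table_size - 1));
	return 0;
}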
79025+/* The hash table definition */
79026+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
79027+#define KFREE(ptr, size) kfree(ptr)
79028+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79029+ blknrhashfn, blknreq);
79030+#undef KFREE
79031+#undef KMALLOC
79032+
79033+/* slab for znodes */
79034+static struct kmem_cache *znode_cache;
79035+
79036+int znode_shift_order;
79037+
79038+/**
79039+ * init_znodes - create znode cache
79040+ *
79041+ * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79042+ */
79043+int init_znodes(void)
79044+{
79045+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79046+ SLAB_HWCACHE_ALIGN |
79047+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79048+ if (znode_cache == NULL)
79049+ return RETERR(-ENOMEM);
79050+
79051+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79052+ ++znode_shift_order);
79053+ --znode_shift_order;
79054+ return 0;
79055+}
79056+
79057+/**
79058+ * done_znodes - delete znode cache
79059+ *
79060+ * This is called on reiser4 module unloading or system shutdown.
79061+ */
79062+void done_znodes(void)
79063+{
79064+ destroy_reiser4_cache(&znode_cache);
79065+}
79066+
79067+/* call this to initialise tree of znodes */
79068+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79069+{
79070+ int result;
79071+ assert("umka-050", tree != NULL);
79072+
79073+ rwlock_init(&tree->dk_lock);
79074+
79075+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79076+ if (result != 0)
79077+ return result;
79078+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79079+ return result;
79080+}
79081+
79082+/* free this znode */
79083+void zfree(znode * node /* znode to free */ )
79084+{
79085+ assert("nikita-465", node != NULL);
79086+ assert("nikita-2120", znode_page(node) == NULL);
79087+ assert("nikita-2301", list_empty_careful(&node->lock.owners));
79088+ assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79089+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79090+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79091+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79092+ assert("nikita-3293", !znode_is_right_connected(node));
79093+ assert("nikita-3294", !znode_is_left_connected(node));
79094+ assert("nikita-3295", node->left == NULL);
79095+ assert("nikita-3296", node->right == NULL);
79096+
79097+ /* not yet phash_jnode_destroy(ZJNODE(node)); */
79098+
79099+ kmem_cache_free(znode_cache, node);
79100+}
79101+
79102+/* call this to free tree of znodes */
79103+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79104+{
79105+ znode *node;
79106+ znode *next;
79107+ z_hash_table *ztable;
79108+
79109+ /* scan znode hash-tables and kill all znodes, then free hash tables
79110+ * themselves. */
79111+
79112+ assert("nikita-795", tree != NULL);
79113+
79114+ ztable = &tree->zhash_table;
79115+
79116+ if (ztable->_table != NULL) {
79117+ for_all_in_htable(ztable, z, node, next) {
79118+ node->c_count = 0;
79119+ node->in_parent.node = NULL;
79120+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79121+ zdrop(node);
79122+ }
79123+
79124+ z_hash_done(&tree->zhash_table);
79125+ }
79126+
79127+ ztable = &tree->zfake_table;
79128+
79129+ if (ztable->_table != NULL) {
79130+ for_all_in_htable(ztable, z, node, next) {
79131+ node->c_count = 0;
79132+ node->in_parent.node = NULL;
79133+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79134+ zdrop(node);
79135+ }
79136+
79137+ z_hash_done(&tree->zfake_table);
79138+ }
79139+}
79140+
79141+/* ZNODE STRUCTURES */
79142+
79143+/* allocate fresh znode */
79144+znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79145+{
79146+ znode *node;
79147+
79148+ node = kmem_cache_alloc(znode_cache, gfp_flag);
79149+ return node;
79150+}
79151+
79152+/* Initialize fields of znode
79153+ @node: znode to initialize;
79154+ @parent: parent znode;
79155+ @tree: tree we are in. */
79156+void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79157+{
79158+ assert("nikita-466", node != NULL);
79159+ assert("umka-268", current_tree != NULL);
79160+
79161+ memset(node, 0, sizeof *node);
79162+
79163+ assert("umka-051", tree != NULL);
79164+
79165+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79166+ reiser4_init_lock(&node->lock);
79167+ init_parent_coord(&node->in_parent, parent);
79168+}
79169+
79170+/*
79171+ * remove znode from indices. This is called jput() when last reference on
79172+ * znode is released.
79173+ */
79174+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79175+{
79176+ assert("nikita-2108", node != NULL);
79177+ assert("nikita-470", node->c_count == 0);
79178+ assert_rw_write_locked(&(tree->tree_lock));
79179+
79180+ /* remove reference to this znode from cbk cache */
79181+ cbk_cache_invalidate(node, tree);
79182+
79183+ /* update c_count of parent */
79184+ if (znode_parent(node) != NULL) {
79185+ assert("nikita-472", znode_parent(node)->c_count > 0);
79186+ /* father, onto your hands I forward my spirit... */
79187+ znode_parent(node)->c_count--;
79188+ node->in_parent.node = NULL;
79189+ } else {
79190+ /* orphaned znode?! Root? */
79191+ }
79192+
79193+ /* remove znode from hash-table */
79194+ z_hash_remove_rcu(znode_get_htable(node), node);
79195+}
79196+
79197+/* zdrop() -- Remove znode from the tree.
79198+
79199+ This is called when znode is removed from the memory. */
79200+static void zdrop(znode * node /* znode to finish with */ )
79201+{
79202+ jdrop(ZJNODE(node));
79203+}
79204+
79205+/*
79206+ * put znode into right place in the hash table. This is called by relocate
79207+ * code.
79208+ */
79209+int znode_rehash(znode * node /* node to rehash */ ,
79210+ const reiser4_block_nr * new_block_nr /* new block number */ )
79211+{
79212+ z_hash_table *oldtable;
79213+ z_hash_table *newtable;
79214+ reiser4_tree *tree;
79215+
79216+ assert("nikita-2018", node != NULL);
79217+
79218+ tree = znode_get_tree(node);
79219+ oldtable = znode_get_htable(node);
79220+ newtable = get_htable(tree, new_block_nr);
79221+
79222+ write_lock_tree(tree);
79223+ /* remove znode from hash-table */
79224+ z_hash_remove_rcu(oldtable, node);
79225+
79226+ /* assertion no longer valid due to RCU */
79227+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
79228+
79229+ /* update blocknr */
79230+ znode_set_block(node, new_block_nr);
79231+ node->zjnode.key.z = *new_block_nr;
79232+
79233+ /* insert it into hash */
79234+ z_hash_insert_rcu(newtable, node);
79235+ write_unlock_tree(tree);
79236+ return 0;
79237+}
79238+
79239+/* ZNODE LOOKUP, GET, PUT */
79240+
79241+/* zlook() - get znode with given block_nr in a hash table or return NULL
79242+
79243+ If result is non-NULL then the znode's x_count is incremented. Internal version
79244+ accepts pre-computed hash index. The hash table is accessed under caller's
79245+ tree->hash_lock.
79246+*/
79247+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
79248+{
79249+ znode *result;
79250+ __u32 hash;
79251+ z_hash_table *htable;
79252+
79253+ assert("jmacd-506", tree != NULL);
79254+ assert("jmacd-507", blocknr != NULL);
79255+
79256+ htable = get_htable(tree, blocknr);
79257+ hash = blknrhashfn(htable, blocknr);
79258+
79259+ rcu_read_lock();
79260+ result = z_hash_find_index(htable, hash, blocknr);
79261+
79262+ if (result != NULL) {
79263+ add_x_ref(ZJNODE(result));
79264+ result = znode_rip_check(tree, result);
79265+ }
79266+ rcu_read_unlock();
79267+
79268+ return result;
79269+}
79270+
79271+/* return hash table where znode with block @blocknr is (or should be)
79272+ * stored */
79273+static z_hash_table *get_htable(reiser4_tree * tree,
79274+ const reiser4_block_nr * const blocknr)
79275+{
79276+ z_hash_table *table;
79277+ if (is_disk_addr_unallocated(blocknr))
79278+ table = &tree->zfake_table;
79279+ else
79280+ table = &tree->zhash_table;
79281+ return table;
79282+}
79283+
79284+/* return hash table where znode @node is (or should be) stored */
79285+static z_hash_table *znode_get_htable(const znode * node)
79286+{
79287+ return get_htable(znode_get_tree(node), znode_get_block(node));
79288+}
79289+
79290+/* zget() - get znode from hash table, allocating it if necessary.
79291+
79292+ First a call to zlook, locating an x-referenced znode if one
79293+ exists. If znode is not found, allocate new one and return. Result
79294+ is returned with x_count reference increased.
79295+
79296+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
79297+ LOCK ORDERING: NONE
79298+*/
79299+znode *zget(reiser4_tree * tree,
79300+ const reiser4_block_nr * const blocknr,
79301+ znode * parent, tree_level level, gfp_t gfp_flag)
79302+{
79303+ znode *result;
79304+ __u32 hashi;
79305+
79306+ z_hash_table *zth;
79307+
79308+ assert("jmacd-512", tree != NULL);
79309+ assert("jmacd-513", blocknr != NULL);
79310+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
79311+
79312+ zth = get_htable(tree, blocknr);
79313+ hashi = blknrhashfn(zth, blocknr);
79314+
79315+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not
79316+ implemented. */
79317+
79318+ z_hash_prefetch_bucket(zth, hashi);
79319+
79320+ rcu_read_lock();
79321+ /* Find a matching BLOCKNR in the hash table. If the znode is found,
79322+ we obtain a reference (x_count) but the znode remains unlocked.
79323+ Have to worry about race conditions later. */
79324+ result = z_hash_find_index(zth, hashi, blocknr);
79325+ /* According to the current design, the hash table lock protects new
79326+ znode references. */
79327+ if (result != NULL) {
79328+ add_x_ref(ZJNODE(result));
79329+ /* NOTE-NIKITA it should be so, but special case during
79330+ creation of new root makes such assertion highly
79331+ complicated. */
79332+ assert("nikita-2131", 1 || znode_parent(result) == parent ||
79333+ (ZF_ISSET(result, JNODE_ORPHAN)
79334+ && (znode_parent(result) == NULL)));
79335+ result = znode_rip_check(tree, result);
79336+ }
79337+
79338+ rcu_read_unlock();
79339+
79340+ if (!result) {
79341+ znode *shadow;
79342+
79343+ result = zalloc(gfp_flag);
79344+ if (!result) {
79345+ return ERR_PTR(RETERR(-ENOMEM));
79346+ }
79347+
79348+ zinit(result, parent, tree);
79349+ ZJNODE(result)->blocknr = *blocknr;
79350+ ZJNODE(result)->key.z = *blocknr;
79351+ result->level = level;
79352+
79353+ write_lock_tree(tree);
79354+
79355+ shadow = z_hash_find_index(zth, hashi, blocknr);
79356+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
79357+ jnode_list_remove(ZJNODE(result));
79358+ zfree(result);
79359+ result = shadow;
79360+ } else {
79361+ result->version = znode_build_version(tree);
79362+ z_hash_insert_index_rcu(zth, hashi, result);
79363+
79364+ if (parent != NULL)
79365+ ++parent->c_count;
79366+ }
79367+
79368+ add_x_ref(ZJNODE(result));
79369+
79370+ write_unlock_tree(tree);
79371+ }
79372+#if REISER4_DEBUG
79373+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
79374+ reiser4_check_block(blocknr, 1);
79375+#endif
79376+ /* Check for invalid tree level, return -EIO */
79377+ if (unlikely(znode_get_level(result) != level)) {
79378+ warning("jmacd-504",
79379+ "Wrong level for cached block %llu: %i expecting %i",
79380+ (unsigned long long)(*blocknr), znode_get_level(result),
79381+ level);
79382+ zput(result);
79383+ return ERR_PTR(RETERR(-EIO));
79384+ }
79385+
79386+ assert("nikita-1227", znode_invariant(result));
79387+
79388+ return result;
79389+}
79390+
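zget() above uses a classic optimistic-insert pattern: lockless lookup, allocate on miss, re-check under the tree lock and discard the loser when a "shadow" already got in. A user-space sketch of just that pattern, with the hash table shrunk to a single slot and a pthread mutex standing in for the tree lock and RCU:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *slot;	/* stands in for one hash bucket */

static int *get_obj(int key)
{
	int *obj = slot;	/* lockless lookup (RCU in the patch) */

	if (obj)
		return obj;
	obj = malloc(sizeof(*obj));
	if (!obj)
		return NULL;
	*obj = key;
	pthread_mutex_lock(&lock);
	if (slot) {		/* lost the race: a shadow exists */
		free(obj);
		obj = slot;
	} else {
		slot = obj;	/* won: publish our object */
	}
	pthread_mutex_unlock(&lock);
	return obj;
}

int main(void)
{
	printf("got %d\n", *get_obj(42));
	return 0;
}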
79391+/* ZNODE PLUGINS/DATA */
79392+
79393+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
79394+ stored at the fixed offset from the beginning of the node. */
79395+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
79396+ * plugin of */ )
79397+{
79398+ reiser4_tree *tree;
79399+
79400+ assert("nikita-1053", node != NULL);
79401+ assert("nikita-1055", zdata(node) != NULL);
79402+
79403+ tree = znode_get_tree(node);
79404+ assert("umka-053", tree != NULL);
79405+
79406+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
79407+ return tree->nplug;
79408+ } else {
79409+ return node_plugin_by_disk_id
79410+ (tree, &((common_node_header *) zdata(node))->plugin_id);
79411+#ifdef GUESS_EXISTS
79412+ reiser4_plugin *plugin;
79413+
79414+ /* NOTE-NIKITA add locking here when dynamic plugins will be
79415+ * implemented */
79416+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
79417+ if ((plugin->u.node.guess != NULL)
79418+ && plugin->u.node.guess(node))
79419+ return plugin;
79420+ }
79421+ warning("nikita-1057", "Cannot guess node plugin");
79422+ print_znode("node", node);
79423+ return NULL;
79424+#endif
79425+ }
79426+}
79427+
79428+/* parse node header and install ->node_plugin */
79429+int zparse(znode * node /* znode to parse */ )
79430+{
79431+ int result;
79432+
79433+ assert("nikita-1233", node != NULL);
79434+ assert("nikita-2370", zdata(node) != NULL);
79435+
79436+ if (node->nplug == NULL) {
79437+ node_plugin *nplug;
79438+
79439+ nplug = znode_guess_plugin(node);
79440+ if (likely(nplug != NULL)) {
79441+ result = nplug->parse(node);
79442+ if (likely(result == 0))
79443+ node->nplug = nplug;
79444+ } else {
79445+ result = RETERR(-EIO);
79446+ }
79447+ } else
79448+ result = 0;
79449+ return result;
79450+}
79451+
79452+/* zload with readahead */
79453+int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
79454+{
79455+ int result;
79456+
79457+ assert("nikita-484", node != NULL);
79458+ assert("nikita-1377", znode_invariant(node));
79459+ assert("jmacd-7771", !znode_above_root(node));
79460+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
79461+ assert("nikita-3016", reiser4_schedulable());
79462+
79463+ if (info)
79464+ formatted_readahead(node, info);
79465+
79466+ result = jload(ZJNODE(node));
79467+ assert("nikita-1378", znode_invariant(node));
79468+ return result;
79469+}
79470+
79471+/* load content of node into memory */
79472+int zload(znode * node)
79473+{
79474+ return zload_ra(node, NULL);
79475+}
79476+
79477+/* call node plugin to initialise newly allocated node. */
79478+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
79479+{
79480+ return jinit_new(ZJNODE(node), gfp_flags);
79481+}
79482+
79483+/* drop reference to node data. When last reference is dropped, data are
79484+ unloaded. */
79485+void zrelse(znode * node /* znode to release references to */ )
79486+{
79487+ assert("nikita-1381", znode_invariant(node));
79488+
79489+ jrelse(ZJNODE(node));
79490+}
79491+
79492+/* returns free space in node */
79493+unsigned znode_free_space(znode * node /* znode to query */ )
79494+{
79495+ assert("nikita-852", node != NULL);
79496+ return node_plugin_by_node(node)->free_space(node);
79497+}
79498+
79499+/* left delimiting key of znode */
79500+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
79501+{
79502+ assert("nikita-958", node != NULL);
79503+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79504+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
79505+ assert("nikita-30671", node->rd_key_version != 0);
79506+ return &node->rd_key;
79507+}
79508+
79509+/* right delimiting key of znode */
79510+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
79511+{
79512+ assert("nikita-974", node != NULL);
79513+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
79514+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
79515+ assert("nikita-30681", node->ld_key_version != 0);
79516+ return &node->ld_key;
79517+}
79518+
79519+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
79520+ )
79521+
79522+/* update right-delimiting key of @node */
79523+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
79524+{
79525+ assert("nikita-2937", node != NULL);
79526+ assert("nikita-2939", key != NULL);
79527+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79528+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
79529+ assert("nikita-2944",
79530+ znode_is_any_locked(node) ||
79531+ znode_get_level(node) != LEAF_LEVEL ||
79532+ keyge(key, &node->rd_key) ||
79533+ keyeq(&node->rd_key, reiser4_min_key()) ||
79534+ ZF_ISSET(node, JNODE_HEARD_BANSHEE));
79535+
79536+ node->rd_key = *key;
79537+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
79538+ return &node->rd_key;
79539+}
79540+
79541+/* update left-delimiting key of @node */
79542+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
79543+{
79544+ assert("nikita-2940", node != NULL);
79545+ assert("nikita-2941", key != NULL);
79546+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
79547+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
79548+ assert("nikita-2943",
79549+ znode_is_any_locked(node) || keyeq(&node->ld_key,
79550+ reiser4_min_key()));
79551+
79552+ node->ld_key = *key;
79553+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
79554+ return &node->ld_key;
79555+}
79556+
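Each delimiting-key update above is stamped from the global delim_key_version counter, so debug code can order updates to ld_key and rd_key. A sketch of the same stamping with C11 atomics; the type and field names are illustrative only:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int delim_key_version;

    struct node { unsigned long rd_key; int rd_key_version; };

    static void set_rd_key(struct node *n, unsigned long key)
    {
        n->rd_key = key;
        /* fetch_add + 1 yields the new value, like the kernel's atomic_inc_return() */
        n->rd_key_version = atomic_fetch_add(&delim_key_version, 1) + 1;
    }

    int main(void)
    {
        struct node a = { 0, 0 }, b = { 0, 0 };

        set_rd_key(&a, 100);
        set_rd_key(&b, 200);
        printf("a stamped %d, b stamped %d\n", a.rd_key_version, b.rd_key_version);
        return 0;
    }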
79557+/* true if @key is inside key range for @node */
79558+int znode_contains_key(znode * node /* znode to look in */ ,
79559+ const reiser4_key * key /* key to look for */ )
79560+{
79561+ assert("nikita-1237", node != NULL);
79562+ assert("nikita-1238", key != NULL);
79563+
79564+ /* left_delimiting_key <= key <= right_delimiting_key */
79565+ return keyle(znode_get_ld_key(node), key)
79566+ && keyle(key, znode_get_rd_key(node));
79567+}
79568+
79569+/* same as znode_contains_key(), but lock dk lock */
79570+int znode_contains_key_lock(znode * node /* znode to look in */ ,
79571+ const reiser4_key * key /* key to look for */ )
79572+{
79573+ int result;
79574+
79575+ assert("umka-056", node != NULL);
79576+ assert("umka-057", key != NULL);
79577+
79578+ read_lock_dk(znode_get_tree(node));
79579+ result = znode_contains_key(node, key);
79580+ read_unlock_dk(znode_get_tree(node));
79581+ return result;
79582+}
79583+
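Both delimiting keys must be read under dk_lock for the range test to be meaningful, and note that the test is inclusive on both ends. A self-contained model of znode_contains_key_lock() using a pthread rwlock in place of the kernel lock, with plain integers standing in for reiser4_key:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t dk_lock = PTHREAD_RWLOCK_INITIALIZER;
    static unsigned long ld_key = 100, rd_key = 200;    /* stand-ins for reiser4_key */

    static int contains_key(unsigned long key)
    {
        int result;

        pthread_rwlock_rdlock(&dk_lock);    /* keys read as a consistent pair */
        result = ld_key <= key && key <= rd_key;    /* both ends inclusive */
        pthread_rwlock_unlock(&dk_lock);
        return result;
    }

    int main(void)
    {
        printf("150 in range: %d\n", contains_key(150));
        printf("250 in range: %d\n", contains_key(250));
        return 0;
    }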
79584+/* get parent pointer, assuming tree is not locked */
79585+znode *znode_parent_nolock(const znode * node /* child znode */ )
79586+{
79587+ assert("nikita-1444", node != NULL);
79588+ return node->in_parent.node;
79589+}
79590+
79591+/* get parent pointer of znode */
79592+znode *znode_parent(const znode * node /* child znode */ )
79593+{
79594+ assert("nikita-1226", node != NULL);
79595+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
79596+ return znode_parent_nolock(node);
79597+}
79598+
79599+/* detect uber znode used to protect in-superblock tree root pointer */
79600+int znode_above_root(const znode * node /* znode to query */ )
79601+{
79602+ assert("umka-059", node != NULL);
79603+
79604+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
79605+}
79606+
79607+/* check that @node is root---that its block number is recorded in the tree
79608+ as that of the root node */
79609+#if REISER4_DEBUG
79610+static int znode_is_true_root(const znode * node /* znode to query */ )
79611+{
79612+ assert("umka-060", node != NULL);
79613+ assert("umka-061", current_tree != NULL);
79614+
79615+ return disk_addr_eq(znode_get_block(node),
79616+ &znode_get_tree(node)->root_block);
79617+}
79618+#endif
79619+
79620+/* check that @node is root */
79621+int znode_is_root(const znode * node /* znode to query */ )
79622+{
79623+ assert("nikita-1206", node != NULL);
79624+
79625+ return znode_get_level(node) == znode_get_tree(node)->height;
79626+}
79627+
79628+/* Returns true if @node was just created by zget() and wasn't ever loaded
79629+ into memory. */
79630+/* NIKITA-HANS: yes */
79631+int znode_just_created(const znode * node)
79632+{
79633+ assert("nikita-2188", node != NULL);
79634+ return (znode_page(node) == NULL);
79635+}
79636+
79637+/* obtain updated ->znode_epoch. See seal.c for description. */
79638+__u64 znode_build_version(reiser4_tree * tree)
79639+{
79640+ __u64 result;
79641+
79642+ spin_lock(&tree->epoch_lock);
79643+ result = ++tree->znode_epoch;
79644+ spin_unlock(&tree->epoch_lock);
79645+ return result;
79646+}
79647+
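znode_build_version() is just a counter bumped under a dedicated lock, so every caller observes a unique, strictly increasing value. A user-space sketch with a pthread mutex standing in for the epoch spinlock:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t epoch_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long long epoch;

    static unsigned long long build_version(void)
    {
        unsigned long long result;

        pthread_mutex_lock(&epoch_lock);
        result = ++epoch;       /* unique and ordered across all callers */
        pthread_mutex_unlock(&epoch_lock);
        return result;
    }

    int main(void)
    {
        printf("%llu\n", build_version());
        printf("%llu\n", build_version());
        return 0;
    }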
79648+void init_load_count(load_count * dh)
79649+{
79650+ assert("nikita-2105", dh != NULL);
79651+ memset(dh, 0, sizeof *dh);
79652+}
79653+
79654+void done_load_count(load_count * dh)
79655+{
79656+ assert("nikita-2106", dh != NULL);
79657+ if (dh->node != NULL) {
79658+ for (; dh->d_ref > 0; --dh->d_ref)
79659+ zrelse(dh->node);
79660+ dh->node = NULL;
79661+ }
79662+}
79663+
79664+static int incr_load_count(load_count * dh)
79665+{
79666+ int result;
79667+
79668+ assert("nikita-2110", dh != NULL);
79669+ assert("nikita-2111", dh->node != NULL);
79670+
79671+ result = zload(dh->node);
79672+ if (result == 0)
79673+ ++dh->d_ref;
79674+ return result;
79675+}
79676+
79677+int incr_load_count_znode(load_count * dh, znode * node)
79678+{
79679+ assert("nikita-2107", dh != NULL);
79680+ assert("nikita-2158", node != NULL);
79681+ assert("nikita-2109",
79682+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
79683+
79684+ dh->node = node;
79685+ return incr_load_count(dh);
79686+}
79687+
79688+int incr_load_count_jnode(load_count * dh, jnode * node)
79689+{
79690+ if (jnode_is_znode(node)) {
79691+ return incr_load_count_znode(dh, JZNODE(node));
79692+ }
79693+ return 0;
79694+}
79695+
79696+void copy_load_count(load_count * new, load_count * old)
79697+{
79698+ int ret = 0;
79699+ done_load_count(new);
79700+ new->node = old->node;
79701+ new->d_ref = 0;
79702+
79703+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
79704+ }
79705+
79706+ assert("jmacd-87589", ret == 0);
79707+}
79708+
79709+void move_load_count(load_count * new, load_count * old)
79710+{
79711+ done_load_count(new);
79712+ new->node = old->node;
79713+ new->d_ref = old->d_ref;
79714+ old->node = NULL;
79715+ old->d_ref = 0;
79716+}
79717+
79718+/* convert parent pointer into coord */
79719+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
79720+{
79721+ assert("nikita-3204", pcoord != NULL);
79722+ assert("nikita-3205", coord != NULL);
79723+
79724+ coord_init_first_unit_nocheck(coord, pcoord->node);
79725+ coord_set_item_pos(coord, pcoord->item_pos);
79726+ coord->between = AT_UNIT;
79727+}
79728+
79729+/* pack coord into parent_coord_t */
79730+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
79731+{
79732+ assert("nikita-3206", pcoord != NULL);
79733+ assert("nikita-3207", coord != NULL);
79734+
79735+ pcoord->node = coord->node;
79736+ pcoord->item_pos = coord->item_pos;
79737+}
79738+
79739+/* Initialize a parent hint pointer. (The parent hint pointer is a field in
79740+ znode; see the comments there.) */
79741+void init_parent_coord(parent_coord_t * pcoord, const znode * node)
79742+{
79743+ pcoord->node = (znode *) node;
79744+ pcoord->item_pos = (unsigned short)~0;
79745+}
79746+
79747+#if REISER4_DEBUG
79748+
79749+/* debugging aid: znode invariant */
79750+static int znode_invariant_f(const znode * node /* znode to check */ ,
79751+ char const **msg /* where to store error
79752+ * message, if any */ )
79753+{
79754+#define _ergo(ant, con) \
79755+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
79756+
79757+#define _equi(e1, e2) \
79758+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
79759+
79760+#define _check(exp) ((*msg) = #exp, (exp))
79761+
79762+ return jnode_invariant_f(ZJNODE(node), msg) &&
79763+ /* [znode-fake] invariant */
79764+ /* fake znode doesn't have a parent, and */
79765+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
79766+ /* there is another way to express this very check, and */
79767+ _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
79768+ /* it has special block number, and */
79769+ _ergo(znode_get_level(node) == 0,
79770+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
79771+ /* it is the only znode with such block number, and */
79772+ _ergo(!znode_above_root(node) && znode_is_loaded(node),
79773+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
79774+ /* it is parent of the tree root node */
79775+ _ergo(znode_is_true_root(node),
79776+ znode_above_root(znode_parent(node))) &&
79777+ /* [znode-level] invariant */
79778+ /* level of parent znode is one larger than that of child,
79779+ except for the fake znode, and */
79780+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
79781+ znode_get_level(znode_parent(node)) ==
79782+ znode_get_level(node) + 1) &&
79783+ /* left neighbor is at the same level, and */
79784+ _ergo(znode_is_left_connected(node) && node->left != NULL,
79785+ znode_get_level(node) == znode_get_level(node->left)) &&
79786+ /* right neighbor is at the same level */
79787+ _ergo(znode_is_right_connected(node) && node->right != NULL,
79788+ znode_get_level(node) == znode_get_level(node->right)) &&
79789+ /* [znode-connected] invariant */
79790+ _ergo(node->left != NULL, znode_is_left_connected(node)) &&
79791+ _ergo(node->right != NULL, znode_is_right_connected(node)) &&
79792+ _ergo(!znode_is_root(node) && node->left != NULL,
79793+ znode_is_right_connected(node->left) &&
79794+ node->left->right == node) &&
79795+ _ergo(!znode_is_root(node) && node->right != NULL,
79796+ znode_is_left_connected(node->right) &&
79797+ node->right->left == node) &&
79798+ /* [znode-c_count] invariant */
79799+ /* for any znode, c_count of its parent is greater than 0 */
79800+ _ergo(znode_parent(node) != NULL &&
79801+ !znode_above_root(znode_parent(node)),
79802+ znode_parent(node)->c_count > 0) &&
79803+ /* leaves don't have children */
79804+ _ergo(znode_get_level(node) == LEAF_LEVEL,
79805+ node->c_count == 0) &&
79806+ _check(node->zjnode.jnodes.prev != NULL) &&
79807+ _check(node->zjnode.jnodes.next != NULL) &&
79808+ /* orphan doesn't have a parent */
79809+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
79810+ /* [znode-modify] invariant */
79811+ /* if znode is not write-locked, its checksum remains
79812+ * invariant */
79813+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
79814+ * cannot check this. */
79815+ /* [znode-refs] invariant */
79816+ /* only referenced znode can be long-term locked */
79817+ _ergo(znode_is_locked(node),
79818+ atomic_read(&ZJNODE(node)->x_count) != 0);
79819+}
79820+
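The _ergo()/_equi()/_check() wrappers rely on the comma operator: the clause's text is stored into *msg before the clause is evaluated, so when the conjunction short-circuits, *msg names exactly the first condition that failed. A compilable toy version of the trick; the invariant and field names are made up:

    #include <stdio.h>

    #define ergo(a, c)  (!(a) || (c))
    #define _ergo(msg, ant, con) \
        ((*(msg)) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
    #define _check(msg, exp) ((*(msg)) = #exp, (exp))

    static int invariant(int level, int parent, const char **msg)
    {
        return _ergo(msg, level == 0, parent == 0) &&
               _check(msg, level >= 0);
    }

    int main(void)
    {
        const char *msg = NULL;

        if (!invariant(0, 42, &msg))    /* violates: level-0 node with a parent */
            printf("condition %s failed\n", msg);
        return 0;
    }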
79821+/* debugging aid: check znode invariant and panic if it doesn't hold */
79822+int znode_invariant(znode * node /* znode to check */ )
79823+{
79824+ char const *failed_msg;
79825+ int result;
79826+
79827+ assert("umka-063", node != NULL);
79828+ assert("umka-064", current_tree != NULL);
79829+
79830+ spin_lock_znode(node);
79831+ read_lock_tree(znode_get_tree(node));
79832+ result = znode_invariant_f(node, &failed_msg);
79833+ if (!result) {
79834+ /* print_znode("corrupted node", node); */
79835+ warning("jmacd-555", "Condition %s failed", failed_msg);
79836+ }
79837+ read_unlock_tree(znode_get_tree(node));
79838+ spin_unlock_znode(node);
79839+ return result;
79840+}
79841+
79842+/* return non-0 iff data are loaded into znode */
79843+int znode_is_loaded(const znode * node /* znode to query */ )
79844+{
79845+ assert("nikita-497", node != NULL);
79846+ return jnode_is_loaded(ZJNODE(node));
79847+}
79848+
79849+unsigned long znode_times_locked(const znode * z)
79850+{
79851+ return z->times_locked;
79852+}
79853+
79854+#endif /* REISER4_DEBUG */
79855+
79856+/* Make Linus happy.
79857+ Local variables:
79858+ c-indentation-style: "K&R"
79859+ mode-name: "LC"
79860+ c-basic-offset: 8
79861+ tab-width: 8
79862+ fill-column: 120
79863+ End:
79864+*/
79865diff -urN linux-2.6.22.orig/fs/reiser4/znode.h linux-2.6.22/fs/reiser4/znode.h
79866--- linux-2.6.22.orig/fs/reiser4/znode.h 1970-01-01 03:00:00.000000000 +0300
79867+++ linux-2.6.22/fs/reiser4/znode.h 2007-07-29 00:25:35.052742032 +0400
79868@@ -0,0 +1,434 @@
79869+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
79870+ * reiser4/README */
79871+
79872+/* Declaration of znode (Zam's node). See znode.c for more details. */
79873+
79874+#ifndef __ZNODE_H__
79875+#define __ZNODE_H__
79876+
79877+#include "forward.h"
79878+#include "debug.h"
79879+#include "dformat.h"
79880+#include "key.h"
79881+#include "coord.h"
79882+#include "plugin/node/node.h"
79883+#include "jnode.h"
79884+#include "lock.h"
79885+#include "readahead.h"
79886+
79887+#include <linux/types.h>
79888+#include <linux/spinlock.h>
79889+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
79890+#include <asm/atomic.h>
79891+#include <asm/semaphore.h>
79892+
79893+/* znode tracks its position within its parent (the internal item in the
79894+ * parent node that contains znode's block number). */
79895+typedef struct parent_coord {
79896+ znode *node;
79897+ pos_in_node_t item_pos;
79898+} parent_coord_t;
79899+
79900+/* &znode - node in a reiser4 tree.
79901+
79902+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
79903+ cacheline pressure.
79904+
79905+ Locking:
79906+
79907+ Long term: data in a disk node attached to this znode are protected
79908+ by long term, deadlock aware lock ->lock;
79909+
79910+ Spin lock: the following fields are protected by the spin lock:
79911+
79912+ ->lock
79913+
79914+ Following fields are protected by the global tree lock:
79915+
79916+ ->left
79917+ ->right
79918+ ->in_parent
79919+ ->c_count
79920+
79921+ Following fields are protected by the global delimiting key lock (dk_lock):
79922+
79923+ ->ld_key (to update ->ld_key long-term lock on the node is also required)
79924+ ->rd_key
79925+
79926+ Following fields are protected by the long term lock:
79927+
79928+ ->nr_items
79929+
79930+ ->node_plugin is never changed once set. This means that after code made
79931+ itself sure that field is valid it can be accessed without any additional
79932+ locking.
79933+
79934+ ->level is immutable.
79935+
79936+ Invariants involving this data-type:
79937+
79938+ [znode-fake]
79939+ [znode-level]
79940+ [znode-connected]
79941+ [znode-c_count]
79942+ [znode-refs]
79943+ [jnode-refs]
79944+ [jnode-queued]
79945+ [znode-modify]
79946+
79947+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
79948+ Suggestions for how to do that are desired.*/
79949+struct znode {
79950+ /* Embedded jnode. */
79951+ jnode zjnode;
79952+
79953+ /* contains three subfields, node, pos_in_node, and pos_in_unit.
79954+
79955+ pos_in_node and pos_in_unit are only hints that are cached to
79956+ speed up lookups during balancing. They are not required to be up to
79957+ date. Synched in find_child_ptr().
79958+
79959+ This value allows us to avoid expensive binary searches.
79960+
79961+ in_parent->node points to the parent of this node, and is NOT a
79962+ hint.
79963+ */
79964+ parent_coord_t in_parent;
79965+
79966+ /*
79967+ * sibling list pointers
79968+ */
79969+
79970+ /* left-neighbor */
79971+ znode *left;
79972+ /* right-neighbor */
79973+ znode *right;
79974+
79975+ /* long term lock on node content. This lock supports deadlock
79976+ detection. See lock.c
79977+ */
79978+ zlock lock;
79979+
79980+ /* You cannot remove from memory a node that has children in
79981+ memory. This is because we rely on the fact that parent of given
79982+ node can always be reached without blocking for io. When reading a
79983+ node into memory you must increase the c_count of its parent, when
79984+ removing it from memory you must decrease the c_count. This makes
79985+ the code simpler, and the cases where it is suboptimal are truly
79986+ obscure.
79987+ */
79988+ int c_count;
79989+
79990+ /* plugin of node attached to this znode. NULL if znode is not
79991+ loaded. */
79992+ node_plugin *nplug;
79993+
79994+ /* version of znode data. This is increased on each modification. This
79995+ * is necessary to implement seals (see seal.[ch]) efficiently. */
79996+ __u64 version;
79997+
79998+ /* left delimiting key. Necessary to efficiently perform
79999+ balancing with node-level locking. Kept in memory only. */
80000+ reiser4_key ld_key;
80001+ /* right delimiting key. */
80002+ reiser4_key rd_key;
80003+
80004+ /* znode's tree level */
80005+ __u16 level;
80006+ /* number of items in this node. This field is modified by node
80007+ * plugin. */
80008+ __u16 nr_items;
80009+
80010+#if REISER4_DEBUG
80011+ void *creator;
80012+ reiser4_key first_key;
80013+ unsigned long times_locked;
80014+ int left_version; /* when node->left was updated */
80015+ int right_version; /* when node->right was updated */
80016+ int ld_key_version; /* when node->ld_key was updated */
80017+ int rd_key_version; /* when node->rd_key was updated */
80018+#endif
80019+
80020+} __attribute__ ((aligned(16)));
80021+
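The locking rules in the comment above are easiest to see for the sibling pointers: ->left and ->right may only be read or written under the global tree lock. A stand-alone model of that discipline, with a pthread rwlock playing the tree lock and everything else hypothetical:

    #include <pthread.h>
    #include <stddef.h>
    #include <stdio.h>

    struct node { struct node *left, *right; };

    static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;

    static struct node *get_left(struct node *n)
    {
        struct node *left;

        pthread_rwlock_rdlock(&tree_lock);  /* read side: lookups */
        left = n->left;
        pthread_rwlock_unlock(&tree_lock);
        return left;
    }

    static void link_siblings(struct node *l, struct node *r)
    {
        pthread_rwlock_wrlock(&tree_lock);  /* write side: relinking */
        l->right = r;
        r->left = l;
        pthread_rwlock_unlock(&tree_lock);
    }

    int main(void)
    {
        struct node a = { NULL, NULL }, b = { NULL, NULL };

        link_siblings(&a, &b);
        printf("b's left is a: %d\n", get_left(&b) == &a);
        return 0;
    }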
80022+ON_DEBUG(extern atomic_t delim_key_version;
80023+ )
80024+
80025+/* In general I think these macros should not be exposed. */
80026+#define znode_is_locked(node) (lock_is_locked(&node->lock))
80027+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80028+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80029+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80030+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80031+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80032+/* Macros for accessing the znode state. */
80033+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80034+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80035+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80036+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80037+ znode * parent, tree_level level, gfp_t gfp_flag);
80038+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80039+extern int zload(znode * node);
80040+extern int zload_ra(znode * node, ra_info_t * info);
80041+extern int zinit_new(znode * node, gfp_t gfp_flags);
80042+extern void zrelse(znode * node);
80043+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80044+
80045+/* size of data in znode */
80046+static inline unsigned
80047+znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80048+{
80049+ assert("nikita-1416", node != NULL);
80050+ return PAGE_CACHE_SIZE;
80051+}
80052+
80053+extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80054+ coord_t * coord);
80055+extern void coord_to_parent_coord(const coord_t * coord,
80056+ parent_coord_t * pcoord);
80057+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80058+
80059+extern unsigned znode_free_space(znode * node);
80060+
80061+extern reiser4_key *znode_get_rd_key(znode * node);
80062+extern reiser4_key *znode_get_ld_key(znode * node);
80063+
80064+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80065+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80066+
80067+/* `connected' state checks */
80068+static inline int znode_is_right_connected(const znode * node)
80069+{
80070+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80071+}
80072+
80073+static inline int znode_is_left_connected(const znode * node)
80074+{
80075+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80076+}
80077+
80078+static inline int znode_is_connected(const znode * node)
80079+{
80080+ return znode_is_right_connected(node) && znode_is_left_connected(node);
80081+}
80082+
80083+extern int znode_shift_order;
80084+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80085+extern void znode_remove(znode *, reiser4_tree *);
80086+extern znode *znode_parent(const znode * node);
80087+extern znode *znode_parent_nolock(const znode * node);
80088+extern int znode_above_root(const znode * node);
80089+extern int init_znodes(void);
80090+extern void done_znodes(void);
80091+extern int znodes_tree_init(reiser4_tree * ztree);
80092+extern void znodes_tree_done(reiser4_tree * ztree);
80093+extern int znode_contains_key(znode * node, const reiser4_key * key);
80094+extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80095+extern unsigned znode_save_free_space(znode * node);
80096+extern unsigned znode_recover_free_space(znode * node);
80097+extern znode *zalloc(gfp_t gfp_flag);
80098+extern void zinit(znode *, const znode * parent, reiser4_tree *);
80099+extern int zparse(znode * node);
80100+
80101+extern int znode_just_created(const znode * node);
80102+
80103+extern void zfree(znode * node);
80104+
80105+#if REISER4_DEBUG
80106+extern void print_znode(const char *prefix, const znode * node);
80107+#else
80108+#define print_znode( p, n ) noop
80109+#endif
80110+
80111+/* Make it look like various znode functions exist instead of treating znodes as
80112+ jnodes in znode-specific code. */
80113+#define znode_page(x) jnode_page ( ZJNODE(x) )
80114+#define zdata(x) jdata ( ZJNODE(x) )
80115+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80116+#define znode_created(x) jnode_created ( ZJNODE(x) )
80117+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80118+#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80119+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80120+
80121+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80122+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80123+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80124+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80125+
80126+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80127+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80128+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80129+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80130+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80131+
80132+#if REISER4_DEBUG
80133+extern int znode_x_count_is_protected(const znode * node);
80134+extern int znode_invariant(znode * node);
80135+#endif
80136+
80137+/* acquire reference to @node */
80138+static inline znode *zref(znode * node)
80139+{
80140+ /* change of x_count from 0 to 1 is protected by tree spin-lock */
80141+ return JZNODE(jref(ZJNODE(node)));
80142+}
80143+
80144+/* release reference to @node */
80145+static inline void zput(znode * node)
80146+{
80147+ assert("nikita-3564", znode_invariant(node));
80148+ jput(ZJNODE(node));
80149+}
80150+
80151+/* get the level field for a znode */
80152+static inline tree_level znode_get_level(const znode * node)
80153+{
80154+ return node->level;
80155+}
80156+
80157+/* get the level field for a jnode */
80158+static inline tree_level jnode_get_level(const jnode * node)
80159+{
80160+ if (jnode_is_znode(node))
80161+ return znode_get_level(JZNODE(node));
80162+ else
80163+ /* unformatted nodes are all at the LEAF_LEVEL and for
80164+ "semi-formatted" nodes like bitmaps, level doesn't matter. */
80165+ return LEAF_LEVEL;
80166+}
80167+
80168+/* true if jnode is on leaf level */
80169+static inline int jnode_is_leaf(const jnode * node)
80170+{
80171+ if (jnode_is_znode(node))
80172+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80173+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80174+ return 1;
80175+ return 0;
80176+}
80177+
80178+/* return znode's tree */
80179+static inline reiser4_tree *znode_get_tree(const znode * node)
80180+{
80181+ assert("nikita-2692", node != NULL);
80182+ return jnode_get_tree(ZJNODE(node));
80183+}
80184+
80185+/* resolve race with zput */
80186+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80187+{
80188+ jnode *j;
80189+
80190+ j = jnode_rip_sync(tree, ZJNODE(node));
80191+ if (likely(j != NULL))
80192+ node = JZNODE(j);
80193+ else
80194+ node = NULL;
80195+ return node;
80196+}
80197+
80198+#if defined(REISER4_DEBUG)
80199+int znode_is_loaded(const znode * node /* znode to query */ );
80200+#endif
80201+
80202+extern __u64 znode_build_version(reiser4_tree * tree);
80203+
80204+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
80205+ must load the data for a node in many places. We could do this by simply calling
80206+ zload() everywhere; the difficulty arises when we must release the loaded data by
80207+ calling zrelse(). In a function with many possible error/return paths, it requires
80208+ extra work to figure out which exit paths must call zrelse() and which must not. The
80209+ data handle automatically calls zrelse() for every zload() that it is responsible
80210+ for. In that sense, it acts much like a lock_handle.
80211+*/
80212+typedef struct load_count {
80213+ znode *node;
80214+ int d_ref;
80215+} load_count;
80216+
80217+extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */
80218+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
80219+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
80220+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
80221+ * incr_load_count_znode, otherwise do nothing (unformatted nodes
80222+ * don't require zload/zrelse treatment). */
80223+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
80224+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
80225+
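The payoff of the data handle is a single cleanup point: every successful incr_load_count_znode() is undone by one done_load_count(), no matter which exit path runs. A hypothetical user-space model of that lifecycle, with zload()/zrelse() stubbed out:

    #include <stdio.h>

    struct node { int d_count; };

    struct load_count {
        struct node *node;
        int d_ref;
    };

    static int zload_stub(struct node *n)  { n->d_count++; return 0; }
    static void zrelse_stub(struct node *n) { n->d_count--; }

    static void init_lc(struct load_count *dh) { dh->node = NULL; dh->d_ref = 0; }

    static void done_lc(struct load_count *dh)
    {
        if (dh->node != NULL) {
            for (; dh->d_ref > 0; --dh->d_ref)
                zrelse_stub(dh->node);      /* release every load still held */
            dh->node = NULL;
        }
    }

    static int incr_lc(struct load_count *dh, struct node *n)
    {
        int result;

        dh->node = n;
        result = zload_stub(n);
        if (result == 0)
            ++dh->d_ref;                    /* handle now owns this load */
        return result;
    }

    int main(void)
    {
        struct node n = { 0 };
        struct load_count dh;

        init_lc(&dh);
        if (incr_lc(&dh, &n) != 0)
            return 1;
        /* ... any number of early-return paths can simply call done_lc() ... */
        done_lc(&dh);   /* single cleanup point releases all held loads */
        printf("d_count after done: %d\n", n.d_count);
        return 0;
    }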
80226+/* Variable initializers for load_count. */
80227+#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
80228+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
80229+/* A convenience macro for use in assertions or debug-only code, where loaded
80230+ data is only required to perform the debugging check. This macro
80231+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */
80232+#define WITH_DATA( node, exp ) \
80233+({ \
80234+ long __with_dh_result; \
80235+ znode *__with_dh_node; \
80236+ \
80237+ __with_dh_node = ( node ); \
80238+ __with_dh_result = zload( __with_dh_node ); \
80239+ if( __with_dh_result == 0 ) { \
80240+ __with_dh_result = ( long )( exp ); \
80241+ zrelse( __with_dh_node ); \
80242+ } \
80243+ __with_dh_result; \
80244+})
80245+
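WITH_DATA() is a GNU C statement expression: it loads the node, evaluates the wrapped expression while the data is pinned, releases the data, and yields the expression's value (or the zload() error). A small compilable imitation with stubbed load/release helpers; GCC or clang is required for the extension:

    #include <stdio.h>

    static int loaded;

    static int zload_stub(void)   { loaded = 1; return 0; }
    static void zrelse_stub(void) { loaded = 0; }

    #define WITH_DATA_STUB(exp)                 \
    ({                                          \
        long __result = zload_stub();           \
        if (__result == 0) {                    \
            __result = (long)(exp);             \
            zrelse_stub();                      \
        }                                       \
        __result;                               \
    })

    int main(void)
    {
        long ok = WITH_DATA_STUB(loaded == 1);  /* evaluated while "loaded" */
        printf("expression saw loaded data: %ld, loaded now: %d\n", ok, loaded);
        return 0;
    }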
80246+/* Same as above, but accepts a return value in case zload fails. */
80247+#define WITH_DATA_RET( node, ret, exp ) \
80248+({ \
80249+ int __with_dh_result; \
80250+ znode *__with_dh_node; \
80251+ \
80252+ __with_dh_node = ( node ); \
80253+ __with_dh_result = zload( __with_dh_node ); \
80254+ if( __with_dh_result == 0 ) { \
80255+ __with_dh_result = ( int )( exp ); \
80256+ zrelse( __with_dh_node ); \
80257+ } else \
80258+ __with_dh_result = ( ret ); \
80259+ __with_dh_result; \
80260+})
80261+
80262+#define WITH_COORD(coord, exp) \
80263+({ \
80264+ coord_t *__coord; \
80265+ \
80266+ __coord = (coord); \
80267+ coord_clear_iplug(__coord); \
80268+ WITH_DATA(__coord->node, exp); \
80269+})
80270+
80271+#if REISER4_DEBUG
80272+#define STORE_COUNTERS \
80273+ reiser4_lock_cnt_info __entry_counters = \
80274+ *reiser4_lock_counters()
80275+#define CHECK_COUNTERS \
80276+ON_DEBUG_CONTEXT( \
80277+({ \
80278+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
80279+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
80280+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
80281+ assert("nikita-2159", \
80282+ !memcmp(&__entry_counters, reiser4_lock_counters(), \
80283+ sizeof __entry_counters)); \
80284+}) )
80285+
80286+#else
80287+#define STORE_COUNTERS
80288+#define CHECK_COUNTERS noop
80289+#endif
80290+
80291+/* __ZNODE_H__ */
80292+#endif
80293+
80294+/* Make Linus happy.
80295+ Local variables:
80296+ c-indentation-style: "K&R"
80297+ mode-name: "LC"
80298+ c-basic-offset: 8
80299+ tab-width: 8
80300+ fill-column: 120
80301+ End:
80302+*/
80303diff -urN linux-2.6.22.orig/include/linux/fs.h linux-2.6.22/include/linux/fs.h
80304--- linux-2.6.22.orig/include/linux/fs.h 2007-07-21 00:33:00.673389540 +0400
80305+++ linux-2.6.22/include/linux/fs.h 2007-07-29 00:25:35.056743067 +0400
80306@@ -1179,6 +1179,8 @@
80307 void (*clear_inode) (struct inode *);
80308 void (*umount_begin) (struct vfsmount *, int);
80309
80310+ void (*sync_inodes) (struct super_block *sb,
80311+ struct writeback_control *wbc);
80312 int (*show_options)(struct seq_file *, struct vfsmount *);
80313 int (*show_stats)(struct seq_file *, struct vfsmount *);
80314 #ifdef CONFIG_QUOTA
80315@@ -1630,6 +1632,7 @@
80316 extern int invalidate_inode_pages2_range(struct address_space *mapping,
80317 pgoff_t start, pgoff_t end);
80318 extern int write_inode_now(struct inode *, int);
80319+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
80320 extern int filemap_fdatawrite(struct address_space *);
80321 extern int filemap_flush(struct address_space *);
80322 extern int filemap_fdatawait(struct address_space *);
80323diff -urN linux-2.6.22.orig/lib/radix-tree.c linux-2.6.22/lib/radix-tree.c
80324--- linux-2.6.22.orig/lib/radix-tree.c 2007-07-21 00:33:01.265543326 +0400
80325+++ linux-2.6.22/lib/radix-tree.c 2007-07-29 00:25:35.060744102 +0400
80326@@ -151,6 +151,7 @@
80327 out:
80328 return ret;
80329 }
80330+EXPORT_SYMBOL(radix_tree_preload);
80331
80332 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
80333 int offset)
80334diff -urN linux-2.6.22.orig/mm/filemap.c linux-2.6.22/mm/filemap.c
80335--- linux-2.6.22.orig/mm/filemap.c 2007-07-21 00:33:01.277546443 +0400
80336+++ linux-2.6.22/mm/filemap.c 2007-07-29 00:25:35.064745138 +0400
80337@@ -121,6 +121,7 @@
80338 mapping->nrpages--;
80339 __dec_zone_page_state(page, NR_FILE_PAGES);
80340 }
80341+EXPORT_SYMBOL(__remove_from_page_cache);
80342
80343 void remove_from_page_cache(struct page *page)
80344 {
80345@@ -132,6 +133,7 @@
80346 __remove_from_page_cache(page);
80347 write_unlock_irq(&mapping->tree_lock);
80348 }
80349+EXPORT_SYMBOL(remove_from_page_cache);
80350
80351 static int sync_page(void *word)
80352 {
80353@@ -719,6 +721,7 @@
80354 read_unlock_irq(&mapping->tree_lock);
80355 return ret;
80356 }
80357+EXPORT_SYMBOL(add_to_page_cache_lru);
80358
80359 /**
80360 * find_get_pages_contig - gang contiguous pagecache lookup
80361@@ -838,6 +841,7 @@
80362
80363 ra->ra_pages /= 4;
80364 }
80365+EXPORT_SYMBOL(find_get_pages);
80366
80367 /**
80368 * do_generic_mapping_read - generic file read routine
80369diff -urN linux-2.6.22.orig/mm/readahead.c linux-2.6.22/mm/readahead.c
80370--- linux-2.6.22.orig/mm/readahead.c 2007-07-21 00:33:01.305553717 +0400
80371+++ linux-2.6.22/mm/readahead.c 2007-07-29 00:25:35.064745138 +0400
80372@@ -571,6 +571,7 @@
80373 ra->flags &= ~RA_FLAG_INCACHE;
80374 ra->cache_hit = 0;
80375 }
80376+EXPORT_SYMBOL_GPL(handle_ra_miss);
80377
80378 /*
80379 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a