[ipfire-2.x.git] / src / patches / reiser4-for-2.6.20.patch
1diff -urN linux-2.6.20.orig/arch/i386/lib/usercopy.c linux-2.6.20/arch/i386/lib/usercopy.c
2--- linux-2.6.20.orig/arch/i386/lib/usercopy.c 2006-11-30 00:57:37.000000000 +0300
3+++ linux-2.6.20/arch/i386/lib/usercopy.c 2007-05-06 14:50:43.658963226 +0400
4@@ -812,6 +812,7 @@
5 #endif
6 return n;
7 }
8+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
9
10 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
11 unsigned long n)
12@@ -827,6 +828,7 @@
13 #endif
14 return n;
15 }
16+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
17
18 /**
19 * copy_to_user: - Copy a block of data into user space.
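These exports make the i386 cache-bypassing user-copy primitives reachable from modules, apparently so that reiser4 (buildable as a module by this patch) can use them. A minimal sketch of a module-side caller, assuming the declaration is visible through <asm/uaccess.h> as on i386 in 2.6.20; the helper name is hypothetical:

#include <asm/uaccess.h>

/* Copy user data into a kernel buffer without polluting the CPU cache;
 * relies on the EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero) above. */
static unsigned long pull_user_data_nocache(void *kbuf,
					    const void __user *ubuf,
					    unsigned long len)
{
	return __copy_from_user_ll_nocache_nozero(kbuf, ubuf, len);
}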
20diff -urN linux-2.6.20.orig/Documentation/Changes linux-2.6.20/Documentation/Changes
21--- linux-2.6.20.orig/Documentation/Changes 2007-05-06 15:04:34.226399593 +0400
22+++ linux-2.6.20/Documentation/Changes 2007-05-06 14:50:43.658963226 +0400
23@@ -36,6 +36,7 @@
24 o e2fsprogs 1.29 # tune2fs
25 o jfsutils 1.1.3 # fsck.jfs -V
26 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
27+o reiser4progs 1.0.0 # fsck.reiser4 -V
28 o xfsprogs 2.6.0 # xfs_db -V
29 o pcmciautils 004 # pccardctl -V
30 o quota-tools 3.09 # quota -V
31@@ -144,6 +145,13 @@
32 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
33 reiserfsck. These utils work on both i386 and alpha platforms.
34
35+Reiser4progs
36+------------
37+
38+The reiser4progs package contains utilities for the reiser4 file system.
39+Detailed instructions are provided in the README file located at:
40+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
41+
42 Xfsprogs
43 --------
44
45@@ -322,6 +330,10 @@
46 -------------
47 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
48
49+Reiser4progs
50+------------
51+o <ftp://ftp.namesys.com/pub/reiser4progs/>
52+
53 Xfsprogs
54 --------
55 o <ftp://oss.sgi.com/projects/xfs/download/>
56diff -urN linux-2.6.20.orig/Documentation/filesystems/reiser4.txt linux-2.6.20/Documentation/filesystems/reiser4.txt
57--- linux-2.6.20.orig/Documentation/filesystems/reiser4.txt 1970-01-01 03:00:00.000000000 +0300
58+++ linux-2.6.20/Documentation/filesystems/reiser4.txt 2007-05-06 14:50:43.658963226 +0400
59@@ -0,0 +1,75 @@
60+Reiser4 filesystem
61+==================
62+Reiser4 is a file system based on dancing tree algorithms, and is
63+described at http://www.namesys.com
64+
65+
66+References
67+==========
68+web page http://namesys.com/v4/v4.html
69+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
70+userland tools ftp://ftp.namesys.com/pub/reiser4progs/
71+install page http://www.namesys.com/install_v4.html
72+
73+Compile options
74+===============
75+Enable reiser4 debug mode
76+ This checks everything imaginable while reiser4
77+ runs
78+
79+Mount options
80+=============
81+tmgr.atom_max_size=N
82+ Atoms containing more than N blocks will be forced to commit.
83+ N is decimal.
84+ Default is nr_free_pagecache_pages() / 2 at mount time.
85+
86+tmgr.atom_max_age=N
87+ Atoms older than N seconds will be forced to commit. N is decimal.
88+ Default is 600.
89+
90+tmgr.atom_max_flushers=N
91+ Limit of concurrent flushers for one atom. 0 means no limit.
92+ Default is 0.
93+
94+tree.cbk_cache.nr_slots=N
95+ Number of slots in the cbk cache.
96+
97+flush.relocate_threshold=N
98+ If flush finds more than N adjacent dirty leaf-level blocks it
99+ will force them to be relocated.
100+ Default is 64.
101+
102+flush.relocate_distance=N
103+	If flush can find a block allocation within N blocks of
104+	the preceder, it will relocate to that position.
105+ Default is 64.
106+
107+flush.scan_maxnodes=N
108+ The maximum number of nodes to scan left on a level during
109+ flush.
110+ Default is 10000.
111+
112+optimal_io_size=N
113+ Preferred IO size. This value is used to set st_blksize of
114+ struct stat.
115+ Default is 65536.
116+
117+bsdgroups
118+ Turn on BSD-style gid assignment.
119+
120+32bittimes
121+	By default, files in reiser4 have 64 bit timestamps. Files
122+ created when filesystem is mounted with 32bittimes mount
123+ option will get 32 bit timestamps.
124+
125+mtflush
126+ Turn off concurrent flushing.
127+
128+nopseudo
129+ Disable pseudo files support. See
130+ http://namesys.com/v4/pseudo.html for more about pseudo files.
131+
132+dont_load_bitmap
133+	Don't load all bitmap blocks at mount time; this is useful for
134+ machines with tiny RAM and large disks.
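A usage illustration (the device and mount point are placeholders, not taken from this document): the options listed above are passed through the usual -o string, for example

	mount -t reiser4 -o tmgr.atom_max_age=300,flush.scan_maxnodes=5000,dont_load_bitmap /dev/sdb1 /mnt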
135diff -urN linux-2.6.20.orig/fs/fs-writeback.c linux-2.6.20/fs/fs-writeback.c
136--- linux-2.6.20.orig/fs/fs-writeback.c 2007-05-06 15:04:39.848155607 +0400
137+++ linux-2.6.20/fs/fs-writeback.c 2007-05-06 14:50:43.662964476 +0400
138@@ -296,8 +296,6 @@
139 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
140 * that it can be located for waiting on in __writeback_single_inode().
141 *
142- * Called under inode_lock.
143- *
144 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
145 * This function assumes that the blockdev superblock's inodes are backed by
146 * a variety of queues, so all inodes are searched. For other superblocks,
147@@ -313,11 +311,13 @@
148 * on the writer throttling path, and we get decent balancing between many
149 * throttled threads: we don't want them all piling up on __wait_on_inode.
150 */
151-static void
152-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
153+void
154+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
155 {
156 const unsigned long start = jiffies; /* livelock avoidance */
157
158+ spin_lock(&inode_lock);
159+
160 if (!wbc->for_kupdate || list_empty(&sb->s_io))
161 list_splice_init(&sb->s_dirty, &sb->s_io);
162
163@@ -397,8 +397,19 @@
164 if (wbc->nr_to_write <= 0)
165 break;
166 }
167+ spin_unlock(&inode_lock);
168 return; /* Leave any unwritten inodes on s_io */
169 }
170+EXPORT_SYMBOL(generic_sync_sb_inodes);
171+
172+static void
173+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
174+{
175+ if (sb->s_op->sync_inodes)
176+ sb->s_op->sync_inodes(sb, wbc);
177+ else
178+ generic_sync_sb_inodes(sb, wbc);
179+}
180
181 /*
182 * Start writeback of dirty pagecache data against all unlocked inodes.
183@@ -439,11 +450,8 @@
184 * be unmounted by the time it is released.
185 */
186 if (down_read_trylock(&sb->s_umount)) {
187- if (sb->s_root) {
188- spin_lock(&inode_lock);
189+ if (sb->s_root)
190 sync_sb_inodes(sb, wbc);
191- spin_unlock(&inode_lock);
192- }
193 up_read(&sb->s_umount);
194 }
195 spin_lock(&sb_lock);
196@@ -481,9 +489,7 @@
197 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
198 nr_dirty + nr_unstable;
199 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
200- spin_lock(&inode_lock);
201 sync_sb_inodes(sb, &wbc);
202- spin_unlock(&inode_lock);
203 }
204
205 /*
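The reworked hunks above let a filesystem take over per-superblock inode writeback while the generic code remains the default. A minimal sketch of how a filesystem might plug into the hook, assuming the ->sync_inodes member that this patch adds to struct super_operations elsewhere; the myfs_* names are hypothetical:

#include <linux/fs.h>
#include <linux/writeback.h>

/* Delegate to the generic implementation (newly exported above) after any
 * filesystem-specific preparation. */
static void myfs_sync_inodes(struct super_block *sb,
			     struct writeback_control *wbc)
{
	/* filesystem-specific ordering or queueing could go here */
	generic_sync_sb_inodes(sb, wbc);
}

static struct super_operations myfs_super_ops = {
	.sync_inodes	= myfs_sync_inodes,
	/* other operations omitted */
};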
206diff -urN linux-2.6.20.orig/fs/Kconfig linux-2.6.20/fs/Kconfig
207--- linux-2.6.20.orig/fs/Kconfig 2007-05-06 15:04:39.668099364 +0400
208+++ linux-2.6.20/fs/Kconfig 2007-05-06 14:50:43.662964476 +0400
209@@ -272,6 +272,8 @@
210 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
211 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
212
213+source "fs/reiser4/Kconfig"
214+
215 config REISERFS_FS
216 tristate "Reiserfs support"
217 help
218diff -urN linux-2.6.20.orig/fs/Makefile linux-2.6.20/fs/Makefile
219--- linux-2.6.20.orig/fs/Makefile 2007-05-06 15:04:39.668099364 +0400
220+++ linux-2.6.20/fs/Makefile 2007-05-06 14:50:43.666965726 +0400
221@@ -62,6 +62,7 @@
222
223 # Do not add any filesystems before this line
224 obj-$(CONFIG_REISERFS_FS) += reiserfs/
225+obj-$(CONFIG_REISER4_FS) += reiser4/
226 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
227 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
228 obj-$(CONFIG_JBD) += jbd/
229diff -urN linux-2.6.20.orig/fs/reiser4/as_ops.c linux-2.6.20/fs/reiser4/as_ops.c
230--- linux-2.6.20.orig/fs/reiser4/as_ops.c 1970-01-01 03:00:00.000000000 +0300
231+++ linux-2.6.20/fs/reiser4/as_ops.c 2007-05-06 14:50:43.666965726 +0400
232@@ -0,0 +1,337 @@
233+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
234+
235+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
236+
237+#include "forward.h"
238+#include "debug.h"
239+#include "dformat.h"
240+#include "coord.h"
241+#include "plugin/item/item.h"
242+#include "plugin/file/file.h"
243+#include "plugin/security/perm.h"
244+#include "plugin/disk_format/disk_format.h"
245+#include "plugin/plugin.h"
246+#include "plugin/plugin_set.h"
247+#include "plugin/object.h"
248+#include "txnmgr.h"
249+#include "jnode.h"
250+#include "znode.h"
251+#include "block_alloc.h"
252+#include "tree.h"
253+#include "vfs_ops.h"
254+#include "inode.h"
255+#include "page_cache.h"
256+#include "ktxnmgrd.h"
257+#include "super.h"
258+#include "reiser4.h"
259+#include "entd.h"
260+
261+#include <linux/profile.h>
262+#include <linux/types.h>
263+#include <linux/mount.h>
264+#include <linux/vfs.h>
265+#include <linux/mm.h>
266+#include <linux/buffer_head.h>
267+#include <linux/dcache.h>
268+#include <linux/list.h>
269+#include <linux/pagemap.h>
270+#include <linux/slab.h>
271+#include <linux/seq_file.h>
272+#include <linux/init.h>
273+#include <linux/module.h>
274+#include <linux/writeback.h>
275+#include <linux/backing-dev.h>
276+#include <linux/quotaops.h>
277+#include <linux/security.h>
278+
279+/* address space operations */
280+
281+/**
282+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
283+ * @page: page to be dirtied
284+ *
285+ * Operation of struct address_space_operations. This implementation is used by
286+ * unix and cryptcompress file plugins.
287+ *
288+ * This is called when reiser4 page gets dirtied outside of reiser4, for
289+ * example, when dirty bit is moved from pte to physical page.
290+ *
291+ * Tags page in the mapping's page tree with special tag so that it is possible
292+ * to do all the reiser4 specific work wrt dirty pages (jnode creation,
293+ * capturing by an atom) later because it can not be done in the contexts where
294+ * set_page_dirty is called.
295+ */
296+int reiser4_set_page_dirty(struct page *page)
297+{
298+ /* this page can be unformatted only */
299+ assert("vs-1734", (page->mapping &&
300+ page->mapping->host &&
301+ reiser4_get_super_fake(page->mapping->host->i_sb) !=
302+ page->mapping->host
303+ && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
304+ page->mapping->host
305+ && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
306+ page->mapping->host));
307+
308+ if (!TestSetPageDirty(page)) {
309+ struct address_space *mapping = page->mapping;
310+
311+ if (mapping) {
312+ write_lock_irq(&mapping->tree_lock);
313+
314+ /* check for race with truncate */
315+ if (page->mapping) {
316+ assert("vs-1652", page->mapping == mapping);
317+ if (mapping_cap_account_dirty(mapping))
318+ inc_zone_page_state(page,
319+					    NR_FILE_DIRTY);
320+ radix_tree_tag_set(&mapping->page_tree,
321+ page->index,
322+ PAGECACHE_TAG_REISER4_MOVED);
323+ }
324+ write_unlock_irq(&mapping->tree_lock);
325+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
326+ }
327+ }
328+ return 0;
329+}
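The deferred work mentioned in the comment above implies that pages tagged PAGECACHE_TAG_REISER4_MOVED are looked up again later. A sketch of such a lookup against the 2.6.20 radix-tree API (the tag constant is defined elsewhere in this patch); the function name and calling context are hypothetical, and the caller is assumed to take its own page references:

static unsigned int collect_moved_pages(struct address_space *mapping,
					struct page **pages,
					unsigned int nr_pages)
{
	unsigned int found;

	read_lock_irq(&mapping->tree_lock);
	found = radix_tree_gang_lookup_tag(&mapping->page_tree,
					   (void **)pages, 0, nr_pages,
					   PAGECACHE_TAG_REISER4_MOVED);
	read_unlock_irq(&mapping->tree_lock);
	return found;
}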
330+
331+/* ->invalidatepage method for reiser4 */
332+
333+/*
334+ * this is called for each truncated page from
335+ * truncate_inode_pages()->truncate_{complete,partial}_page().
336+ *
337+ * At the moment of call, page is under lock, and outstanding io (if any) has
338+ * completed.
339+ */
340+
341+/**
342+ * reiser4_invalidatepage
343+ * @page: page to invalidate
344+ * @offset: starting offset for partial invalidation
345+ *
346+ */
347+void reiser4_invalidatepage(struct page *page, unsigned long offset)
348+{
349+ int ret = 0;
350+ reiser4_context *ctx;
351+ struct inode *inode;
352+ jnode *node;
353+
354+ /*
355+ * This is called to truncate file's page.
356+ *
357+ * Originally, reiser4 implemented truncate in a standard way
358+ * (vmtruncate() calls ->invalidatepage() on all truncated pages
359+ * first, then file system ->truncate() call-back is invoked).
360+ *
361+	 * This led to the problem when ->invalidatepage() was called on a
362+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
363+ * process. That is, truncate was bypassing transactions. To avoid
364+ * this, try_capture_page_to_invalidate() call was added here.
365+ *
366+ * After many troubles with vmtruncate() based truncate (including
367+ * races with flush, tail conversion, etc.) it was re-written in the
368+ * top-to-bottom style: items are killed in reiser4_cut_tree_object()
369+ * and pages belonging to extent are invalidated in kill_hook_extent().
370+ * So probably now additional call to capture is not needed here.
371+ */
372+
373+ assert("nikita-3137", PageLocked(page));
374+ assert("nikita-3138", !PageWriteback(page));
375+ inode = page->mapping->host;
376+
377+ /*
378+ * ->invalidatepage() should only be called for the unformatted
379+ * jnodes. Destruction of all other types of jnodes is performed
380+ * separately. But, during some corner cases (like handling errors
381+ * during mount) it is simpler to let ->invalidatepage to be called on
382+ * them. Check for this, and do nothing.
383+ */
384+ if (reiser4_get_super_fake(inode->i_sb) == inode)
385+ return;
386+ if (reiser4_get_cc_fake(inode->i_sb) == inode)
387+ return;
388+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
389+ return;
390+ assert("vs-1426", PagePrivate(page));
391+ assert("vs-1427",
392+ page->mapping == jnode_get_mapping(jnode_by_page(page)));
393+ assert("", jprivate(page) != NULL);
394+ assert("", ergo(inode_file_plugin(inode) !=
395+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
396+ offset == 0));
397+
398+ ctx = reiser4_init_context(inode->i_sb);
399+ if (IS_ERR(ctx))
400+ return;
401+
402+ node = jprivate(page);
403+ spin_lock_jnode(node);
404+ if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
405+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
406+		/* there is no need to capture */
407+ jref(node);
408+ JF_SET(node, JNODE_HEARD_BANSHEE);
409+ page_clear_jnode(page, node);
410+ reiser4_uncapture_jnode(node);
411+ unhash_unformatted_jnode(node);
412+ jput(node);
413+ reiser4_exit_context(ctx);
414+ return;
415+ }
416+ spin_unlock_jnode(node);
417+
418+ /* capture page being truncated. */
419+ ret = try_capture_page_to_invalidate(page);
420+ if (ret != 0)
421+ warning("nikita-3141", "Cannot capture: %i", ret);
422+
423+ if (offset == 0) {
424+ /* remove jnode from transaction and detach it from page. */
425+ jref(node);
426+ JF_SET(node, JNODE_HEARD_BANSHEE);
427+ /* page cannot be detached from jnode concurrently, because it
428+ * is locked */
429+ reiser4_uncapture_page(page);
430+
431+ /* this detaches page from jnode, so that jdelete will not try
432+ * to lock page which is already locked */
433+ spin_lock_jnode(node);
434+ page_clear_jnode(page, node);
435+ spin_unlock_jnode(node);
436+ unhash_unformatted_jnode(node);
437+
438+ jput(node);
439+ }
440+
441+ reiser4_exit_context(ctx);
442+}
443+
444+/* help function called from reiser4_releasepage(). It returns true if jnode
445+ * can be detached from its page and page released. */
446+int jnode_is_releasable(jnode * node /* node to check */ )
447+{
448+ assert("nikita-2781", node != NULL);
449+ assert_spin_locked(&(node->guard));
450+ assert_spin_locked(&(node->load));
451+
452+	/* if some thread is currently using the jnode page, the latter
453+	 * cannot be detached */
454+ if (atomic_read(&node->d_count) != 0) {
455+ return 0;
456+ }
457+
458+ assert("vs-1214", !jnode_is_loaded(node));
459+
460+ /*
461+ * can only release page if real block number is assigned to it. Simple
462+ * check for ->atom wouldn't do, because it is possible for node to be
463+	 * clean, not in an atom yet, and still have a fake block number. For
464+ * example, node just created in jinit_new().
465+ */
466+ if (reiser4_blocknr_is_fake(jnode_get_block(node)))
467+ return 0;
468+
469+ /*
470+ * pages prepared for write can not be released anyway, so avoid
471+ * detaching jnode from the page
472+ */
473+ if (JF_ISSET(node, JNODE_WRITE_PREPARED))
474+ return 0;
475+
476+ /*
477+ * dirty jnode cannot be released. It can however be submitted to disk
478+ * as part of early flushing, but only after getting flush-prepped.
479+ */
480+ if (JF_ISSET(node, JNODE_DIRTY))
481+ return 0;
482+
483+ /* overwrite set is only written by log writer. */
484+ if (JF_ISSET(node, JNODE_OVRWR))
485+ return 0;
486+
487+ /* jnode is already under writeback */
488+ if (JF_ISSET(node, JNODE_WRITEBACK))
489+ return 0;
490+
491+ /* don't flush bitmaps or journal records */
492+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
493+ return 0;
494+
495+ return 1;
496+}
497+
498+/*
499+ * ->releasepage method for reiser4
500+ *
501+ * This is called by VM scanner when it comes across clean page. What we have
502+ * to do here is to check whether page can really be released (freed that is)
503+ * and if so, detach jnode from it and remove page from the page cache.
504+ *
505+ * Check for releasability is done by releasable() function.
506+ */
507+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
508+{
509+ jnode *node;
510+
511+ assert("nikita-2257", PagePrivate(page));
512+ assert("nikita-2259", PageLocked(page));
513+ assert("nikita-2892", !PageWriteback(page));
514+ assert("nikita-3019", reiser4_schedulable());
515+
516+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
517+	   is not clear what to do in this case. A lot of deadlocks seem to be
518+ possible. */
519+
520+ node = jnode_by_page(page);
521+ assert("nikita-2258", node != NULL);
522+ assert("reiser4-4", page->mapping != NULL);
523+ assert("reiser4-5", page->mapping->host != NULL);
524+
525+ if (PageDirty(page))
526+ return 0;
527+
528+ /* extra page reference is used by reiser4 to protect
529+ * jnode<->page link from this ->releasepage(). */
530+ if (page_count(page) > 3)
531+ return 0;
532+
533+ /* releasable() needs jnode lock, because it looks at the jnode fields
534+ * and we need jload_lock here to avoid races with jload(). */
535+ spin_lock_jnode(node);
536+ spin_lock(&(node->load));
537+ if (jnode_is_releasable(node)) {
538+ struct address_space *mapping;
539+
540+ mapping = page->mapping;
541+ jref(node);
542+ /* there is no need to synchronize against
543+ * jnode_extent_write() here, because pages seen by
544+ * jnode_extent_write() are !releasable(). */
545+ page_clear_jnode(page, node);
546+ spin_unlock(&(node->load));
547+ spin_unlock_jnode(node);
548+
549+ /* we are under memory pressure so release jnode also. */
550+ jput(node);
551+
552+ return 1;
553+ } else {
554+ spin_unlock(&(node->load));
555+ spin_unlock_jnode(node);
556+ assert("nikita-3020", reiser4_schedulable());
557+ return 0;
558+ }
559+}
560+
561+/* Make Linus happy.
562+ Local variables:
563+ c-indentation-style: "K&R"
564+ mode-name: "LC"
565+ c-basic-offset: 8
566+ tab-width: 8
567+ fill-column: 120
568+ End:
569+*/
570diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.c linux-2.6.20/fs/reiser4/block_alloc.c
571--- linux-2.6.20.orig/fs/reiser4/block_alloc.c 1970-01-01 03:00:00.000000000 +0300
572+++ linux-2.6.20/fs/reiser4/block_alloc.c 2007-05-06 14:50:43.682970725 +0400
573@@ -0,0 +1,1137 @@
574+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
575+
576+#include "debug.h"
577+#include "dformat.h"
578+#include "plugin/plugin.h"
579+#include "txnmgr.h"
580+#include "znode.h"
581+#include "block_alloc.h"
582+#include "tree.h"
583+#include "super.h"
584+
585+#include <linux/types.h> /* for __u?? */
586+#include <linux/fs.h> /* for struct super_block */
587+#include <linux/spinlock.h>
588+
589+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
590+
591+/* We need to be able to reserve enough disk space to ensure that an atomic
592+ operation will have enough disk space to flush (see flush.c and
593+ http://namesys.com/v4/v4.html) and commit it once it is started.
594+
595+ In our design a call for reserving disk space may fail but not an actual
596+ block allocation.
597+
598+ All free blocks, already allocated blocks, and all kinds of reserved blocks
599+ are counted in different per-fs block counters.
600+
601+ A reiser4 super block's set of block counters currently is:
602+
603+ free -- free blocks,
604+ used -- already allocated blocks,
605+
606+ grabbed -- initially reserved for performing an fs operation, those blocks
607+ are taken from free blocks, then grabbed disk space leaks from grabbed
608+ blocks counter to other counters like "fake allocated", "flush
609+   reserved", "used"; the rest of the unused grabbed space is returned to
610+ free space at the end of fs operation;
611+
612+ fake allocated -- counts all nodes without real disk block numbers assigned,
613+ we have separate accounting for formatted and unformatted
614+ nodes (for easier debugging);
615+
616+ flush reserved -- disk space needed for flushing and committing an atom.
617+ Each dirty already allocated block could be written as a
618+ part of atom's overwrite set or as a part of atom's
619+ relocate set. In both case one additional block is needed,
620+   relocate set. In both cases one additional block is needed;
621+ new location for a relocated block.
622+
623+ In addition, blocks in some states are counted on per-thread and per-atom
624+ basis. A reiser4 context has a counter of blocks grabbed by this transaction
625+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
626+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
627+ blocks, which are reserved for flush processing and atom commit. */
628+
629+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
630+ number of blocks to grab for most expensive case of balancing when the leaf
631+ node we insert new item to gets split and new leaf node is allocated.
632+
633+ So, we need to grab blocks for
634+
635+ 1) one block for possible dirtying the node we insert an item to. That block
636+ would be used for node relocation at flush time or for allocating of a
637+ wandered one, it depends what will be a result (what set, relocate or
638+ overwrite the node gets assigned to) of the node processing by the flush
639+ algorithm.
640+
641+ 2) one block for either allocating a new node, or dirtying of right or left
642+ clean neighbor, only one case may happen.
643+
644+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
645+ node, and creation of new node. have I forgotten something? email me.
646+
647+ These grabbed blocks are counted in both reiser4 context "grabbed blocks"
648+ counter and in the fs-wide one (both ctx->grabbed_blocks and
649+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
650+ decremented by 2.
651+
652+ Suppose both two blocks were spent for dirtying of an already allocated clean
653+ node (one block went from "grabbed" to "flush reserved") and for new block
654+ allocating (one block went from "grabbed" to "fake allocated formatted").
655+
656+ Inserting of a child pointer to the parent node caused parent node to be
657+ split, the balancing code takes care about this grabbing necessary space
658+ immediately by calling reiser4_grab with BA_RESERVED flag set which means
659+ "can use the 5% reserved disk space".
660+
661+ At this moment insertion completes and grabbed blocks (if they were not used)
662+ should be returned to the free space counter.
663+
664+ However the atom life-cycle is not completed. The atom had one "flush
665+ reserved" block added by our insertion and the new fake allocated node is
666+ counted as a "fake allocated formatted" one. The atom has to be fully
667+ processed by flush before commit. Suppose that the flush moved the first,
668+ already allocated node to the atom's overwrite list, the new fake allocated
669+ node, obviously, went into the atom relocate set. The reiser4 flush
670+ allocates the new node using one unit from "fake allocated formatted"
671+ counter, the log writer uses one from "flush reserved" for wandered block
672+ allocation.
673+
674+ And, it is not the end. When the wandered block is deallocated after the
675+ atom gets fully played (see wander.c for term description), the disk space
676+ occupied for it is returned to free blocks. */
677+
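Compressed into code, the life cycle described above looks roughly like the sketch below; reiser4_grab_space() and all_grabbed2free() are defined later in this file, while the calling context and the count of 2 are purely illustrative:

	if (reiser4_grab_space(2, BA_CAN_COMMIT))	/* free -> grabbed (x2) */
		return RETERR(-ENOSPC);
	/* balancing dirties one node and allocates another: one unit moves
	 * from grabbed to flush reserved, the other to fake allocated */
	all_grabbed2free();	/* whatever was not consumed goes back to free */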
678+/* BLOCK NUMBERS */
679+
680+/* Any reiser4 node has a block number assigned to it. We use these numbers for
681+ indexing in hash tables, so if a block has not yet been assigned a location
682+ on disk we need to give it a temporary fake block number.
683+
684+ Current implementation of reiser4 uses 64-bit integers for block numbers. We
685+ use highest bit in 64-bit block number to distinguish fake and real block
686+   numbers. So, only 63 bits may be used for addressing real device
687+   blocks. The "fake" block number space is divided into subspaces of fake
688+ block numbers for data blocks and for shadow (working) bitmap blocks.
689+
690+ Fake block numbers for data blocks are generated by a cyclic counter, which
691+ gets incremented after each real block allocation. We assume that it is
692+ impossible to overload this counter during one transaction life. */
693+
694+/* Initialize a blocknr hint. */
695+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
696+{
697+ memset(hint, 0, sizeof(reiser4_blocknr_hint));
698+}
699+
700+/* Release any resources of a blocknr hint. */
701+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
702+{
703+ /* No resources should be freed in current blocknr_hint implementation. */
704+}
705+
706+/* see above for explanation of fake block number. */
707+/* Audited by: green(2002.06.11) */
708+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
709+{
710+ /* The reason for not simply returning result of '&' operation is that
711+ while return value is (possibly 32bit) int, the reiser4_block_nr is
712+ at least 64 bits long, and high bit (which is the only possible
713+ non zero bit after the masking) would be stripped off */
714+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
715+}
716+
717+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
718+   arithmetic. Mostly, they are isolated so as not to repeat the same assertions in
719+ several places. */
720+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
721+{
722+ BUG_ON(ctx->grabbed_blocks < count);
723+ assert("zam-527", ctx->grabbed_blocks >= count);
724+ ctx->grabbed_blocks -= count;
725+}
726+
727+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
728+{
729+ ctx->grabbed_blocks += count;
730+}
731+
732+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
733+{
734+ assert("zam-525", sbinfo->blocks_grabbed >= count);
735+ sbinfo->blocks_grabbed -= count;
736+}
737+
738+/* Decrease the counter of block reserved for flush in super block. */
739+static void
740+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
741+{
742+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
743+ sbinfo->blocks_flush_reserved -= count;
744+}
745+
746+static void
747+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
748+ reiser4_ba_flags_t flags)
749+{
750+ if (flags & BA_FORMATTED) {
751+ assert("zam-806", sbinfo->blocks_fake_allocated >= count);
752+ sbinfo->blocks_fake_allocated -= count;
753+ } else {
754+ assert("zam-528",
755+ sbinfo->blocks_fake_allocated_unformatted >= count);
756+ sbinfo->blocks_fake_allocated_unformatted -= count;
757+ }
758+}
759+
760+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
761+{
762+ assert("zam-530",
763+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
764+ sbinfo->blocks_used -= count;
765+}
766+
767+static void
768+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
769+{
770+ assert("edward-501", sbinfo->blocks_clustered >= count);
771+ sbinfo->blocks_clustered -= count;
772+}
773+
774+/* Increase the counter of block reserved for flush in atom. */
775+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
776+{
777+ assert("zam-772", atom != NULL);
778+ assert_spin_locked(&(atom->alock));
779+ atom->flush_reserved += count;
780+}
781+
782+/* Decrease the counter of block reserved for flush in atom. */
783+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
784+{
785+ assert("zam-774", atom != NULL);
786+ assert_spin_locked(&(atom->alock));
787+ assert("nikita-2790", atom->flush_reserved >= count);
788+ atom->flush_reserved -= count;
789+}
790+
791+/* super block has 6 counters: free, used, grabbed, fake allocated
792+   (formatted and unformatted) and flush reserved. Their sum must equal
793+   the number of blocks on the device. This function checks that. */
794+int reiser4_check_block_counters(const struct super_block *super)
795+{
796+ __u64 sum;
797+
798+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
799+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
800+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
801+ reiser4_clustered_blocks(super);
802+ if (reiser4_block_count(super) != sum) {
803+ printk("super block counters: "
804+ "used %llu, free %llu, "
805+		       "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
806+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
807+ (unsigned long long)reiser4_data_blocks(super),
808+ (unsigned long long)reiser4_free_blocks(super),
809+ (unsigned long long)reiser4_grabbed_blocks(super),
810+ (unsigned long long)reiser4_fake_allocated(super),
811+ (unsigned long long)
812+ reiser4_fake_allocated_unformatted(super),
813+ (unsigned long long)reiser4_flush_reserved(super),
814+ (unsigned long long)reiser4_clustered_blocks(super),
815+ (unsigned long long)sum,
816+ (unsigned long long)reiser4_block_count(super));
817+ return 0;
818+ }
819+ return 1;
820+}
821+
822+/* Adjust "working" free blocks counter for number of blocks we are going to
823+ allocate. Record number of grabbed blocks in fs-wide and per-thread
824+ counters. This function should be called before bitmap scanning or
825+ allocating fake block numbers
826+
827+ @super -- pointer to reiser4 super block;
828+ @count -- number of blocks we reserve;
829+
830+   @return -- 0 on success, -ENOSPC if all
831+ free blocks are preserved or already allocated.
832+*/
833+
834+static int
835+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
836+{
837+ __u64 free_blocks;
838+ int ret = 0, use_reserved = flags & BA_RESERVED;
839+ reiser4_super_info_data *sbinfo;
840+
841+ assert("vs-1276", ctx == get_current_context());
842+
843+ /* Do not grab anything on ro-mounted fs. */
844+ if (rofs_super(ctx->super)) {
845+ ctx->grab_enabled = 0;
846+ return 0;
847+ }
848+
849+ sbinfo = get_super_private(ctx->super);
850+
851+ spin_lock_reiser4_super(sbinfo);
852+
853+ free_blocks = sbinfo->blocks_free;
854+
855+ if ((use_reserved && free_blocks < count) ||
856+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
857+ ret = RETERR(-ENOSPC);
858+ goto unlock_and_ret;
859+ }
860+
861+ add_to_ctx_grabbed(ctx, count);
862+
863+ sbinfo->blocks_grabbed += count;
864+ sbinfo->blocks_free -= count;
865+
866+#if REISER4_DEBUG
867+ if (ctx->grabbed_initially == 0)
868+ ctx->grabbed_initially = count;
869+#endif
870+
871+ assert("nikita-2986", reiser4_check_block_counters(ctx->super));
872+
873+ /* disable grab space in current context */
874+ ctx->grab_enabled = 0;
875+
876+ unlock_and_ret:
877+ spin_unlock_reiser4_super(sbinfo);
878+
879+ return ret;
880+}
881+
882+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
883+{
884+ int ret;
885+ reiser4_context *ctx;
886+
887+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
888+ lock_stack_isclean(get_current_lock_stack
889+ ())));
890+ ctx = get_current_context();
891+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
892+ return 0;
893+ }
894+
895+ ret = reiser4_grab(ctx, count, flags);
896+ if (ret == -ENOSPC) {
897+		/* Try to commit all transactions if the BA_CAN_COMMIT flag is present */
898+ /* Trying to commit the all transactions if BA_CAN_COMMIT flag present */
899+ if (flags & BA_CAN_COMMIT) {
900+ txnmgr_force_commit_all(ctx->super, 0);
901+ ctx->grab_enabled = 1;
902+ ret = reiser4_grab(ctx, count, flags);
903+ }
904+ }
905+ /*
906+ * allocation from reserved pool cannot fail. This is severe error.
907+ */
908+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
909+ return ret;
910+}
911+
912+/*
913+ * SPACE RESERVED FOR UNLINK/TRUNCATE
914+ *
915+ * Unlink and truncate require space in transaction (to update stat data, at
916+ * least). But we don't want rm(1) to fail with "No space on device" error.
917+ *
918+ * Solution is to reserve 5% of disk space for truncates and
919+ * unlinks. Specifically, normal space grabbing requests don't grab space from
920+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
921+ * drain it. Per super block delete mutex is used to allow only one
922+ * thread at a time to grab from reserved area.
923+ *
924+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
925+ * flag.
926+ *
927+ */
928+
929+int reiser4_grab_reserved(struct super_block *super,
930+ __u64 count, reiser4_ba_flags_t flags)
931+{
932+ reiser4_super_info_data *sbinfo = get_super_private(super);
933+
934+ assert("nikita-3175", flags & BA_CAN_COMMIT);
935+
936+ /* Check the delete mutex already taken by us, we assume that
937+ * reading of machine word is atomic. */
938+ if (sbinfo->delete_mutex_owner == current) {
939+ if (reiser4_grab_space
940+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
941+ warning("zam-1003",
942+ "nested call of grab_reserved fails count=(%llu)",
943+ (unsigned long long)count);
944+ reiser4_release_reserved(super);
945+ return RETERR(-ENOSPC);
946+ }
947+ return 0;
948+ }
949+
950+ if (reiser4_grab_space(count, flags)) {
951+ mutex_lock(&sbinfo->delete_mutex);
952+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
953+ sbinfo->delete_mutex_owner = current;
954+
955+ if (reiser4_grab_space(count, flags | BA_RESERVED)) {
956+ warning("zam-833",
957+ "reserved space is not enough (%llu)",
958+ (unsigned long long)count);
959+ reiser4_release_reserved(super);
960+ return RETERR(-ENOSPC);
961+ }
962+ }
963+ return 0;
964+}
965+
966+void reiser4_release_reserved(struct super_block *super)
967+{
968+ reiser4_super_info_data *info;
969+
970+ info = get_super_private(super);
971+ if (info->delete_mutex_owner == current) {
972+ info->delete_mutex_owner = NULL;
973+ mutex_unlock(&info->delete_mutex);
974+ }
975+}
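The unlink/truncate pattern described in the comment above then reduces to something like the following sketch, where the inode variable and the estimated block count are hypothetical placeholders:

	if (reiser4_grab_reserved(inode->i_sb, estimated_blocks, BA_CAN_COMMIT))
		return RETERR(-ENOSPC);
	/* ... cut items, update stat data ... */
	reiser4_release_reserved(inode->i_sb);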
976+
977+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
978+{
979+ reiser4_context *ctx;
980+ reiser4_super_info_data *sbinfo;
981+
982+ ctx = get_current_context();
983+ sub_from_ctx_grabbed(ctx, count);
984+
985+ sbinfo = get_super_private(ctx->super);
986+ spin_lock_reiser4_super(sbinfo);
987+
988+ sub_from_sb_grabbed(sbinfo, count);
989+ /* return sbinfo locked */
990+ return sbinfo;
991+}
992+
993+/* is called after @count fake block numbers are allocated and pointer to
994+ those blocks are inserted into tree. */
995+static void grabbed2fake_allocated_formatted(void)
996+{
997+ reiser4_super_info_data *sbinfo;
998+
999+ sbinfo = grabbed2fake_allocated_head(1);
1000+ sbinfo->blocks_fake_allocated++;
1001+
1002+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
1003+
1004+ spin_unlock_reiser4_super(sbinfo);
1005+}
1006+
1007+/**
1008+ * grabbed2fake_allocated_unformatted
1009+ * @count:
1010+ *
1011+ */
1012+static void grabbed2fake_allocated_unformatted(int count)
1013+{
1014+ reiser4_super_info_data *sbinfo;
1015+
1016+ sbinfo = grabbed2fake_allocated_head(count);
1017+ sbinfo->blocks_fake_allocated_unformatted += count;
1018+
1019+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
1020+
1021+ spin_unlock_reiser4_super(sbinfo);
1022+}
1023+
1024+void grabbed2cluster_reserved(int count)
1025+{
1026+ reiser4_context *ctx;
1027+ reiser4_super_info_data *sbinfo;
1028+
1029+ ctx = get_current_context();
1030+ sub_from_ctx_grabbed(ctx, count);
1031+
1032+ sbinfo = get_super_private(ctx->super);
1033+ spin_lock_reiser4_super(sbinfo);
1034+
1035+ sub_from_sb_grabbed(sbinfo, count);
1036+ sbinfo->blocks_clustered += count;
1037+
1038+ assert("edward-504", reiser4_check_block_counters(ctx->super));
1039+
1040+ spin_unlock_reiser4_super(sbinfo);
1041+}
1042+
1043+void cluster_reserved2grabbed(int count)
1044+{
1045+ reiser4_context *ctx;
1046+ reiser4_super_info_data *sbinfo;
1047+
1048+ ctx = get_current_context();
1049+
1050+ sbinfo = get_super_private(ctx->super);
1051+ spin_lock_reiser4_super(sbinfo);
1052+
1053+ sub_from_cluster_reserved(sbinfo, count);
1054+ sbinfo->blocks_grabbed += count;
1055+
1056+ assert("edward-505", reiser4_check_block_counters(ctx->super));
1057+
1058+ spin_unlock_reiser4_super(sbinfo);
1059+ add_to_ctx_grabbed(ctx, count);
1060+}
1061+
1062+void cluster_reserved2free(int count)
1063+{
1064+ reiser4_context *ctx;
1065+ reiser4_super_info_data *sbinfo;
1066+
1067+ ctx = get_current_context();
1068+ sbinfo = get_super_private(ctx->super);
1069+
1070+ cluster_reserved2grabbed(count);
1071+ grabbed2free(ctx, sbinfo, count);
1072+}
1073+
1074+static DEFINE_SPINLOCK(fake_lock);
1075+static reiser4_block_nr fake_gen = 0;
1076+
1077+/**
1078+ * assign_fake_blocknr
1079+ * @blocknr:
1080+ * @count:
1081+ *
1082+ * Obtain a fake block number for new node which will be used to refer to
1083+ * this newly allocated node until real allocation is done.
1084+ */
1085+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1086+{
1087+ spin_lock(&fake_lock);
1088+ *blocknr = fake_gen;
1089+ fake_gen += count;
1090+ spin_unlock(&fake_lock);
1091+
1092+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1093+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1094+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1095+ assert("zam-394", zlook(current_tree, blocknr) == NULL);
1096+}
1097+
1098+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1099+{
1100+ assign_fake_blocknr(blocknr, 1);
1101+ grabbed2fake_allocated_formatted();
1102+ return 0;
1103+}
1104+
1105+/**
1106+ * fake_blocknrs_unformatted
1107+ * @count: number of fake numbers to get
1108+ *
1109+ * Allocates @count fake block numbers which will be assigned to jnodes
1110+ */
1111+reiser4_block_nr fake_blocknr_unformatted(int count)
1112+{
1113+ reiser4_block_nr blocknr;
1114+
1115+ assign_fake_blocknr(&blocknr, count);
1116+ grabbed2fake_allocated_unformatted(count);
1117+
1118+ return blocknr;
1119+}
1120+
1121+/* adjust sb block counters, if real (on-disk) block allocation immediately
1122+ follows grabbing of free disk space. */
1123+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1124+ __u64 count)
1125+{
1126+ sub_from_ctx_grabbed(ctx, count);
1127+
1128+ spin_lock_reiser4_super(sbinfo);
1129+
1130+ sub_from_sb_grabbed(sbinfo, count);
1131+ sbinfo->blocks_used += count;
1132+
1133+ assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1134+
1135+ spin_unlock_reiser4_super(sbinfo);
1136+}
1137+
1138+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1139+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1140+ reiser4_ba_flags_t flags)
1141+{
1142+ spin_lock_reiser4_super(sbinfo);
1143+
1144+ sub_from_sb_fake_allocated(sbinfo, count, flags);
1145+ sbinfo->blocks_used += count;
1146+
1147+ assert("nikita-2680",
1148+ reiser4_check_block_counters(reiser4_get_current_sb()));
1149+
1150+ spin_unlock_reiser4_super(sbinfo);
1151+}
1152+
1153+static void flush_reserved2used(txn_atom * atom, __u64 count)
1154+{
1155+ reiser4_super_info_data *sbinfo;
1156+
1157+ assert("zam-787", atom != NULL);
1158+ assert_spin_locked(&(atom->alock));
1159+
1160+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1161+
1162+ sbinfo = get_current_super_private();
1163+ spin_lock_reiser4_super(sbinfo);
1164+
1165+ sub_from_sb_flush_reserved(sbinfo, count);
1166+ sbinfo->blocks_used += count;
1167+
1168+ assert("zam-789",
1169+ reiser4_check_block_counters(reiser4_get_current_sb()));
1170+
1171+ spin_unlock_reiser4_super(sbinfo);
1172+}
1173+
1174+/* update the per fs blocknr hint default value. */
1175+void
1176+update_blocknr_hint_default(const struct super_block *s,
1177+ const reiser4_block_nr * block)
1178+{
1179+ reiser4_super_info_data *sbinfo = get_super_private(s);
1180+
1181+ assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1182+
1183+ spin_lock_reiser4_super(sbinfo);
1184+ if (*block < sbinfo->block_count) {
1185+ sbinfo->blocknr_hint_default = *block;
1186+ } else {
1187+ warning("zam-676",
1188+ "block number %llu is too large to be used in a blocknr hint\n",
1189+ (unsigned long long)*block);
1190+ dump_stack();
1191+ DEBUGON(1);
1192+ }
1193+ spin_unlock_reiser4_super(sbinfo);
1194+}
1195+
1196+/* get current value of the default blocknr hint. */
1197+void get_blocknr_hint_default(reiser4_block_nr * result)
1198+{
1199+ reiser4_super_info_data *sbinfo = get_current_super_private();
1200+
1201+ spin_lock_reiser4_super(sbinfo);
1202+ *result = sbinfo->blocknr_hint_default;
1203+ assert("zam-677", *result < sbinfo->block_count);
1204+ spin_unlock_reiser4_super(sbinfo);
1205+}
1206+
1207+/* Allocate "real" disk blocks by calling a proper space allocation plugin
1208+ * method. Blocks are allocated in one contiguous disk region. The plugin
1209+ * independent part accounts for blocks by subtracting the allocated amount from the
1210+ * grabbed or fake block counter and adding the same amount to the counter of allocated
1211+ * blocks.
1212+ *
1213+ * @hint -- a reiser4 blocknr hint object which contains further block
1214+ * allocation hints and parameters (search start, a stage of block
1215+ * which will be mapped to disk, etc.),
1216+ * @blk -- an out parameter for the beginning of the allocated region,
1217+ * @len -- in/out parameter, it should contain the maximum number of allocated
1218+ * blocks, after block allocation completes, it contains the length of
1219+ * allocated disk region.
1220+ * @flags -- see reiser4_ba_flags_t description.
1221+ *
1222+ * @return -- 0 if success, error code otherwise.
1223+ */
1224+int
1225+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1226+ reiser4_block_nr * len, reiser4_ba_flags_t flags)
1227+{
1228+ __u64 needed = *len;
1229+ reiser4_context *ctx;
1230+ reiser4_super_info_data *sbinfo;
1231+ int ret;
1232+
1233+ assert("zam-986", hint != NULL);
1234+
1235+ ctx = get_current_context();
1236+ sbinfo = get_super_private(ctx->super);
1237+
1238+ /* For write-optimized data we use default search start value, which is
1239+ * close to last write location. */
1240+ if (flags & BA_USE_DEFAULT_SEARCH_START) {
1241+ get_blocknr_hint_default(&hint->blk);
1242+ }
1243+
1244+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1245+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1246+ if (hint->block_stage == BLOCK_NOT_COUNTED) {
1247+ ret = reiser4_grab_space_force(*len, flags);
1248+ if (ret != 0)
1249+ return ret;
1250+ }
1251+
1252+ ret =
1253+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1254+ hint, (int)needed, blk, len);
1255+
1256+ if (!ret) {
1257+ assert("zam-680", *blk < reiser4_block_count(ctx->super));
1258+ assert("zam-681",
1259+ *blk + *len <= reiser4_block_count(ctx->super));
1260+
1261+ if (flags & BA_PERMANENT) {
1262+ /* we assume that current atom exists at this moment */
1263+ txn_atom *atom = get_current_atom_locked();
1264+ atom->nr_blocks_allocated += *len;
1265+ spin_unlock_atom(atom);
1266+ }
1267+
1268+ switch (hint->block_stage) {
1269+ case BLOCK_NOT_COUNTED:
1270+ case BLOCK_GRABBED:
1271+ grabbed2used(ctx, sbinfo, *len);
1272+ break;
1273+ case BLOCK_UNALLOCATED:
1274+ fake_allocated2used(sbinfo, *len, flags);
1275+ break;
1276+ case BLOCK_FLUSH_RESERVED:
1277+ {
1278+ txn_atom *atom = get_current_atom_locked();
1279+ flush_reserved2used(atom, *len);
1280+ spin_unlock_atom(atom);
1281+ }
1282+ break;
1283+ default:
1284+ impossible("zam-531", "wrong block stage");
1285+ }
1286+ } else {
1287+ assert("zam-821",
1288+ ergo(hint->max_dist == 0
1289+ && !hint->backward, ret != -ENOSPC));
1290+ if (hint->block_stage == BLOCK_NOT_COUNTED)
1291+ grabbed2free(ctx, sbinfo, needed);
1292+ }
1293+
1294+ return ret;
1295+}
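A sketch of a call to the interface implemented above, with illustrative values; the hint init/done pair, BLOCK_GRABBED and BA_PERMANENT are real names from this file, the surrounding context is hypothetical:

	reiser4_blocknr_hint hint;
	reiser4_block_nr blk, len = 16;
	int ret;

	reiser4_blocknr_hint_init(&hint);
	hint.block_stage = BLOCK_GRABBED;	/* space was grabbed beforehand */
	ret = reiser4_alloc_blocks(&hint, &blk, &len, BA_PERMANENT);
	reiser4_blocknr_hint_done(&hint);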
1296+
1297+/* used -> fake_allocated -> grabbed -> free */
1298+
1299+/* adjust sb block counters when @count unallocated blocks get unmapped from
1300+ disk */
1301+static void
1302+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1303+ int formatted)
1304+{
1305+ spin_lock_reiser4_super(sbinfo);
1306+
1307+ if (formatted)
1308+ sbinfo->blocks_fake_allocated += count;
1309+ else
1310+ sbinfo->blocks_fake_allocated_unformatted += count;
1311+
1312+ sub_from_sb_used(sbinfo, count);
1313+
1314+ assert("nikita-2681",
1315+ reiser4_check_block_counters(reiser4_get_current_sb()));
1316+
1317+ spin_unlock_reiser4_super(sbinfo);
1318+}
1319+
1320+static void
1321+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1322+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1323+{
1324+ assert("nikita-2791", atom != NULL);
1325+ assert_spin_locked(&(atom->alock));
1326+
1327+ add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1328+
1329+ spin_lock_reiser4_super(sbinfo);
1330+
1331+ sbinfo->blocks_flush_reserved += count;
1332+ /*add_to_sb_flush_reserved(sbinfo, count); */
1333+ sub_from_sb_used(sbinfo, count);
1334+
1335+ assert("nikita-2681",
1336+ reiser4_check_block_counters(reiser4_get_current_sb()));
1337+
1338+ spin_unlock_reiser4_super(sbinfo);
1339+}
1340+
1341+/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
1342+static void
1343+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1344+ __u64 count, reiser4_ba_flags_t flags)
1345+{
1346+ add_to_ctx_grabbed(ctx, count);
1347+
1348+ spin_lock_reiser4_super(sbinfo);
1349+
1350+ assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1351+
1352+ sbinfo->blocks_grabbed += count;
1353+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1354+
1355+ assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1356+
1357+ spin_unlock_reiser4_super(sbinfo);
1358+}
1359+
1360+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1361+{
1362+ reiser4_context *ctx;
1363+ reiser4_super_info_data *sbinfo;
1364+
1365+ ctx = get_current_context();
1366+ sbinfo = get_super_private(ctx->super);
1367+
1368+ fake_allocated2grabbed(ctx, sbinfo, count, flags);
1369+ grabbed2free(ctx, sbinfo, count);
1370+}
1371+
1372+void grabbed2free_mark(__u64 mark)
1373+{
1374+ reiser4_context *ctx;
1375+ reiser4_super_info_data *sbinfo;
1376+
1377+ ctx = get_current_context();
1378+ sbinfo = get_super_private(ctx->super);
1379+
1380+ assert("nikita-3007", (__s64) mark >= 0);
1381+ assert("nikita-3006", ctx->grabbed_blocks >= mark);
1382+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1383+}
1384+
1385+/**
1386+ * grabbed2free - adjust grabbed and free block counters
1387+ * @ctx: context to update grabbed block counter of
1388+ * @sbinfo: super block to update grabbed and free block counters of
1389+ * @count: number of blocks to adjust counters by
1390+ *
1391+ * Decreases context's and per filesystem's counters of grabbed
1392+ * blocks. Increases per filesystem's counter of free blocks.
1393+ */
1394+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1395+ __u64 count)
1396+{
1397+ sub_from_ctx_grabbed(ctx, count);
1398+
1399+ spin_lock_reiser4_super(sbinfo);
1400+
1401+ sub_from_sb_grabbed(sbinfo, count);
1402+ sbinfo->blocks_free += count;
1403+ assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1404+
1405+ spin_unlock_reiser4_super(sbinfo);
1406+}
1407+
1408+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1409+{
1410+ reiser4_context *ctx;
1411+ reiser4_super_info_data *sbinfo;
1412+
1413+ assert("vs-1095", atom);
1414+
1415+ ctx = get_current_context();
1416+ sbinfo = get_super_private(ctx->super);
1417+
1418+ sub_from_ctx_grabbed(ctx, count);
1419+
1420+ add_to_atom_flush_reserved_nolock(atom, count);
1421+
1422+ spin_lock_reiser4_super(sbinfo);
1423+
1424+ sbinfo->blocks_flush_reserved += count;
1425+ sub_from_sb_grabbed(sbinfo, count);
1426+
1427+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
1428+
1429+ spin_unlock_reiser4_super(sbinfo);
1430+}
1431+
1432+void grabbed2flush_reserved(__u64 count)
1433+{
1434+ txn_atom *atom = get_current_atom_locked();
1435+
1436+ grabbed2flush_reserved_nolock(atom, count);
1437+
1438+ spin_unlock_atom(atom);
1439+}
1440+
1441+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1442+{
1443+ reiser4_context *ctx;
1444+ reiser4_super_info_data *sbinfo;
1445+
1446+ assert("nikita-2788", atom != NULL);
1447+ assert_spin_locked(&(atom->alock));
1448+
1449+ ctx = get_current_context();
1450+ sbinfo = get_super_private(ctx->super);
1451+
1452+ add_to_ctx_grabbed(ctx, count);
1453+
1454+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1455+
1456+ spin_lock_reiser4_super(sbinfo);
1457+
1458+ sbinfo->blocks_grabbed += count;
1459+ sub_from_sb_flush_reserved(sbinfo, count);
1460+
1461+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
1462+
1463+ spin_unlock_reiser4_super(sbinfo);
1464+}
1465+
1466+/**
1467+ * all_grabbed2free - releases all blocks grabbed in context
1468+ *
1469+ * Decreases context's and super block's grabbed block counters by number of
1470+ * blocks grabbed by current context and increases super block's free block
1471+ * counter correspondingly.
1472+ */
1473+void all_grabbed2free(void)
1474+{
1475+ reiser4_context *ctx = get_current_context();
1476+
1477+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1478+}
1479+
1480+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1481+ after freeing, @count blocks become "grabbed". */
1482+static void
1483+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1484+ __u64 count)
1485+{
1486+ add_to_ctx_grabbed(ctx, count);
1487+
1488+ spin_lock_reiser4_super(sbinfo);
1489+
1490+ sbinfo->blocks_grabbed += count;
1491+ sub_from_sb_used(sbinfo, count);
1492+
1493+ assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1494+
1495+ spin_unlock_reiser4_super(sbinfo);
1496+}
1497+
1498+/* this used to be done through used2grabbed and grabbed2free*/
1499+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1500+{
1501+ spin_lock_reiser4_super(sbinfo);
1502+
1503+ sbinfo->blocks_free += count;
1504+ sub_from_sb_used(sbinfo, count);
1505+
1506+ assert("nikita-2685",
1507+ reiser4_check_block_counters(reiser4_get_current_sb()));
1508+
1509+ spin_unlock_reiser4_super(sbinfo);
1510+}
1511+
1512+#if REISER4_DEBUG
1513+
1514+/* check "allocated" state of given block range */
1515+static void
1516+reiser4_check_blocks(const reiser4_block_nr * start,
1517+ const reiser4_block_nr * len, int desired)
1518+{
1519+ sa_check_blocks(start, len, desired);
1520+}
1521+
1522+/* check "allocated" state of given block */
1523+void reiser4_check_block(const reiser4_block_nr * block, int desired)
1524+{
1525+ const reiser4_block_nr one = 1;
1526+
1527+ reiser4_check_blocks(block, &one, desired);
1528+}
1529+
1530+#endif
1531+
1532+/* Blocks deallocation function may do an actual deallocation through space
1533+ plugin allocation or store deleted block numbers in atom's delete_set data
1534+   structure, depending on the @defer parameter. */
1535+
1536+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
1537+ will be deleted from WORKING bitmap. They might be just unmapped from disk, or
1538+ freed but disk space is still grabbed by current thread, or these blocks must
1539+ not be counted in any reiser4 sb block counters, see block_stage_t comment */
1540+
1541+/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
1542+ distinguish blocks allocated for unformatted and formatted nodes */
1543+
1544+int
1545+reiser4_dealloc_blocks(const reiser4_block_nr * start,
1546+ const reiser4_block_nr * len,
1547+ block_stage_t target_stage, reiser4_ba_flags_t flags)
1548+{
1549+ txn_atom *atom = NULL;
1550+ int ret;
1551+ reiser4_context *ctx;
1552+ reiser4_super_info_data *sbinfo;
1553+
1554+ ctx = get_current_context();
1555+ sbinfo = get_super_private(ctx->super);
1556+
1557+ if (REISER4_DEBUG) {
1558+ assert("zam-431", *len != 0);
1559+ assert("zam-432", *start != 0);
1560+ assert("zam-558", !reiser4_blocknr_is_fake(start));
1561+
1562+ spin_lock_reiser4_super(sbinfo);
1563+ assert("zam-562", *start < sbinfo->block_count);
1564+ spin_unlock_reiser4_super(sbinfo);
1565+ }
1566+
1567+ if (flags & BA_DEFER) {
1568+ blocknr_set_entry *bsep = NULL;
1569+
1570+ /* storing deleted block numbers in a blocknr set
1571+ datastructure for further actual deletion */
1572+ do {
1573+ atom = get_current_atom_locked();
1574+ assert("zam-430", atom != NULL);
1575+
1576+ ret =
1577+ blocknr_set_add_extent(atom, &atom->delete_set,
1578+ &bsep, start, len);
1579+
1580+ if (ret == -ENOMEM)
1581+ return ret;
1582+
1583+ /* This loop might spin at most two times */
1584+ } while (ret == -E_REPEAT);
1585+
1586+ assert("zam-477", ret == 0);
1587+ assert("zam-433", atom != NULL);
1588+
1589+ spin_unlock_atom(atom);
1590+
1591+ } else {
1592+ assert("zam-425", get_current_super_private() != NULL);
1593+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1594+ *start, *len);
1595+
1596+ if (flags & BA_PERMANENT) {
1597+			/* These blocks were counted as allocated; we have to revert that
1598+			 * if the allocation is discarded. */
1599+ txn_atom *atom = get_current_atom_locked();
1600+ atom->nr_blocks_allocated -= *len;
1601+ spin_unlock_atom(atom);
1602+ }
1603+
1604+ switch (target_stage) {
1605+ case BLOCK_NOT_COUNTED:
1606+ assert("vs-960", flags & BA_FORMATTED);
1607+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1608+ used2free(sbinfo, *len);
1609+ break;
1610+
1611+ case BLOCK_GRABBED:
1612+ used2grabbed(ctx, sbinfo, *len);
1613+ break;
1614+
1615+ case BLOCK_UNALLOCATED:
1616+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1617+ break;
1618+
1619+ case BLOCK_FLUSH_RESERVED:{
1620+ txn_atom *atom;
1621+
1622+ atom = get_current_atom_locked();
1623+ used2flush_reserved(sbinfo, atom, *len,
1624+ flags & BA_FORMATTED);
1625+ spin_unlock_atom(atom);
1626+ break;
1627+ }
1628+ default:
1629+ impossible("zam-532", "wrong block stage");
1630+ }
1631+ }
1632+
1633+ return 0;
1634+}
1635+
1636+/* wrappers for block allocator plugin methods */
1637+int reiser4_pre_commit_hook(void)
1638+{
1639+ assert("zam-502", get_current_super_private() != NULL);
1640+ sa_pre_commit_hook();
1641+ return 0;
1642+}
1643+
1644+/* an actor which applies delete set to block allocator data */
1645+static int
1646+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1647+ const reiser4_block_nr * b, void *data UNUSED_ARG)
1648+{
1649+ reiser4_context *ctx;
1650+ reiser4_super_info_data *sbinfo;
1651+
1652+ __u64 len = 1;
1653+
1654+ ctx = get_current_context();
1655+ sbinfo = get_super_private(ctx->super);
1656+
1657+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1658+ assert("zam-552", sbinfo != NULL);
1659+
1660+ if (b != NULL)
1661+ len = *b;
1662+
1663+ if (REISER4_DEBUG) {
1664+ spin_lock_reiser4_super(sbinfo);
1665+
1666+ assert("zam-554", *a < reiser4_block_count(ctx->super));
1667+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1668+
1669+ spin_unlock_reiser4_super(sbinfo);
1670+ }
1671+
1672+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1673+ /* adjust sb block counters */
1674+ used2free(sbinfo, len);
1675+ return 0;
1676+}
1677+
1678+void reiser4_post_commit_hook(void)
1679+{
1680+ txn_atom *atom;
1681+
1682+ atom = get_current_atom_locked();
1683+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1684+ spin_unlock_atom(atom);
1685+
1686+ /* do the block deallocation which was deferred
1687+ until commit is done */
1688+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1689+
1690+ assert("zam-504", get_current_super_private() != NULL);
1691+ sa_post_commit_hook();
1692+}
1693+
1694+void reiser4_post_write_back_hook(void)
1695+{
1696+ assert("zam-504", get_current_super_private() != NULL);
1697+
1698+ sa_post_commit_hook();
1699+}
1700+
1701+/*
1702+ Local variables:
1703+ c-indentation-style: "K&R"
1704+ mode-name: "LC"
1705+ c-basic-offset: 8
1706+ tab-width: 8
1707+ fill-column: 120
1708+ scroll-step: 1
1709+ End:
1710+*/
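
For illustration only (this sketch is not part of the patch): the deallocation path above either records the extent in the atom's delete set when BA_DEFER is passed (the set is later drained by reiser4_post_commit_hook() through apply_dset()), or returns the space immediately and adjusts the superblock counters according to the target stage. The caller below is hypothetical, assumes a reiser4 context and atom are already established, and uses nothing beyond the interface declared in block_alloc.h further down.

static int example_free_formatted_block(const reiser4_block_nr *blk, int defer)
{
        /* BLOCK_NOT_COUNTED with BA_FORMATTED takes the used2free branch above;
           adding BA_DEFER instead queues the extent in the atom's delete set
           until the transaction commits. */
        reiser4_ba_flags_t flags = BA_FORMATTED | (defer ? BA_DEFER : 0);

        return reiser4_dealloc_block(blk, BLOCK_NOT_COUNTED, flags);
}
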
1711diff -urN linux-2.6.20.orig/fs/reiser4/block_alloc.h linux-2.6.20/fs/reiser4/block_alloc.h
1712--- linux-2.6.20.orig/fs/reiser4/block_alloc.h 1970-01-01 03:00:00.000000000 +0300
1713+++ linux-2.6.20/fs/reiser4/block_alloc.h 2007-05-06 14:50:43.682970725 +0400
1714@@ -0,0 +1,175 @@
1715+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1716+
1717+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1718+#define __FS_REISER4_BLOCK_ALLOC_H__
1719+
1720+#include "dformat.h"
1721+#include "forward.h"
1722+
1723+#include <linux/types.h> /* for __u?? */
1724+#include <linux/fs.h>
1725+
1726+/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
1727+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
1728+/* Mask which isolates a type of object this fake block number was assigned to */
1729+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1730+
1731+/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1732+   against these two values to tell whether the object is an unallocated node or a
1733+   bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
1734+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
1735+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
1736+
1737+/* specification how block allocation was counted in sb block counters */
1738+typedef enum {
1739+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
1740+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
1741+ of this block */
1742+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
1743+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
1744+ ( unallocated formatted or unformatted
1745+ node) */
1746+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
1747+ number assigned */
1748+} block_stage_t;
1749+
1750+/* a hint for block allocator */
1751+struct reiser4_blocknr_hint {
1752+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
1753+ is to prevent jnode_flush() calls from interleaving allocations on the same
1754+ bitmap, once a hint is established. */
1755+
1756+ /* search start hint */
1757+ reiser4_block_nr blk;
1758+ /* if not zero, it is a region size we search for free blocks in */
1759+ reiser4_block_nr max_dist;
1760+ /* level for allocation; it may be useful to have branch-level and higher
1761+    levels write-optimized. */
1762+ tree_level level;
1763+ /* block allocator assumes that blocks, which will be mapped to disk,
1764+ are in this specified block_stage */
1765+ block_stage_t block_stage;
1766+ /* If direction = 1 allocate blocks in backward direction from the end
1767+ * of disk to the beginning of disk. */
1768+ unsigned int backward:1;
1769+
1770+};
1771+
1772+/* These flags control block allocation/deallocation behavior */
1773+enum reiser4_ba_flags {
1774+ /* do allocations from the reserved (5%) area */
1775+ BA_RESERVED = (1 << 0),
1776+
1777+ /* block allocator can do commit trying to recover free space */
1778+ BA_CAN_COMMIT = (1 << 1),
1779+
1780+ /* if operation will be applied to formatted block */
1781+ BA_FORMATTED = (1 << 2),
1782+
1783+ /* defer actual block freeing until transaction commit */
1784+ BA_DEFER = (1 << 3),
1785+
1786+ /* allocate blocks for permanent fs objects (formatted or unformatted), not
1787+    wandered or log blocks */
1788+ BA_PERMANENT = (1 << 4),
1789+
1790+ /* grab space even if it was disabled */
1791+ BA_FORCE = (1 << 5),
1792+
1793+ /* use default start value for free blocks search. */
1794+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1795+};
1796+
1797+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1798+
1799+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1800+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1801+extern void update_blocknr_hint_default(const struct super_block *,
1802+ const reiser4_block_nr *);
1803+extern void get_blocknr_hint_default(reiser4_block_nr *);
1804+
1805+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1806+
1807+int assign_fake_blocknr_formatted(reiser4_block_nr *);
1808+reiser4_block_nr fake_blocknr_unformatted(int);
1809+
1810+/* free -> grabbed -> fake_allocated -> used */
1811+
1812+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1813+void all_grabbed2free(void);
1814+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1815+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1816+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1817+void grabbed2flush_reserved(__u64 count);
1818+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1819+ reiser4_block_nr * start,
1820+ reiser4_block_nr * len, reiser4_ba_flags_t flags);
1821+int reiser4_dealloc_blocks(const reiser4_block_nr *,
1822+ const reiser4_block_nr *,
1823+ block_stage_t, reiser4_ba_flags_t flags);
1824+
1825+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1826+ reiser4_block_nr * start,
1827+ reiser4_ba_flags_t flags)
1828+{
1829+ reiser4_block_nr one = 1;
1830+ return reiser4_alloc_blocks(hint, start, &one, flags);
1831+}
1832+
1833+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1834+ block_stage_t stage,
1835+ reiser4_ba_flags_t flags)
1836+{
1837+ const reiser4_block_nr one = 1;
1838+ return reiser4_dealloc_blocks(block, &one, stage, flags);
1839+}
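
For illustration only (not part of the patch): the "free -> grabbed -> fake_allocated -> used" comment above summarizes the intended lifecycle, and the declarations above are enough to sketch it. The helper below is hypothetical, ignores the transaction context a real caller must hold, and exists only to show how grabbing and allocation fit together.

static int example_allocate_one_block(reiser4_block_nr *result)
{
        reiser4_blocknr_hint hint;
        int ret;

        /* free -> grabbed: reserve one block of free space */
        ret = reiser4_grab_space(1, BA_CAN_COMMIT);
        if (ret != 0)
                return ret;

        reiser4_blocknr_hint_init(&hint);
        hint.block_stage = BLOCK_GRABBED;

        /* grabbed -> used: map the reservation to a real disk block */
        ret = reiser4_alloc_block(&hint, result, BA_FORMATTED | BA_PERMANENT);
        reiser4_blocknr_hint_done(&hint);
        return ret;
}
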
1840+
1841+#define reiser4_grab_space_force(count, flags) \
1842+ reiser4_grab_space(count, flags | BA_FORCE)
1843+
1844+extern void grabbed2free_mark(__u64 mark);
1845+extern int reiser4_grab_reserved(struct super_block *,
1846+ __u64, reiser4_ba_flags_t);
1847+extern void reiser4_release_reserved(struct super_block *super);
1848+
1849+/* grabbed -> fake_allocated */
1850+
1851+/* fake_allocated -> used */
1852+
1853+/* used -> fake_allocated -> grabbed -> free */
1854+
1855+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1856+
1857+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1858+
1859+extern void grabbed2cluster_reserved(int count);
1860+extern void cluster_reserved2grabbed(int count);
1861+extern void cluster_reserved2free(int count);
1862+
1863+extern int reiser4_check_block_counters(const struct super_block *);
1864+
1865+#if REISER4_DEBUG
1866+
1867+extern void reiser4_check_block(const reiser4_block_nr *, int);
1868+
1869+#else
1870+
1871+# define reiser4_check_block(beg, val) noop
1872+
1873+#endif
1874+
1875+extern int reiser4_pre_commit_hook(void);
1876+extern void reiser4_post_commit_hook(void);
1877+extern void reiser4_post_write_back_hook(void);
1878+
1879+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
1880+
1881+/* Make Linus happy.
1882+ Local variables:
1883+ c-indentation-style: "K&R"
1884+ mode-name: "LC"
1885+ c-basic-offset: 8
1886+ tab-width: 8
1887+ fill-column: 120
1888+ End:
1889+*/
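
For illustration only (not part of the patch): the masks at the top of this header encode fake block numbers in the two most significant bits, so classification is plain bit arithmetic. The helper below is hypothetical; the real checks live in reiser4_blocknr_is_fake() and plugin/space/bitmap.c.

enum example_blocknr_kind { EX_REAL, EX_UNALLOCATED, EX_BITMAP_SHADOW };

static enum example_blocknr_kind example_classify(reiser4_block_nr blk)
{
        if (!(blk & REISER4_FAKE_BLOCKNR_BIT_MASK))
                return EX_REAL;         /* high bit clear: a real on-disk block */
        if ((blk & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
            REISER4_UNALLOCATED_STATUS_VALUE)
                return EX_UNALLOCATED;  /* top two bits 11: unallocated node */
        return EX_BITMAP_SHADOW;        /* top two bits 10: working bitmap block */
}
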
1890diff -urN linux-2.6.20.orig/fs/reiser4/blocknrset.c linux-2.6.20/fs/reiser4/blocknrset.c
1891--- linux-2.6.20.orig/fs/reiser4/blocknrset.c 1970-01-01 03:00:00.000000000 +0300
1892+++ linux-2.6.20/fs/reiser4/blocknrset.c 2007-05-06 14:50:43.686971975 +0400
1893@@ -0,0 +1,368 @@
1894+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1895+
1896+/* This file contains code for various block number sets used by the atom to
1897+ track the deleted set and wandered block mappings. */
1898+
1899+#include "debug.h"
1900+#include "dformat.h"
1901+#include "txnmgr.h"
1902+#include "context.h"
1903+
1904+#include <linux/slab.h>
1905+
1906+/* The proposed data structure for storing unordered block number sets is a
1907+   list of elements, each of which contains an array of block numbers and/or an
1908+   array of block number pairs. Such an element, called blocknr_set_entry, stores
1909+   single block numbers from the beginning of the data field (char data[...]) and
1910+   pairs (extents) from its end. The ->nr_singles and ->nr_pairs fields
1911+   count the numbers of single blocks and pairs.
1912+
1913+ +------------------- blocknr_set_entry->data ------------------+
1914+ |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1915+ +------------------------------------------------------------+
1916+
1917+ When current blocknr_set_entry is full, allocate a new one. */
1918+
1919+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
1920+ * set (single blocks and block extents), in that case blocknr pair represent an
1921+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
1922+ * there represent a (real block) -> (wandered block) mapping. */
1923+
1924+/* Protection: blocknr sets belong to reiser4 atom, and
1925+ * their modifications are performed with the atom lock held */
1926+
1927+typedef struct blocknr_pair blocknr_pair;
1928+
1929+/* The total size of a blocknr_set_entry. */
1930+#define BLOCKNR_SET_ENTRY_SIZE 128
1931+
1932+/* The number of block numbers that fit in the blocknr data area. */
1933+#define BLOCKNR_SET_ENTRIES_NUMBER \
1934+ ((BLOCKNR_SET_ENTRY_SIZE - \
1935+ 2 * sizeof (unsigned) - \
1936+ sizeof(struct list_head)) / \
1937+ sizeof(reiser4_block_nr))
1938+
1939+/* An entry of the blocknr_set */
1940+struct blocknr_set_entry {
1941+ unsigned nr_singles;
1942+ unsigned nr_pairs;
1943+ struct list_head link;
1944+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1945+};
1946+
1947+/* A pair of blocks as recorded in the blocknr_set_entry data. */
1948+struct blocknr_pair {
1949+ reiser4_block_nr a;
1950+ reiser4_block_nr b;
1951+};
1952+
1953+/* Return the number of blocknr slots available in a blocknr_set_entry. */
1954+/* Audited by: green(2002.06.11) */
1955+static unsigned bse_avail(blocknr_set_entry * bse)
1956+{
1957+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1958+
1959+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1960+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1961+
1962+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
1963+}
1964+
1965+/* Initialize a blocknr_set_entry. */
1966+static void bse_init(blocknr_set_entry *bse)
1967+{
1968+ bse->nr_singles = 0;
1969+ bse->nr_pairs = 0;
1970+ INIT_LIST_HEAD(&bse->link);
1971+}
1972+
1973+/* Allocate and initialize a blocknr_set_entry. */
1974+/* Audited by: green(2002.06.11) */
1975+static blocknr_set_entry *bse_alloc(void)
1976+{
1977+ blocknr_set_entry *e;
1978+
1979+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
1980+ reiser4_ctx_gfp_mask_get())) == NULL)
1981+ return NULL;
1982+
1983+ bse_init(e);
1984+
1985+ return e;
1986+}
1987+
1988+/* Free a blocknr_set_entry. */
1989+/* Audited by: green(2002.06.11) */
1990+static void bse_free(blocknr_set_entry * bse)
1991+{
1992+ kfree(bse);
1993+}
1994+
1995+/* Add a block number to a blocknr_set_entry */
1996+/* Audited by: green(2002.06.11) */
1997+static void
1998+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
1999+{
2000+ assert("jmacd-5099", bse_avail(bse) >= 1);
2001+
2002+ bse->entries[bse->nr_singles++] = *block;
2003+}
2004+
2005+/* Get a pair of block numbers */
2006+/* Audited by: green(2002.06.11) */
2007+static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2008+{
2009+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2010+
2011+ return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2012+ 2 * (pno + 1));
2013+}
2014+
2015+/* Add a pair of block numbers to a blocknr_set_entry */
2016+/* Audited by: green(2002.06.11) */
2017+static void
2018+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2019+ const reiser4_block_nr * b)
2020+{
2021+ blocknr_pair *pair;
2022+
2023+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2024+
2025+ pair = bse_get_pair(bse, bse->nr_pairs++);
2026+
2027+ pair->a = *a;
2028+ pair->b = *b;
2029+}
2030+
2031+/* Add either a block or pair of blocks to the block number set. The first
2032+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2033+ @b is non-NULL a pair is added. The block number set belongs to atom, and
2034+ the call is made with the atom lock held. There may not be enough space in
2035+ the current blocknr_set_entry. If new_bsep points to a non-NULL
2036+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2037+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
2038+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
2039+ returned with the atom unlocked for the operation to be tried again. If
2040+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2041+ used during the call, it will be freed automatically. */
2042+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2043+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2044+ const reiser4_block_nr *b)
2045+{
2046+ blocknr_set_entry *bse;
2047+ unsigned entries_needed;
2048+
2049+ assert("jmacd-5101", a != NULL);
2050+
2051+ entries_needed = (b == NULL) ? 1 : 2;
2052+ if (list_empty(bset) ||
2053+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2054+ /* See if a bse was previously allocated. */
2055+ if (*new_bsep == NULL) {
2056+ spin_unlock_atom(atom);
2057+ *new_bsep = bse_alloc();
2058+ return (*new_bsep != NULL) ? -E_REPEAT :
2059+ RETERR(-ENOMEM);
2060+ }
2061+
2062+ /* Put it on the head of the list. */
2063+ list_add(&((*new_bsep)->link), bset);
2064+
2065+ *new_bsep = NULL;
2066+ }
2067+
2068+ /* Add the single or pair. */
2069+ bse = list_entry(bset->next, blocknr_set_entry, link);
2070+ if (b == NULL) {
2071+ bse_put_single(bse, a);
2072+ } else {
2073+ bse_put_pair(bse, a, b);
2074+ }
2075+
2076+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2077+ if (*new_bsep != NULL) {
2078+ bse_free(*new_bsep);
2079+ *new_bsep = NULL;
2080+ }
2081+
2082+ return 0;
2083+}
2084+
2085+/* Add an extent to the block set. If the length is 1, it is treated as a
2086+ single block (e.g., reiser4_set_add_block). */
2087+/* Audited by: green(2002.06.11) */
2088+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2089+ kmalloc might schedule. The only exception is atom spinlock, which is
2090+ properly freed. */
2091+int
2092+blocknr_set_add_extent(txn_atom * atom,
2093+ struct list_head * bset,
2094+ blocknr_set_entry ** new_bsep,
2095+ const reiser4_block_nr * start,
2096+ const reiser4_block_nr * len)
2097+{
2098+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2099+ return blocknr_set_add(atom, bset, new_bsep, start,
2100+ *len == 1 ? NULL : len);
2101+}
2102+
2103+/* Add a block pair to the block set. It adds exactly a pair, which is checked
2104+ * by an assertion that both arguments are not null.*/
2105+/* Audited by: green(2002.06.11) */
2106+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2107+ kmalloc might schedule. The only exception is atom spinlock, which is
2108+ properly freed. */
2109+int
2110+blocknr_set_add_pair(txn_atom * atom,
2111+ struct list_head * bset,
2112+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2113+ const reiser4_block_nr * b)
2114+{
2115+ assert("jmacd-5103", a != NULL && b != NULL);
2116+ return blocknr_set_add(atom, bset, new_bsep, a, b);
2117+}
2118+
2119+/* Initialize a blocknr_set. */
2120+void blocknr_set_init(struct list_head *bset)
2121+{
2122+ INIT_LIST_HEAD(bset);
2123+}
2124+
2125+/* Release the entries of a blocknr_set. */
2126+void blocknr_set_destroy(struct list_head *bset)
2127+{
2128+ blocknr_set_entry *bse;
2129+
2130+ while (!list_empty(bset)) {
2131+ bse = list_entry(bset->next, blocknr_set_entry, link);
2132+ list_del_init(&bse->link);
2133+ bse_free(bse);
2134+ }
2135+}
2136+
2137+/* Merge blocknr_set entries out of @from into @into. */
2138+/* Audited by: green(2002.06.11) */
2139+/* Auditor comments: This merge does not know if merged sets contain
2140+ blocks pairs (As for wandered sets) or extents, so it cannot really merge
2141+   block pairs (as for wandered sets) or extents, so it cannot really merge
2142+   overlapping ranges if there are any. So I believe it may lead to
2143+   some blocks being present several times in one blocknr_set. To help
2144+   debug such problems it might help to check for duplicate entries during
2145+   actual processing of this set. Testing this kind of stuff right here is
2146+   also complicated by the fact that these sets are not sorted, and going
2147+   through the whole set on each element addition would be a CPU-heavy task */
2148+{
2149+ blocknr_set_entry *bse_into = NULL;
2150+
2151+ /* If @from is empty, no work to perform. */
2152+ if (list_empty(from))
2153+ return;
2154+ /* If @into is not empty, try merging partial-entries. */
2155+ if (!list_empty(into)) {
2156+
2157+ /* Neither set is empty; pop the front two members and try to combine them. */
2158+ blocknr_set_entry *bse_from;
2159+ unsigned into_avail;
2160+
2161+ bse_into = list_entry(into->next, blocknr_set_entry, link);
2162+ list_del_init(&bse_into->link);
2163+ bse_from = list_entry(from->next, blocknr_set_entry, link);
2164+ list_del_init(&bse_from->link);
2165+
2166+ /* Combine singles. */
2167+ for (into_avail = bse_avail(bse_into);
2168+ into_avail != 0 && bse_from->nr_singles != 0;
2169+ into_avail -= 1) {
2170+ bse_put_single(bse_into,
2171+ &bse_from->entries[--bse_from->
2172+ nr_singles]);
2173+ }
2174+
2175+ /* Combine pairs. */
2176+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
2177+ into_avail -= 2) {
2178+ blocknr_pair *pair =
2179+ bse_get_pair(bse_from, --bse_from->nr_pairs);
2180+ bse_put_pair(bse_into, &pair->a, &pair->b);
2181+ }
2182+
2183+ /* If bse_from is empty, delete it now. */
2184+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2185+ bse_free(bse_from);
2186+ } else {
2187+ /* Otherwise, bse_into is full or nearly full (e.g.,
2188+ it could have one slot avail and bse_from has one
2189+ pair left). Push it back onto the list. bse_from
2190+ becomes bse_into, which will be the new partial. */
2191+ list_add(&bse_into->link, into);
2192+ bse_into = bse_from;
2193+ }
2194+ }
2195+
2196+ /* Splice lists together. */
2197+ list_splice_init(from, into->prev);
2198+
2199+ /* Add the partial entry back to the head of the list. */
2200+ if (bse_into != NULL)
2201+ list_add(&bse_into->link, into);
2202+}
2203+
2204+/* Iterate over all blocknr set elements. */
2205+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2206+ blocknr_set_actor_f actor, void *data, int delete)
2207+{
2208+
2209+ blocknr_set_entry *entry;
2210+
2211+ assert("zam-429", atom != NULL);
2212+ assert("zam-430", atom_is_protected(atom));
2213+ assert("zam-431", bset != 0);
2214+ assert("zam-432", actor != NULL);
2215+
2216+ entry = list_entry(bset->next, blocknr_set_entry, link);
2217+ while (bset != &entry->link) {
2218+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2219+ unsigned int i;
2220+ int ret;
2221+
2222+ for (i = 0; i < entry->nr_singles; i++) {
2223+ ret = actor(atom, &entry->entries[i], NULL, data);
2224+
2225+ /* We can't break a loop if delete flag is set. */
2226+ if (ret != 0 && !delete)
2227+ return ret;
2228+ }
2229+
2230+ for (i = 0; i < entry->nr_pairs; i++) {
2231+ struct blocknr_pair *ab;
2232+
2233+ ab = bse_get_pair(entry, i);
2234+
2235+ ret = actor(atom, &ab->a, &ab->b, data);
2236+
2237+ if (ret != 0 && !delete)
2238+ return ret;
2239+ }
2240+
2241+ if (delete) {
2242+ list_del(&entry->link);
2243+ bse_free(entry);
2244+ }
2245+
2246+ entry = tmp;
2247+ }
2248+
2249+ return 0;
2250+}
2251+
2252+/*
2253+ * Local variables:
2254+ * c-indentation-style: "K&R"
2255+ * mode-name: "LC"
2256+ * c-basic-offset: 8
2257+ * tab-width: 8
2258+ * fill-column: 79
2259+ * scroll-step: 1
2260+ * End:
2261+ */
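
For illustration only (not part of the patch): with the sizes above, a 128-byte blocknr_set_entry holds (128 - 2 * 4 - 16) / 8 = 13 slots on a typical 64-bit kernel, i.e. up to 13 single blocks or six pairs plus one single. The actor below is hypothetical and mirrors the shape of apply_dset() in block_alloc.c; it only demonstrates the blocknr_set_iterator() calling convention.

static int example_dump_actor(txn_atom *atom UNUSED_ARG,
                              const reiser4_block_nr *a,
                              const reiser4_block_nr *b, void *data UNUSED_ARG)
{
        /* a single block is reported as an extent of length one */
        __u64 len = (b != NULL) ? *b : 1;

        printk(KERN_DEBUG "delete set: start %llu len %llu\n",
               (unsigned long long)*a, (unsigned long long)len);
        return 0;  /* non-zero would stop the walk unless the delete flag is set */
}

/* Usage, with the atom spin-locked as blocknr_set_iterator() asserts:
 *      blocknr_set_iterator(atom, &atom->delete_set, example_dump_actor, NULL, 0);
 */
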
2262diff -urN linux-2.6.20.orig/fs/reiser4/carry.c linux-2.6.20/fs/reiser4/carry.c
2263--- linux-2.6.20.orig/fs/reiser4/carry.c 1970-01-01 03:00:00.000000000 +0300
2264+++ linux-2.6.20/fs/reiser4/carry.c 2007-05-06 14:50:43.686971975 +0400
2265@@ -0,0 +1,1391 @@
2266+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2267+/* Functions to "carry" tree modification(s) upward. */
2268+/* Tree is modified one level at a time. As we modify a level we accumulate a
2269+ set of changes that need to be propagated to the next level. We manage
2270+ node locking such that any searches that collide with carrying are
2271+ restarted, from the root if necessary.
2272+
2273+ Insertion of a new item may result in items being moved among nodes and
2274+ this requires the delimiting key to be updated at the least common parent
2275+ of the nodes modified to preserve search tree invariants. Also, insertion
2276+ may require allocation of a new node. A pointer to the new node has to be
2277+ inserted into some node on the parent level, etc.
2278+
2279+ Tree carrying is meant to be analogous to arithmetic carrying.
2280+
2281+ A carry operation is always associated with some node (&carry_node).
2282+
2283+ Carry process starts with some initial set of operations to be performed
2284+ and an initial set of already locked nodes. Operations are performed one
2285+ by one. Performing each single operation has following possible effects:
2286+
2287+ - content of carry node associated with operation is modified
2288+ - new carry nodes are locked and involved into carry process on this level
2289+ - new carry operations are posted to the next level
2290+
2291+ After all carry operations on this level are done, process is repeated for
2292+ the accumulated sequence on carry operations for the next level. This
2293+ starts by trying to lock (in left to right order) all carry nodes
2294+ associated with carry operations on the parent level. After this, we decide
2295+ whether more nodes are required on the left of already locked set. If so,
2296+ all locks taken on the parent level are released, new carry nodes are
2297+ added, and locking process repeats.
2298+
2299+   It may happen that the balancing process fails owing to an unrecoverable error
2300+   on some of the upper levels of the tree (possible causes are an I/O error,
2301+   failure to allocate a new node, etc.). In this case we should unmount the
2302+   filesystem (rebooting if it is the root filesystem) and possibly advise the use of fsck.
2303+
2304+ USAGE:
2305+
2306+ int some_tree_operation( znode *node, ... )
2307+ {
2308+ // Allocate on a stack pool of carry objects: operations and nodes.
2309+ // Most carry processes will only take objects from here, without
2310+ // dynamic allocation.
2311+
2312+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2313+
2314+ carry_pool pool;
2315+ carry_level lowest_level;
2316+ carry_op *op;
2317+
2318+ init_carry_pool( &pool );
2319+ init_carry_level( &lowest_level, &pool );
2320+
2321+ // operation may be one of:
2322+ // COP_INSERT --- insert new item into node
2323+ // COP_CUT --- remove part of or whole node
2324+ // COP_PASTE --- increase size of item
2325+ // COP_DELETE --- delete pointer from parent node
2326+ // COP_UPDATE --- update delimiting key in least
2327+ // common ancestor of two
2328+
2329+ op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2330+ if( IS_ERR( op ) || ( op == NULL ) ) {
2331+ handle error
2332+ } else {
2333+ // fill in remaining fields in @op, according to carry.h:carry_op
2334+ result = carry( &lowest_level, NULL );
2335+ }
2336+ done_carry_pool( &pool );
2337+ }
2338+
2339+ When you are implementing node plugin method that participates in carry
2340+ (shifting, insertion, deletion, etc.), do the following:
2341+
2342+ int foo_node_method( znode *node, ..., carry_level *todo )
2343+ {
2344+ carry_op *op;
2345+
2346+ ....
2347+
2348+ // note, that last argument to reiser4_post_carry() is non-null
2349+ // here, because @op is to be applied to the parent of @node, rather
2350+ // than to the @node itself as in the previous case.
2351+
2352+ op = node_post_carry( todo, operation, node, 1 );
2353+ // fill in remaining fields in @op, according to carry.h:carry_op
2354+
2355+ ....
2356+
2357+ }
2358+
2359+ BATCHING:
2360+
2361+ One of the main advantages of level-by-level balancing implemented here is
2362+   the ability to batch updates on a parent level and to perform them more
2363+ efficiently as a result.
2364+
2365+ Description To Be Done (TBD).
2366+
2367+ DIFFICULTIES AND SUBTLE POINTS:
2368+
2369+ 1. complex plumbing is required, because:
2370+
2371+ a. effective allocation through pools is needed
2372+
2373+ b. target of operation is not exactly known when operation is
2374+ posted. This is worked around through bitfields in &carry_node and
2375+ logic in lock_carry_node()
2376+
2377+ c. of interaction with locking code: node should be added into sibling
2378+ list when pointer to it is inserted into its parent, which is some time
2379+ after node was created. Between these moments, node is somewhat in
2380+ suspended state and is only registered in the carry lists
2381+
2382+ 2. whole balancing logic is implemented here, in particular, insertion
2383+ logic is coded in make_space().
2384+
2385+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2386+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2387+ (insert_paste()) have to be handled.
2388+
2389+ 4. there is non-trivial interdependency between allocation of new nodes
2390+ and almost everything else. This is mainly due to the (1.c) above. I shall
2391+ write about this later.
2392+
2393+*/
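
For illustration only (not part of the patch): the USAGE sketch in the comment above reflects an older calling convention, while in the code below init_carry_pool() takes a size and returns a pointer, and reiser4_carry() expects three consecutive carry_level queues behind the lowest level (see the size assertion in init_carry_pool() and the "todo = doing + 1" arithmetic in reiser4_carry()). The caller below is a hedged re-sketch under those assumptions; example_post_insert() is hypothetical and elides locking and the operation-specific fields.

static int example_post_insert(znode *node /* must already be write-locked */)
{
        carry_pool *pool;
        carry_level *lowest_level;
        carry_op *op;
        int result;

        /* room for the pool itself plus the doing/todo/done level queues */
        pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
        if (IS_ERR(pool))
                return PTR_ERR(pool);
        lowest_level = (carry_level *) (pool + 1);
        init_carry_level(lowest_level, pool);

        op = reiser4_post_carry(lowest_level, COP_INSERT, node, 0);
        if (IS_ERR(op))
                result = PTR_ERR(op);
        else if (op == NULL)
                result = RETERR(-ENOMEM);
        else {
                /* ... fill in the remaining fields of @op per carry.h:carry_op ... */
                result = reiser4_carry(lowest_level, NULL);
        }
        done_carry_pool(pool);
        return result;
}
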
2394+
2395+#include "forward.h"
2396+#include "debug.h"
2397+#include "key.h"
2398+#include "coord.h"
2399+#include "plugin/item/item.h"
2400+#include "plugin/item/extent.h"
2401+#include "plugin/node/node.h"
2402+#include "jnode.h"
2403+#include "znode.h"
2404+#include "tree_mod.h"
2405+#include "tree_walk.h"
2406+#include "block_alloc.h"
2407+#include "pool.h"
2408+#include "tree.h"
2409+#include "carry.h"
2410+#include "carry_ops.h"
2411+#include "super.h"
2412+#include "reiser4.h"
2413+
2414+#include <linux/types.h>
2415+
2416+/* level locking/unlocking */
2417+static int lock_carry_level(carry_level * level);
2418+static void unlock_carry_level(carry_level * level, int failure);
2419+static void done_carry_level(carry_level * level);
2420+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2421+
2422+int lock_carry_node(carry_level * level, carry_node * node);
2423+int lock_carry_node_tail(carry_node * node);
2424+
2425+/* carry processing proper */
2426+static int carry_on_level(carry_level * doing, carry_level * todo);
2427+
2428+static carry_op *add_op(carry_level * level, pool_ordering order,
2429+ carry_op * reference);
2430+
2431+/* handlers for carry operations. */
2432+
2433+static void fatal_carry_error(carry_level * doing, int ecode);
2434+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2435+
2436+static void print_level(const char *prefix, carry_level * level);
2437+
2438+#if REISER4_DEBUG
2439+typedef enum {
2440+ CARRY_TODO,
2441+ CARRY_DOING
2442+} carry_queue_state;
2443+static int carry_level_invariant(carry_level * level, carry_queue_state state);
2444+#endif
2445+
2446+/* main entry point for tree balancing.
2447+
2448+ Tree carry performs operations from @doing and while doing so accumulates
2449+ information about operations to be performed on the next level ("carried"
2450+ to the parent level). Carried operations are performed, causing possibly
2451+ more operations to be carried upward etc. carry() takes care about
2452+ locking and pinning znodes while operating on them.
2453+
2454+ For usage, see comment at the top of fs/reiser4/carry.c
2455+
2456+*/
2457+int reiser4_carry(carry_level * doing /* set of carry operations to be
2458+ * performed */ ,
2459+ carry_level * done /* set of nodes, already performed
2460+ * at the previous level.
2461+ * NULL in most cases */)
2462+{
2463+ int result = 0;
2464+ /* queue of new requests */
2465+ carry_level *todo;
2466+ ON_DEBUG(STORE_COUNTERS);
2467+
2468+ assert("nikita-888", doing != NULL);
2469+ BUG_ON(done != NULL);
2470+
2471+ todo = doing + 1;
2472+ init_carry_level(todo, doing->pool);
2473+
2474+ /* queue of requests performed on the previous level */
2475+ done = todo + 1;
2476+ init_carry_level(done, doing->pool);
2477+
2478+ /* iterate until there is nothing more to do */
2479+ while (result == 0 && doing->ops_num > 0) {
2480+ carry_level *tmp;
2481+
2482+ /* at this point @done is locked. */
2483+ /* repeat lock/do/unlock while
2484+
2485+ (1) lock_carry_level() fails due to deadlock avoidance, or
2486+
2487+ (2) carry_on_level() decides that more nodes have to
2488+ be involved.
2489+
2490+ (3) some unexpected error occurred while balancing on the
2491+ upper levels. In this case all changes are rolled back.
2492+
2493+ */
2494+ while (1) {
2495+ result = lock_carry_level(doing);
2496+ if (result == 0) {
2497+ /* perform operations from @doing and
2498+ accumulate new requests in @todo */
2499+ result = carry_on_level(doing, todo);
2500+ if (result == 0)
2501+ break;
2502+ else if (result != -E_REPEAT ||
2503+ !doing->restartable) {
2504+ warning("nikita-1043",
2505+ "Fatal error during carry: %i",
2506+ result);
2507+ print_level("done", done);
2508+ print_level("doing", doing);
2509+ print_level("todo", todo);
2510+ /* do some rough stuff like aborting
2511+ all pending transcrashes and thus
2512+ pushing tree back to the consistent
2513+    state. Alternatively, just panic.
2514+ */
2515+ fatal_carry_error(doing, result);
2516+ return result;
2517+ }
2518+ } else if (result != -E_REPEAT) {
2519+ fatal_carry_error(doing, result);
2520+ return result;
2521+ }
2522+ unlock_carry_level(doing, 1);
2523+ }
2524+ /* at this point @done can be safely unlocked */
2525+ done_carry_level(done);
2526+
2527+ /* cyclically shift queues */
2528+ tmp = done;
2529+ done = doing;
2530+ doing = todo;
2531+ todo = tmp;
2532+ init_carry_level(todo, doing->pool);
2533+
2534+ /* give other threads chance to run */
2535+ reiser4_preempt_point();
2536+ }
2537+ done_carry_level(done);
2538+
2539+ /* all counters, but x_refs should remain the same. x_refs can change
2540+ owing to transaction manager */
2541+ ON_DEBUG(CHECK_COUNTERS);
2542+ return result;
2543+}
2544+
2545+/* perform carry operations on given level.
2546+
2547+ Optimizations proposed by pooh:
2548+
2549+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2550+ required;
2551+
2552+ (2) unlock node if there are no more operations to be performed upon it and
2553+ node didn't add any operation to @todo. This can be implemented by
2554+   attaching to each node two counters: a counter of operations working on this
2555+   node and a counter of operations carried upward from this node.
2556+
2557+*/
2558+static int carry_on_level(carry_level * doing /* queue of carry operations to
2559+ * do on this level */ ,
2560+ carry_level * todo /* queue where new carry
2561+ * operations to be performed on
2562+ * the * parent level are
2563+ * accumulated during @doing
2564+ * processing. */ )
2565+{
2566+ int result;
2567+ int (*f) (carry_op *, carry_level *, carry_level *);
2568+ carry_op *op;
2569+ carry_op *tmp_op;
2570+
2571+ assert("nikita-1034", doing != NULL);
2572+ assert("nikita-1035", todo != NULL);
2573+
2574+ /* @doing->nodes are locked. */
2575+
2576+ /* This function can be split into two phases: analysis and modification.
2577+
2578+ Analysis calculates precisely what items should be moved between
2579+ nodes. This information is gathered in some structures attached to
2580+ each carry_node in a @doing queue. Analysis also determines whether
2581+ new nodes are to be allocated etc.
2582+
2583+ After analysis is completed, actual modification is performed. Here
2584+ we can take advantage of "batch modification": if there are several
2585+ operations acting on the same node, modifications can be performed
2586+ more efficiently when batched together.
2587+
2588+ Above is an optimization left for the future.
2589+ */
2590+ /* Important, but delayed optimization: it's possible to batch
2591+ operations together and perform them more efficiently as a
2592+ result. For example, deletion of several neighboring items from a
2593+ node can be converted to a single ->cut() operation.
2594+
2595+ Before processing queue, it should be scanned and "mergeable"
2596+ operations merged.
2597+ */
2598+ result = 0;
2599+ for_all_ops(doing, op, tmp_op) {
2600+ carry_opcode opcode;
2601+
2602+ assert("nikita-1041", op != NULL);
2603+ opcode = op->op;
2604+ assert("nikita-1042", op->op < COP_LAST_OP);
2605+ f = op_dispatch_table[op->op].handler;
2606+ result = f(op, doing, todo);
2607+ /* locking can fail with -E_REPEAT. Any different error is fatal
2608+ and will be handled by fatal_carry_error() sledgehammer.
2609+ */
2610+ if (result != 0)
2611+ break;
2612+ }
2613+ if (result == 0) {
2614+ carry_plugin_info info;
2615+ carry_node *scan;
2616+ carry_node *tmp_scan;
2617+
2618+ info.doing = doing;
2619+ info.todo = todo;
2620+
2621+ assert("nikita-3002",
2622+ carry_level_invariant(doing, CARRY_DOING));
2623+ for_all_nodes(doing, scan, tmp_scan) {
2624+ znode *node;
2625+
2626+ node = reiser4_carry_real(scan);
2627+ assert("nikita-2547", node != NULL);
2628+ if (node_is_empty(node)) {
2629+ result =
2630+ node_plugin_by_node(node)->
2631+ prepare_removal(node, &info);
2632+ if (result != 0)
2633+ break;
2634+ }
2635+ }
2636+ }
2637+ return result;
2638+}
2639+
2640+/* post carry operation
2641+
2642+ This is main function used by external carry clients: node layout plugins
2643+ and tree operations to create new carry operation to be performed on some
2644+ level.
2645+
2646+ New operation will be included in the @level queue. To actually perform it,
2647+ call carry( level, ... ). This function takes write lock on @node. Carry
2648+ manages all its locks by itself, don't worry about this.
2649+
2650+ This function adds operation and node at the end of the queue. It is up to
2651+ caller to guarantee proper ordering of node queue.
2652+
2653+*/
2654+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2655+ * is to be posted at */ ,
2656+ carry_opcode op /* opcode of operation */ ,
2657+ znode * node /* node on which this operation
2658+ * will operate */ ,
2659+ int apply_to_parent_p /* whether operation will
2660+ * operate directly on @node
2661+ * or on it parent. */)
2662+{
2663+ carry_op *result;
2664+ carry_node *child;
2665+
2666+ assert("nikita-1046", level != NULL);
2667+ assert("nikita-1788", znode_is_write_locked(node));
2668+
2669+ result = add_op(level, POOLO_LAST, NULL);
2670+ if (IS_ERR(result))
2671+ return result;
2672+ child = reiser4_add_carry(level, POOLO_LAST, NULL);
2673+ if (IS_ERR(child)) {
2674+ reiser4_pool_free(&level->pool->op_pool, &result->header);
2675+ return (carry_op *) child;
2676+ }
2677+ result->node = child;
2678+ result->op = op;
2679+ child->parent = apply_to_parent_p;
2680+ if (ZF_ISSET(node, JNODE_ORPHAN))
2681+ child->left_before = 1;
2682+ child->node = node;
2683+ return result;
2684+}
2685+
2686+/* initialize carry queue */
2687+void init_carry_level(carry_level * level /* level to initialize */ ,
2688+ carry_pool * pool /* pool @level will allocate objects
2689+ * from */ )
2690+{
2691+ assert("nikita-1045", level != NULL);
2692+ assert("nikita-967", pool != NULL);
2693+
2694+ memset(level, 0, sizeof *level);
2695+ level->pool = pool;
2696+
2697+ INIT_LIST_HEAD(&level->nodes);
2698+ INIT_LIST_HEAD(&level->ops);
2699+}
2700+
2701+/* allocate carry pool and initialize pools within queue */
2702+carry_pool *init_carry_pool(int size)
2703+{
2704+ carry_pool *pool;
2705+
2706+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2707+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2708+ if (pool == NULL)
2709+ return ERR_PTR(RETERR(-ENOMEM));
2710+
2711+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2712+ (char *)pool->op);
2713+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2714+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2715+ return pool;
2716+}
2717+
2718+/* finish with queue pools */
2719+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2720+{
2721+ reiser4_done_pool(&pool->op_pool);
2722+ reiser4_done_pool(&pool->node_pool);
2723+ kfree(pool);
2724+}
2725+
2726+/* add new carry node to the @level.
2727+
2728+ Returns pointer to the new carry node allocated from pool. It's up to
2729+ callers to maintain proper order in the @level. Assumption is that if carry
2730+   nodes on one level are already sorted and modifications are performed from
2731+ left to right, carry nodes added on the parent level will be ordered
2732+ automatically. To control ordering use @order and @reference parameters.
2733+
2734+*/
2735+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2736+ * node to */ ,
2737+ pool_ordering order /* where to insert:
2738+ * at the beginning of
2739+ * @level,
2740+ * before @reference,
2741+ * after @reference,
2742+ * at the end of @level
2743+ */ ,
2744+ carry_node * reference/* reference node for
2745+ * insertion */)
2746+{
2747+ ON_DEBUG(carry_node * orig_ref = reference);
2748+
2749+ if (order == POOLO_BEFORE) {
2750+ reference = find_left_carry(reference, level);
2751+ if (reference == NULL)
2752+ reference = list_entry(level->nodes.next, carry_node,
2753+ header.level_linkage);
2754+ else
2755+ reference = list_entry(reference->header.level_linkage.next,
2756+ carry_node, header.level_linkage);
2757+ } else if (order == POOLO_AFTER) {
2758+ reference = find_right_carry(reference, level);
2759+ if (reference == NULL)
2760+ reference = list_entry(level->nodes.prev, carry_node,
2761+ header.level_linkage);
2762+ else
2763+ reference = list_entry(reference->header.level_linkage.prev,
2764+ carry_node, header.level_linkage);
2765+ }
2766+ assert("nikita-2209",
2767+ ergo(orig_ref != NULL,
2768+ reiser4_carry_real(reference) ==
2769+ reiser4_carry_real(orig_ref)));
2770+ return reiser4_add_carry(level, order, reference);
2771+}
2772+
2773+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
2774+ * to */ ,
2775+ pool_ordering order /* where to insert: at the
2776+ * beginning of @level, before
2777+ * @reference, after @reference,
2778+ * at the end of @level */ ,
2779+ carry_node * reference /* reference node for
2780+ * insertion */ )
2781+{
2782+ carry_node *result;
2783+
2784+ result =
2785+ (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2786+ &level->nodes,
2787+ order, &reference->header);
2788+ if (!IS_ERR(result) && (result != NULL))
2789+ ++level->nodes_num;
2790+ return result;
2791+}
2792+
2793+/* add new carry operation to the @level.
2794+
2795+ Returns pointer to the new carry operations allocated from pool. It's up to
2796+ callers to maintain proper order in the @level. To control ordering use
2797+ @order and @reference parameters.
2798+
2799+*/
2800+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2801+ pool_ordering order /* where to insert: at the beginning of
2802+ * @level, before @reference, after
2803+ * @reference, at the end of @level */ ,
2804+ carry_op *
2805+ reference /* reference node for insertion */ )
2806+{
2807+ carry_op *result;
2808+
2809+ result =
2810+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2811+ order, &reference->header);
2812+ if (!IS_ERR(result) && (result != NULL))
2813+ ++level->ops_num;
2814+ return result;
2815+}
2816+
2817+/* Return node on the right of which @node was created.
2818+
2819+ Each node is created on the right of some existing node (or it is new root,
2820+ which is special case not handled here).
2821+
2822+ @node is new node created on some level, but not yet inserted into its
2823+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2824+
2825+*/
2826+static carry_node *find_begetting_brother(carry_node * node /* node to start search
2827+ * from */ ,
2828+ carry_level * kin UNUSED_ARG /* level to
2829+ * scan */ )
2830+{
2831+ carry_node *scan;
2832+
2833+ assert("nikita-1614", node != NULL);
2834+ assert("nikita-1615", kin != NULL);
2835+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2836+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2837+ ZF_ISSET(reiser4_carry_real(node),
2838+ JNODE_ORPHAN)));
2839+ for (scan = node;;
2840+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
2841+ header.level_linkage)) {
2842+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2843+ if ((scan->node != node->node) &&
2844+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2845+ assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2846+ break;
2847+ }
2848+ }
2849+ return scan;
2850+}
2851+
2852+static cmp_t
2853+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2854+{
2855+ assert("nikita-2199", n1 != NULL);
2856+ assert("nikita-2200", n2 != NULL);
2857+
2858+ if (n1 == n2)
2859+ return EQUAL_TO;
2860+ while (1) {
2861+ n1 = carry_node_next(n1);
2862+ if (carry_node_end(level, n1))
2863+ return GREATER_THAN;
2864+ if (n1 == n2)
2865+ return LESS_THAN;
2866+ }
2867+ impossible("nikita-2201", "End of level reached");
2868+}
2869+
2870+carry_node *find_carry_node(carry_level * level, const znode * node)
2871+{
2872+ carry_node *scan;
2873+ carry_node *tmp_scan;
2874+
2875+ assert("nikita-2202", level != NULL);
2876+ assert("nikita-2203", node != NULL);
2877+
2878+ for_all_nodes(level, scan, tmp_scan) {
2879+ if (reiser4_carry_real(scan) == node)
2880+ return scan;
2881+ }
2882+ return NULL;
2883+}
2884+
2885+znode *reiser4_carry_real(const carry_node * node)
2886+{
2887+ assert("nikita-3061", node != NULL);
2888+
2889+ return node->lock_handle.node;
2890+}
2891+
2892+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2893+ const znode * node)
2894+{
2895+ carry_node *base;
2896+ carry_node *scan;
2897+ carry_node *tmp_scan;
2898+ carry_node *proj;
2899+
2900+ base = find_carry_node(doing, node);
2901+ assert("nikita-2204", base != NULL);
2902+
2903+ for_all_nodes(todo, scan, tmp_scan) {
2904+ proj = find_carry_node(doing, scan->node);
2905+ assert("nikita-2205", proj != NULL);
2906+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2907+ break;
2908+ }
2909+ return scan;
2910+}
2911+
2912+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2913+ znode * node)
2914+{
2915+ carry_node *reference;
2916+
2917+ assert("nikita-2994", doing != NULL);
2918+ assert("nikita-2995", todo != NULL);
2919+ assert("nikita-2996", node != NULL);
2920+
2921+ reference = insert_carry_node(doing, todo, node);
2922+ assert("nikita-2997", reference != NULL);
2923+
2924+ return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2925+}
2926+
2927+/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2928+ This function is different from reiser4_post_carry() in that it finds proper
2929+ place to insert node in the queue. */
2930+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
2931+ * passed down to node
2932+ * plugin */ ,
2933+ carry_opcode op /* opcode of operation */ ,
2934+ znode * node /* node on which this
2935+ * operation will operate */ ,
2936+ int apply_to_parent_p /* whether operation will
2937+ * operate directly on @node
2938+ * or on it parent. */ )
2939+{
2940+ carry_op *result;
2941+ carry_node *child;
2942+
2943+ assert("nikita-2207", info != NULL);
2944+ assert("nikita-2208", info->todo != NULL);
2945+
2946+ if (info->doing == NULL)
2947+ return reiser4_post_carry(info->todo, op, node,
2948+ apply_to_parent_p);
2949+
2950+ result = add_op(info->todo, POOLO_LAST, NULL);
2951+ if (IS_ERR(result))
2952+ return result;
2953+ child = add_carry_atplace(info->doing, info->todo, node);
2954+ if (IS_ERR(child)) {
2955+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2956+ return (carry_op *) child;
2957+ }
2958+ result->node = child;
2959+ result->op = op;
2960+ child->parent = apply_to_parent_p;
2961+ if (ZF_ISSET(node, JNODE_ORPHAN))
2962+ child->left_before = 1;
2963+ child->node = node;
2964+ return result;
2965+}
2966+
2967+/* lock all carry nodes in @level */
2968+static int lock_carry_level(carry_level * level /* level to lock */ )
2969+{
2970+ int result;
2971+ carry_node *node;
2972+ carry_node *tmp_node;
2973+
2974+ assert("nikita-881", level != NULL);
2975+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
2976+
2977+ /* lock nodes from left to right */
2978+ result = 0;
2979+ for_all_nodes(level, node, tmp_node) {
2980+ result = lock_carry_node(level, node);
2981+ if (result != 0)
2982+ break;
2983+ }
2984+ return result;
2985+}
2986+
2987+/* Synchronize delimiting keys between @node and its left neighbor.
2988+
2989+ To reduce contention on dk key and simplify carry code, we synchronize
2990+ delimiting keys only when carry ultimately leaves tree level (carrying
2991+ changes upward) and unlocks nodes at this level.
2992+
2993+ This function first finds left neighbor of @node and then updates left
2994+   neighbor's right delimiting key to coincide with the least key in @node.
2995+
2996+*/
2997+
2998+ON_DEBUG(extern atomic_t delim_key_version;
2999+ )
3000+
3001+static void sync_dkeys(znode * spot /* node to update */ )
3002+{
3003+ reiser4_key pivot;
3004+ reiser4_tree *tree;
3005+
3006+ assert("nikita-1610", spot != NULL);
3007+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3008+
3009+ tree = znode_get_tree(spot);
3010+ read_lock_tree(tree);
3011+ write_lock_dk(tree);
3012+
3013+ assert("nikita-2192", znode_is_loaded(spot));
3014+
3015+ /* sync left delimiting key of @spot with key in its leftmost item */
3016+ if (node_is_empty(spot))
3017+ pivot = *znode_get_rd_key(spot);
3018+ else
3019+ leftmost_key_in_node(spot, &pivot);
3020+
3021+ znode_set_ld_key(spot, &pivot);
3022+
3023+ /* there can be sequence of empty nodes pending removal on the left of
3024+ @spot. Scan them and update their left and right delimiting keys to
3025+ match left delimiting key of @spot. Also, update right delimiting
3026+ key of first non-empty left neighbor.
3027+ */
3028+ while (1) {
3029+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3030+ break;
3031+
3032+ spot = spot->left;
3033+ if (spot == NULL)
3034+ break;
3035+
3036+ znode_set_rd_key(spot, &pivot);
3037+ /* don't sink into the domain of another balancing */
3038+ if (!znode_is_write_locked(spot))
3039+ break;
3040+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3041+ znode_set_ld_key(spot, &pivot);
3042+ else
3043+ break;
3044+ }
3045+
3046+ write_unlock_dk(tree);
3047+ read_unlock_tree(tree);
3048+}
3049+
3050+/* unlock all carry nodes in @level */
3051+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3052+ int failure /* true if unlocking owing to
3053+ * failure */ )
3054+{
3055+ carry_node *node;
3056+ carry_node *tmp_node;
3057+
3058+ assert("nikita-889", level != NULL);
3059+
3060+ if (!failure) {
3061+ znode *spot;
3062+
3063+ spot = NULL;
3064+ /* update delimiting keys */
3065+ for_all_nodes(level, node, tmp_node) {
3066+ if (reiser4_carry_real(node) != spot) {
3067+ spot = reiser4_carry_real(node);
3068+ sync_dkeys(spot);
3069+ }
3070+ }
3071+ }
3072+
3073+ /* nodes can be unlocked in arbitrary order. In preemptible
3074+ environment it's better to unlock in reverse order of locking,
3075+ though.
3076+ */
3077+ for_all_nodes_back(level, node, tmp_node) {
3078+ /* all allocated nodes should be already linked to their
3079+ parents at this moment. */
3080+ assert("nikita-1631",
3081+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3082+ JNODE_ORPHAN)));
3083+ ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3084+ unlock_carry_node(level, node, failure);
3085+ }
3086+ level->new_root = NULL;
3087+}
3088+
3089+/* finish with @level
3090+
3091+ Unlock nodes and release all allocated resources */
3092+static void done_carry_level(carry_level * level /* level to finish */ )
3093+{
3094+ carry_node *node;
3095+ carry_node *tmp_node;
3096+ carry_op *op;
3097+ carry_op *tmp_op;
3098+
3099+ assert("nikita-1076", level != NULL);
3100+
3101+ unlock_carry_level(level, 0);
3102+ for_all_nodes(level, node, tmp_node) {
3103+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3104+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3105+ reiser4_pool_free(&level->pool->node_pool, &node->header);
3106+ }
3107+ for_all_ops(level, op, tmp_op)
3108+ reiser4_pool_free(&level->pool->op_pool, &op->header);
3109+}
3110+
3111+/* helper function to complete locking of carry node
3112+
3113+ Finish locking of carry node. There are several ways in which new carry
3114+ node can be added into carry level and locked. Normal is through
3115+ lock_carry_node(), but also from find_{left|right}_neighbor(). This
3116+ function factors out common final part of all locking scenarios. It
3117+ supposes that @node -> lock_handle is lock handle for lock just taken and
3118+ fills ->real_node from this lock handle.
3119+
3120+*/
3121+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3122+{
3123+ assert("nikita-1052", node != NULL);
3124+ assert("nikita-1187", reiser4_carry_real(node) != NULL);
3125+ assert("nikita-1188", !node->unlock);
3126+
3127+ node->unlock = 1;
3128+ /* Load node content into memory and install node plugin by
3129+ looking at the node header.
3130+
3131+ Most of the time this call is cheap because the node is
3132+ already in memory.
3133+
3134+ Corresponding zrelse() is in unlock_carry_node()
3135+ */
3136+ return zload(reiser4_carry_real(node));
3137+}
3138+
3139+/* lock carry node
3140+
3141+ "Resolve" node to real znode, lock it and mark as locked.
3142+ This requires recursive locking of znodes.
3143+
3144+   When an operation is posted to the parent level, the node it will be applied to is
3145+   not yet known. For example, when shifting data between two nodes, the
3146+   delimiting key has to be updated in the parent or parents of the nodes involved. But
3147+   their parents are not yet locked and, moreover, said nodes can be reparented
3148+ by concurrent balancing.
3149+
3150+ To work around this, carry operation is applied to special "carry node"
3151+ rather than to the znode itself. Carry node consists of some "base" or
3152+ "reference" znode and flags indicating how to get to the target of carry
3153+ operation (->real_node field of carry_node) from base.
3154+
3155+*/
3156+int lock_carry_node(carry_level * level /* level @node is in */ ,
3157+ carry_node * node /* node to lock */ )
3158+{
3159+ int result;
3160+ znode *reference_point;
3161+ lock_handle lh;
3162+ lock_handle tmp_lh;
3163+ reiser4_tree *tree;
3164+
3165+ assert("nikita-887", level != NULL);
3166+ assert("nikita-882", node != NULL);
3167+
3168+ result = 0;
3169+ reference_point = node->node;
3170+ init_lh(&lh);
3171+ init_lh(&tmp_lh);
3172+ if (node->left_before) {
3173+ /* handling of new nodes, allocated on the previous level:
3174+
3175+      some carry ops were probably posted from the new node, but
3176+ this node neither has parent pointer set, nor is
3177+ connected. This will be done in ->create_hook() for
3178+ internal item.
3179+
3180+      Nonetheless, the parent of the new node has to be locked. To do
3181+ this, first go to the "left" in the carry order. This
3182+ depends on the decision to always allocate new node on the
3183+ right of existing one.
3184+
3185+ Loop handles case when multiple nodes, all orphans, were
3186+ inserted.
3187+
3188+ Strictly speaking, taking tree lock is not necessary here,
3189+ because all nodes scanned by loop in
3190+ find_begetting_brother() are write-locked by this thread,
3191+ and thus, their sibling linkage cannot change.
3192+
3193+ */
3194+ tree = znode_get_tree(reference_point);
3195+ read_lock_tree(tree);
3196+ reference_point = find_begetting_brother(node, level)->node;
3197+ read_unlock_tree(tree);
3198+ assert("nikita-1186", reference_point != NULL);
3199+ }
3200+ if (node->parent && (result == 0)) {
3201+ result =
3202+ reiser4_get_parent(&tmp_lh, reference_point,
3203+ ZNODE_WRITE_LOCK);
3204+ if (result != 0) {
3205+ ; /* nothing */
3206+ } else if (znode_get_level(tmp_lh.node) == 0) {
3207+ assert("nikita-1347", znode_above_root(tmp_lh.node));
3208+ result = add_new_root(level, node, tmp_lh.node);
3209+ if (result == 0) {
3210+ reference_point = level->new_root;
3211+ move_lh(&lh, &node->lock_handle);
3212+ }
3213+ } else if ((level->new_root != NULL)
3214+ && (level->new_root !=
3215+ znode_parent_nolock(reference_point))) {
3216+            /* parent of node exists, but this level already
3217+               created a different new root, so */
3218+ warning("nikita-1109",
3219+ /* it should be "radicis", but tradition is
3220+ tradition. do banshees read latin? */
3221+ "hodie natus est radici frater");
3222+ result = -EIO;
3223+ } else {
3224+ move_lh(&lh, &tmp_lh);
3225+ reference_point = lh.node;
3226+ }
3227+ }
3228+ if (node->left && (result == 0)) {
3229+ assert("nikita-1183", node->parent);
3230+ assert("nikita-883", reference_point != NULL);
3231+ result =
3232+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
3233+ ZNODE_WRITE_LOCK,
3234+ GN_CAN_USE_UPPER_LEVELS);
3235+ if (result == 0) {
3236+ done_lh(&lh);
3237+ move_lh(&lh, &tmp_lh);
3238+ reference_point = lh.node;
3239+ }
3240+ }
3241+ if (!node->parent && !node->left && !node->left_before) {
3242+ result =
3243+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3244+ ZNODE_LOCK_HIPRI);
3245+ }
3246+ if (result == 0) {
3247+ move_lh(&node->lock_handle, &lh);
3248+ result = lock_carry_node_tail(node);
3249+ }
3250+ done_lh(&tmp_lh);
3251+ done_lh(&lh);
3252+ return result;
3253+}
3254+
3255+/* release a lock on &carry_node.
3256+
3257+   Release, if necessary, the lock on @node. This operation is the pair of
3258+ lock_carry_node() and is idempotent: you can call it more than once on the
3259+ same node.
3260+
3261+*/
3262+static void
3263+unlock_carry_node(carry_level * level,
3264+ carry_node * node /* node to be released */ ,
3265+            int failure /* true if node is unlocked due
3266+ * to some error */ )
3267+{
3268+ znode *real_node;
3269+
3270+ assert("nikita-884", node != NULL);
3271+
3272+ real_node = reiser4_carry_real(node);
3273+ /* pair to zload() in lock_carry_node_tail() */
3274+ zrelse(real_node);
3275+ if (node->unlock && (real_node != NULL)) {
3276+ assert("nikita-899", real_node == node->lock_handle.node);
3277+ longterm_unlock_znode(&node->lock_handle);
3278+ }
3279+ if (failure) {
3280+ if (node->deallocate && (real_node != NULL)) {
3281+ /* free node in bitmap
3282+
3283+ Prepare node for removal. Last zput() will finish
3284+ with it.
3285+ */
3286+ ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3287+ }
3288+ if (node->free) {
3289+ assert("nikita-2177",
3290+ list_empty_careful(&node->lock_handle.locks_link));
3291+ assert("nikita-2112",
3292+ list_empty_careful(&node->lock_handle.owners_link));
3293+ reiser4_pool_free(&level->pool->node_pool,
3294+ &node->header);
3295+ }
3296+ }
3297+}
3298+
3299+/* fatal_carry_error() - all-catching error handling function
3300+
3301+   It is possible that carry faces an unrecoverable error, like an inability to
3302+   insert a pointer at the internal level. Our simple solution is just to panic in
3303+   this situation. More sophisticated things like an attempt to remount the
3304+   file-system as read-only can be implemented without much difficulty.
3305+
3306+ It is believed, that:
3307+
3308+   1. instead of panicking, all current transactions can be aborted, rolling the
3309+   system back to a consistent state.
3310+
3311+Umm, if you simply panic without doing anything more at all, then all current
3312+transactions are aborted and the system is rolled back to a consistent state,
3313+by virtue of the design of the transactional mechanism. Well, wait, let's be
3314+precise. If an internal node is corrupted on disk due to hardware failure,
3315+then there may be no consistent state that can be rolled back to, so instead
3316+we should say that it will roll back the transactions, which, barring other
3317+factors, means rolling back to a consistent state.
3318+
3319+# Nikita: there is a subtle difference between panicking and aborting
3320+# transactions: the machine doesn't reboot. Processes aren't killed. Processes
3321+# not using reiser4 (not that we care about such processes), or using other
3322+# reiser4 mounts (about them we do care), will simply continue to run. With
3323+# some luck, even an application using the aborted file system can survive: it
3324+# will get some error, like EBADF, from each file descriptor on the failed file
3325+# system, but applications that care about fault tolerance will cope with this
3326+# (squid will).
3327+
3328+It would be a nice feature, though, to support rollback without a reboot
3329+followed by a remount, but this can wait for later versions.
3330+
3331+ 2. once isolated transactions are implemented, it will be possible to
3332+ roll back the offending transaction.
3333+
3334+2. is additional code complexity of uncertain value (it implies that a broken tree should be kept in operation), so we must
3335+think about it more before deciding whether it should be done. -Hans
3336+
3337+*/
3338+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3339+ * where
3340+ * unrecoverable
3341+ * error
3342+ * occurred */ ,
3343+ int ecode /* error code */ )
3344+{
3345+ assert("nikita-1230", doing != NULL);
3346+ assert("nikita-1231", ecode < 0);
3347+
3348+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3349+}
3350+
3351+/* add new root to the tree
3352+
3353+ This function itself only manages changes in carry structures and delegates
3354+ all hard work (allocation of a znode for the new root, changes of parent and
3355+ sibling pointers) to reiser4_add_tree_root().
3356+
3357+ Locking: old tree root is locked by carry at this point. Fake znode is also
3358+ locked.
3359+
3360+*/
3361+static int add_new_root(carry_level * level /* carry level in context of which
3362+ * operation is performed */ ,
3363+ carry_node * node /* carry node for existing root */ ,
3364+ znode * fake /* "fake" znode already locked by
3365+ * us */ )
3366+{
3367+ int result;
3368+
3369+ assert("nikita-1104", level != NULL);
3370+ assert("nikita-1105", node != NULL);
3371+
3372+ assert("nikita-1403", znode_is_write_locked(node->node));
3373+ assert("nikita-1404", znode_is_write_locked(fake));
3374+
3375+ /* trying to create new root. */
3376+ /* @node is root and it's already locked by us. This
3377+ means that nobody else can be trying to add/remove
3378+ tree root right now.
3379+ */
3380+ if (level->new_root == NULL)
3381+ level->new_root = reiser4_add_tree_root(node->node, fake);
3382+ if (!IS_ERR(level->new_root)) {
3383+ assert("nikita-1210", znode_is_root(level->new_root));
3384+ node->deallocate = 1;
3385+ result =
3386+ longterm_lock_znode(&node->lock_handle, level->new_root,
3387+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3388+ if (result == 0)
3389+ zput(level->new_root);
3390+ } else {
3391+ result = PTR_ERR(level->new_root);
3392+ level->new_root = NULL;
3393+ }
3394+ return result;
3395+}
3396+
3397+/* allocate a new znode and add to the todo level the operation
3398+ that inserts the pointer to it into the parent node
3399+
3400+ Allocate a new znode, add it to the carry queue and post to the @todo queue
3401+ a request to add a pointer to the new node into its parent.
3402+
3403+ This is a carry-related routine that calls reiser4_new_node() to allocate the
3404+ new node.
3405+*/
3406+carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3407+ * node */ ,
3408+ carry_node * ref /* carry node after which new
3409+ * carry node is to be inserted
3410+ * into queue. This affects
3411+ * locking. */ ,
3412+ carry_level * doing /* carry queue where new node is
3413+ * to be added */ ,
3414+ carry_level * todo /* carry queue where COP_INSERT
3415+ * operation to add pointer to
3416+ * new node will be added */ )
3417+{
3418+ carry_node *fresh;
3419+ znode *new_znode;
3420+ carry_op *add_pointer;
3421+ carry_plugin_info info;
3422+
3423+ assert("nikita-1048", brother != NULL);
3424+ assert("nikita-1049", todo != NULL);
3425+
3426+ /* There are a lot of possible variations here: to which parent the
3427+ new node will be attached and where. For simplicity, always
3428+ do the following:
3429+
3430+ (1) new node and @brother will have the same parent.
3431+
3432+ (2) new node is added on the right of @brother
3433+
3434+ */
3435+
3436+ fresh = reiser4_add_carry_skip(doing,
3437+ ref ? POOLO_AFTER : POOLO_LAST, ref);
3438+ if (IS_ERR(fresh))
3439+ return fresh;
3440+
3441+ fresh->deallocate = 1;
3442+ fresh->free = 1;
3443+
3444+ new_znode = reiser4_new_node(brother, znode_get_level(brother));
3445+ if (IS_ERR(new_znode))
3446+ /* @fresh will be deallocated automatically by error
3447+ handling code in the caller. */
3448+ return (carry_node *) new_znode;
3449+
3450+ /* reiser4_new_node() returned a znode with x_count of 1. The caller
3451+ has to decrease it; make_space() does. */
3452+
3453+ ZF_SET(new_znode, JNODE_ORPHAN);
3454+ fresh->node = new_znode;
3455+
3456+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3457+ ref = carry_node_prev(ref);
3458+ assert("nikita-1606", !carry_node_end(doing, ref));
3459+ }
3460+
3461+ info.todo = todo;
3462+ info.doing = doing;
3463+ add_pointer = node_post_carry(&info, COP_INSERT,
3464+ reiser4_carry_real(ref), 1);
3465+ if (IS_ERR(add_pointer)) {
3466+ /* no need to deallocate @new_znode here: it will be
3467+ deallocated during carry error handling. */
3468+ return (carry_node *) add_pointer;
3469+ }
3470+
3471+ add_pointer->u.insert.type = COPT_CHILD;
3472+ add_pointer->u.insert.child = fresh;
3473+ add_pointer->u.insert.brother = brother;
3474+ /* initially the new node spans an empty key range */
3475+ write_lock_dk(znode_get_tree(brother));
3476+ znode_set_ld_key(new_znode,
3477+ znode_set_rd_key(new_znode,
3478+ znode_get_rd_key(brother)));
3479+ write_unlock_dk(znode_get_tree(brother));
3480+ return fresh;
3481+}
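For illustration only (the real callers are the node plugins, which are not part of this hunk), a hypothetical split helper shows the calling convention of add_new_znode(): the fresh node receives shifted items, and the COP_INSERT posted on @todo later attaches it to the parent.

/* Editor's sketch, not part of the patch; split_right_of() is a
 * hypothetical name. */
static carry_node *split_right_of(carry_node * node,
				  carry_level * doing, carry_level * todo)
{
	carry_node *fresh;

	fresh = add_new_znode(reiser4_carry_real(node), node, doing, todo);
	if (IS_ERR(fresh))
		return fresh;
	/* items can now be shifted from reiser4_carry_real(node) into
	   fresh->node; the parent pointer is inserted when @todo runs */
	return fresh;
}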
3482+
3483+/* DEBUGGING FUNCTIONS.
3484+
3485+ Probably we should also leave them enabled even when
3486+ debugging is turned off, to print dumps on errors.
3487+*/
3488+#if REISER4_DEBUG
3489+static int carry_level_invariant(carry_level * level, carry_queue_state state)
3490+{
3491+ carry_node *node;
3492+ carry_node *tmp_node;
3493+
3494+ if (level == NULL)
3495+ return 0;
3496+
3497+ if (level->track_type != 0 &&
3498+ level->track_type != CARRY_TRACK_NODE &&
3499+ level->track_type != CARRY_TRACK_CHANGE)
3500+ return 0;
3501+
3502+ /* check that nodes are in ascending order */
3503+ for_all_nodes(level, node, tmp_node) {
3504+ znode *left;
3505+ znode *right;
3506+
3507+ reiser4_key lkey;
3508+ reiser4_key rkey;
3509+
3510+ if (node != carry_node_front(level)) {
3511+ if (state == CARRY_TODO) {
3512+ right = node->node;
3513+ left = carry_node_prev(node)->node;
3514+ } else {
3515+ right = reiser4_carry_real(node);
3516+ left = reiser4_carry_real(carry_node_prev(node));
3517+ }
3518+ if (right == NULL || left == NULL)
3519+ continue;
3520+ if (node_is_empty(right) || node_is_empty(left))
3521+ continue;
3522+ if (!keyle(leftmost_key_in_node(left, &lkey),
3523+ leftmost_key_in_node(right, &rkey))) {
3524+ warning("", "wrong key order");
3525+ return 0;
3526+ }
3527+ }
3528+ }
3529+ return 1;
3530+}
3531+#endif
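Nothing in this hunk calls carry_level_invariant() directly; as a hedged usage note, with REISER4_DEBUG enabled a queue that has not yet been processed could be checked roughly as sketched below (the helper name and assertion label are made up for the example).

#if REISER4_DEBUG
/* Editor's sketch only: a check one could drop into a carry routine
 * that owns a not-yet-processed @todo queue. */
static void check_todo_queue(carry_level * todo)
{
	assert("editor-example-1", carry_level_invariant(todo, CARRY_TODO));
}
#endif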
3532+
3533+/* get symbolic name for boolean */
3534+static const char *tf(int boolean /* truth value */ )
3535+{
3536+ return boolean ? "t" : "f";
3537+}
3538+
3539+/* symbolic name for carry operation */
3540+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3541+{
3542+ switch (op) {
3543+ case COP_INSERT:
3544+ return "COP_INSERT";
3545+ case COP_DELETE:
3546+ return "COP_DELETE";
3547+ case COP_CUT:
3548+ return "COP_CUT";
3549+ case COP_PASTE:
3550+ return "COP_PASTE";
3551+ case COP_UPDATE:
3552+ return "COP_UPDATE";
3553+ case COP_EXTENT:
3554+ return "COP_EXTENT";
3555+ case COP_INSERT_FLOW:
3556+ return "COP_INSERT_FLOW";
3557+ default:{
3558+ /* not mt safe, but who cares? */
3559+ static char buf[20];
3560+
3561+ sprintf(buf, "unknown op: %x", op);
3562+ return buf;
3563+ }
3564+ }
3565+}
3566+
3567+/* dump information about carry node */
3568+static void print_carry(const char *prefix /* prefix to print */ ,
3569+ carry_node * node /* node to print */ )
3570+{
3571+ if (node == NULL) {
3572+ printk("%s: null\n", prefix);
3573+ return;
3574+ }
3575+ printk
3576+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3577+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3578+ tf(node->free), tf(node->deallocate));
3579+}
3580+
3581+/* dump information about carry operation */
3582+static void print_op(const char *prefix /* prefix to print */ ,
3583+ carry_op * op /* operation to print */ )
3584+{
3585+ if (op == NULL) {
3586+ printk("%s: null\n", prefix);
3587+ return;
3588+ }
3589+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3590+ print_carry("\tnode", op->node);
3591+ switch (op->op) {
3592+ case COP_INSERT:
3593+ case COP_PASTE:
3594+ print_coord("\tcoord",
3595+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3596+ reiser4_print_key("\tkey",
3597+ op->u.insert.d ? op->u.insert.d->key : NULL);
3598+ print_carry("\tchild", op->u.insert.child);
3599+ break;
3600+ case COP_DELETE:
3601+ print_carry("\tchild", op->u.delete.child);
3602+ break;
3603+ case COP_CUT:
3604+ if (op->u.cut_or_kill.is_cut) {
3605+ print_coord("\tfrom",
3606+ op->u.cut_or_kill.u.kill->params.from, 0);
3607+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3608+ 0);
3609+ } else {
3610+ print_coord("\tfrom",
3611+ op->u.cut_or_kill.u.cut->params.from, 0);
3612+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3613+ 0);
3614+ }
3615+ break;
3616+ case COP_UPDATE:
3617+ print_carry("\tleft", op->u.update.left);
3618+ break;
3619+ default:
3620+ /* do nothing */
3621+ break;
3622+ }
3623+}
3624+
3625+/* dump information about all nodes and operations in a @level */
3626+static void print_level(const char *prefix /* prefix to print */ ,
3627+ carry_level * level /* level to print */ )
3628+{
3629+ carry_node *node;
3630+ carry_node *tmp_node;
3631+ carry_op *op;
3632+ carry_op *tmp_op;
3633+
3634+ if (level == NULL) {
3635+ printk("%s: null\n", prefix);
3636+ return;
3637+ }
3638+ printk("%s: %p, restartable: %s\n",
3639+ prefix, level, tf(level->restartable));
3640+
3641+ for_all_nodes(level, node, tmp_node)
3642+ print_carry("\tcarry node", node);
3643+ for_all_ops(level, op, tmp_op)
3644+ print_op("\tcarry op", op);
3645+}
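As a usage note, the dump helpers above are intended for hand-added debugging code; for example, one might temporarily print the failed queue just before the reiser4_panic() call in fatal_carry_error() above, where @doing is available.

/* Editor's sketch only: a temporary debugging line. */
print_level("failed doing queue", doing);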
3646+
3647+/* Make Linus happy.
3648+ Local variables:
3649+ c-indentation-style: "K&R"
3650+ mode-name: "LC"
3651+ c-basic-offset: 8
3652+ tab-width: 8
3653+ fill-column: 120
3654+ scroll-step: 1
3655+ End:
3656+*/
3657diff -urN linux-2.6.20.orig/fs/reiser4/carry.h linux-2.6.20/fs/reiser4/carry.h
3658--- linux-2.6.20.orig/fs/reiser4/carry.h 1970-01-01 03:00:00.000000000 +0300
3659+++ linux-2.6.20/fs/reiser4/carry.h 2007-05-06 14:50:43.690973225 +0400
3660@@ -0,0 +1,442 @@
3661+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3662+
3663+/* Functions and data types to "carry" tree modification(s) upward.
3664+ See fs/reiser4/carry.c for details. */
3665+
3666+#if !defined( __FS_REISER4_CARRY_H__ )
3667+#define __FS_REISER4_CARRY_H__
3668+
3669+#include "forward.h"
3670+#include "debug.h"
3671+#include "pool.h"
3672+#include "znode.h"
3673+
3674+#include <linux/types.h>
3675+
3676+/* &carry_node - "location" of carry node.
3677+
3678+ "location" of node that is involved or going to be involved into
3679+ carry process. Node where operation will be carried to on the
3680+ parent level cannot be recorded explicitly. Operation will be carried
3681+ usually to the parent of some node (where changes are performed at
3682+ the current level) or, to the left neighbor of its parent. But while
3683+ modifications are performed at the current level, parent may
3684+ change. So, we have to allow some indirection (or, positevly,
3685+ flexibility) in locating carry nodes.
3686+
3687+*/
3688+typedef struct carry_node {
3689+ /* pool linkage */
3690+ reiser4_pool_header header;
3691+
3692+ /* base node from which real_node is calculated. See
3693+ fs/reiser4/carry.c:lock_carry_node(). */
3694+ znode *node;
3695+
3696+ /* how to get ->real_node */
3697+ /* to get ->real_node obtain parent of ->node */
3698+ __u32 parent:1;
3699+ /* to get ->real_node obtain left neighbor of parent of
3700+ ->node */
3701+ __u32 left:1;
3702+ __u32 left_before:1;
3703+
3704+ /* locking */
3705+
3706+ /* this node was locked by carry process and should be
3707+ unlocked when carry leaves a level */
3708+ __u32 unlock:1;
3709+
3710+ /* disk block for this node was allocated by carry process and
3711+ should be deallocated when carry leaves a level */
3712+ __u32 deallocate:1;
3713+ /* this carry node was allocated by carry process and should be
3714+ freed when carry leaves a level */
3715+ __u32 free:1;
3716+
3717+ /* type of lock we want to take on this node */
3718+ lock_handle lock_handle;
3719+} carry_node;
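Purely as an illustration of the indirection described above, the sketch below shows how a caller could describe "the left neighbor of the parent of znode z": the allocator call mirrors reiser4_add_carry_skip() as used in carry.c earlier in this patch, and the variables todo and z are hypothetical.

/* Editor's sketch, not part of the patch. */
carry_node *cn;

cn = reiser4_add_carry_skip(todo, POOLO_LAST, NULL);
if (!IS_ERR(cn)) {
	cn->node = z;	/* base znode */
	cn->parent = 1;	/* real node is the parent of ->node ... */
	cn->left = 1;	/* ... or rather the left neighbor of that parent */
	cn->free = 1;	/* carry returns this carry_node to the pool */
}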
3720+
3721+/* &carry_opcode - elementary operations that can be carried upward
3722+
3723+ Operations that carry() can handle. This list is supposed to be
3724+ expanded.
3725+
3726+ Each carry operation (cop) is handled by an appropriate function defined
3727+ in fs/reiser4/carry.c. For example, COP_INSERT is handled by
3728+ fs/reiser4/carry.c:carry_insert(), etc. These functions in turn
3729+ call the plugins of the nodes affected by the operation to modify the nodes'
3730+ content and to gather operations to be performed on the next level.
3731+
3732+*/
3733+typedef enum {
3734+ /* insert new item into node. */
3735+ COP_INSERT,
3736+ /* delete pointer from parent node */
3737+ COP_DELETE,
3738+ /* remove part of or whole node. */
3739+ COP_CUT,
3740+ /* increase size of item. */
3741+ COP_PASTE,
3742+ /* insert extent (that is sequence of unformatted nodes). */
3743+ COP_EXTENT,
3744+ /* update delimiting key in least common ancestor of two
3745+ nodes. This is performed when items are moved between two
3746+ nodes.
3747+ */
3748+ COP_UPDATE,
3749+ /* insert flow */
3750+ COP_INSERT_FLOW,
3751+ COP_LAST_OP,
3752+} carry_opcode;
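For orientation, the snippet below condenses what add_new_znode() in carry.c (earlier in this patch) already does: an opcode is queued for the next level by filling a carry_plugin_info and posting it against the node whose parent has to change.

/* Editor's sketch, condensed from add_new_znode() in carry.c. */
carry_plugin_info info;
carry_op *op;

info.doing = doing;
info.todo = todo;
op = node_post_carry(&info, COP_INSERT, reiser4_carry_real(ref), 1);
if (!IS_ERR(op)) {
	op->u.insert.type = COPT_CHILD;
	op->u.insert.child = fresh;
	op->u.insert.brother = brother;
}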
3753+
3754+#define CARRY_FLOW_NEW_NODES_LIMIT 20
3755+
3756+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3757+ item is determined. */
3758+typedef enum {
3759+ /* target item is one containing pointer to the ->child node */
3760+ COPT_CHILD,
3761+ /* target item is given explicitly by @coord */
3762+ COPT_ITEM_DATA,
3763+ /* target item is given by key */
3764+ COPT_KEY,
3765+ /* see insert_paste_common() for more comments on this. */
3766+ COPT_PASTE_RESTARTED,
3767+} cop_insert_pos_type;
3768+
3769+/* flags to cut and delete */
3770+typedef enum {
3771+ /* don't kill the node even if it becomes completely empty as a result of
3772+ * the cut. This is needed for eottl handling. See carry_extent() for
3773+ * details. */
3774+ DELETE_RETAIN_EMPTY = (1 << 0)
3775+} cop_delete_flag;
3776+
3777+/*
3778+ * carry() implements "lock handle tracking" feature.
3779+ *
3780+ * Callers supply carry with the node where the initial operation is to be
3781+ * performed and a lock handle on this node. Trying to optimize node
3782+ * utilization, carry may actually move the insertion point to a different
3783+ * node. Callers expect that the lock handle will be transferred to the new node also.
3784+ *
3785+ */
3786+typedef enum {
3787+ /* transfer lock handle along with insertion point */
3788+ CARRY_TRACK_CHANGE = 1,
3789+ /* acquire a new lock handle on the node where the insertion point is. This
3790+ * is used when the carry() client doesn't initially possess a lock handle
3791+ * on the insertion point node, as is the case for the extent insertion
3792+ * code. See carry_extent(). */
3793+ CARRY_TRACK_NODE = 2