]> git.ipfire.org Git - ipfire-2.x.git/blame - src/patches/reiser4-for-2.6.20.patch
Enables sdl for directfb
[ipfire-2.x.git] / src / patches / reiser4-for-2.6.20.patch
CommitLineData
4ce37908
MT
1 Documentation/Changes | 12 +
2 Documentation/filesystems/reiser4.txt | 75 +
3 arch/i386/lib/usercopy.c | 2 +
4 fs/Kconfig | 2 +
5 fs/Makefile | 1 +
6 fs/fs-writeback.c | 26 +-
7 fs/reiser4/Kconfig | 32 +
8 fs/reiser4/Makefile | 99 +
9 fs/reiser4/README | 125 +
10 fs/reiser4/as_ops.c | 339 +++
11 fs/reiser4/block_alloc.c | 1137 ++++++++
12 fs/reiser4/block_alloc.h | 175 ++
13 fs/reiser4/blocknrset.c | 368 +++
14 fs/reiser4/carry.c | 1391 +++++++++
15 fs/reiser4/carry.h | 442 +++
16 fs/reiser4/carry_ops.c | 2131 ++++++++++++++
17 fs/reiser4/carry_ops.h | 42 +
18 fs/reiser4/context.c | 288 ++
19 fs/reiser4/context.h | 228 ++
20 fs/reiser4/coord.c | 935 ++++++
21 fs/reiser4/coord.h | 389 +++
22 fs/reiser4/debug.c | 308 ++
23 fs/reiser4/debug.h | 350 +++
24 fs/reiser4/dformat.h | 70 +
25 fs/reiser4/dscale.c | 174 ++
26 fs/reiser4/dscale.h | 27 +
27 fs/reiser4/entd.c | 335 +++
28 fs/reiser4/entd.h | 90 +
29 fs/reiser4/eottl.c | 509 ++++
30 fs/reiser4/estimate.c | 120 +
31 fs/reiser4/export_ops.c | 295 ++
32 fs/reiser4/flush.c | 3622 ++++++++++++++++++++++++
33 fs/reiser4/flush.h | 274 ++
34 fs/reiser4/flush_queue.c | 680 +++++
35 fs/reiser4/forward.h | 256 ++
36 fs/reiser4/fsdata.c | 804 ++++++
37 fs/reiser4/fsdata.h | 207 ++
38 fs/reiser4/init_super.c | 750 +++++
39 fs/reiser4/inode.c | 709 +++++
40 fs/reiser4/inode.h | 438 +++
41 fs/reiser4/ioctl.h | 41 +
42 fs/reiser4/jnode.c | 1925 +++++++++++++
43 fs/reiser4/jnode.h | 705 +++++
44 fs/reiser4/kassign.c | 661 +++++
45 fs/reiser4/kassign.h | 110 +
46 fs/reiser4/key.c | 137 +
47 fs/reiser4/key.h | 384 +++
48 fs/reiser4/ktxnmgrd.c | 215 ++
49 fs/reiser4/ktxnmgrd.h | 52 +
50 fs/reiser4/lock.c | 1232 ++++++++
51 fs/reiser4/lock.h | 249 ++
52 fs/reiser4/oid.c | 141 +
53 fs/reiser4/page_cache.c | 736 +++++
54 fs/reiser4/page_cache.h | 68 +
55 fs/reiser4/plugin/Makefile | 26 +
56 fs/reiser4/plugin/cluster.c | 71 +
57 fs/reiser4/plugin/cluster.h | 343 +++
58 fs/reiser4/plugin/compress/Makefile | 6 +
59 fs/reiser4/plugin/compress/compress.c | 381 +++
60 fs/reiser4/plugin/compress/compress.h | 38 +
61 fs/reiser4/plugin/compress/compress_mode.c | 162 ++
62 fs/reiser4/plugin/compress/lzoconf.h | 216 ++
63 fs/reiser4/plugin/compress/minilzo.c | 1967 +++++++++++++
64 fs/reiser4/plugin/compress/minilzo.h | 70 +
65 fs/reiser4/plugin/crypto/cipher.c | 37 +
66 fs/reiser4/plugin/crypto/cipher.h | 55 +
67 fs/reiser4/plugin/crypto/digest.c | 58 +
68 fs/reiser4/plugin/dir/Makefile | 5 +
69 fs/reiser4/plugin/dir/dir.h | 36 +
70 fs/reiser4/plugin/dir/hashed_dir.c | 81 +
71 fs/reiser4/plugin/dir/seekable_dir.c | 46 +
72 fs/reiser4/plugin/dir_plugin_common.c | 872 ++++++
73 fs/reiser4/plugin/disk_format/Makefile | 5 +
74 fs/reiser4/plugin/disk_format/disk_format.c | 38 +
75 fs/reiser4/plugin/disk_format/disk_format.h | 27 +
76 fs/reiser4/plugin/disk_format/disk_format40.c | 655 +++++
77 fs/reiser4/plugin/disk_format/disk_format40.h | 109 +
78 fs/reiser4/plugin/fibration.c | 175 ++
79 fs/reiser4/plugin/fibration.h | 37 +
80 fs/reiser4/plugin/file/Makefile | 7 +
81 fs/reiser4/plugin/file/cryptcompress.c | 3760 +++++++++++++++++++++++++
82 fs/reiser4/plugin/file/cryptcompress.h | 554 ++++
83 fs/reiser4/plugin/file/file.c | 2820 ++++++++++++++++++
84 fs/reiser4/plugin/file/file.h | 272 ++
85 fs/reiser4/plugin/file/file_conversion.c | 594 ++++
86 fs/reiser4/plugin/file/invert.c | 493 ++++
87 fs/reiser4/plugin/file/symfile.c | 87 +
88 fs/reiser4/plugin/file/symlink.c | 95 +
89 fs/reiser4/plugin/file/tail_conversion.c | 726 +++++
90 fs/reiser4/plugin/file_ops.c | 168 ++
91 fs/reiser4/plugin/file_ops_readdir.c | 657 +++++
92 fs/reiser4/plugin/file_plugin_common.c | 1007 +++++++
93 fs/reiser4/plugin/hash.c | 353 +++
94 fs/reiser4/plugin/inode_ops.c | 897 ++++++
95 fs/reiser4/plugin/inode_ops_rename.c | 914 ++++++
96 fs/reiser4/plugin/item/Makefile | 18 +
97 fs/reiser4/plugin/item/acl.h | 66 +
98 fs/reiser4/plugin/item/blackbox.c | 142 +
99 fs/reiser4/plugin/item/blackbox.h | 33 +
100 fs/reiser4/plugin/item/cde.c | 1008 +++++++
101 fs/reiser4/plugin/item/cde.h | 87 +
102 fs/reiser4/plugin/item/ctail.c | 1570 +++++++++++
103 fs/reiser4/plugin/item/ctail.h | 97 +
104 fs/reiser4/plugin/item/extent.c | 197 ++
105 fs/reiser4/plugin/item/extent.h | 231 ++
106 fs/reiser4/plugin/item/extent_file_ops.c | 1435 ++++++++++
107 fs/reiser4/plugin/item/extent_flush_ops.c | 1028 +++++++
108 fs/reiser4/plugin/item/extent_item_ops.c | 889 ++++++
109 fs/reiser4/plugin/item/internal.c | 396 +++
110 fs/reiser4/plugin/item/internal.h | 57 +
111 fs/reiser4/plugin/item/item.c | 719 +++++
112 fs/reiser4/plugin/item/item.h | 400 +++
113 fs/reiser4/plugin/item/sde.c | 190 ++
114 fs/reiser4/plugin/item/sde.h | 66 +
115 fs/reiser4/plugin/item/static_stat.c | 1106 ++++++++
116 fs/reiser4/plugin/item/static_stat.h | 224 ++
117 fs/reiser4/plugin/item/tail.c | 812 ++++++
118 fs/reiser4/plugin/item/tail.h | 58 +
119 fs/reiser4/plugin/node/Makefile | 5 +
120 fs/reiser4/plugin/node/node.c | 131 +
121 fs/reiser4/plugin/node/node.h | 272 ++
122 fs/reiser4/plugin/node/node40.c | 2924 +++++++++++++++++++
123 fs/reiser4/plugin/node/node40.h | 125 +
124 fs/reiser4/plugin/object.c | 516 ++++
125 fs/reiser4/plugin/object.h | 121 +
126 fs/reiser4/plugin/plugin.c | 578 ++++
127 fs/reiser4/plugin/plugin.h | 920 ++++++
128 fs/reiser4/plugin/plugin_header.h | 144 +
129 fs/reiser4/plugin/plugin_set.c | 379 +++
130 fs/reiser4/plugin/plugin_set.h | 77 +
131 fs/reiser4/plugin/security/Makefile | 4 +
132 fs/reiser4/plugin/security/perm.c | 44 +
133 fs/reiser4/plugin/security/perm.h | 82 +
134 fs/reiser4/plugin/space/Makefile | 4 +
135 fs/reiser4/plugin/space/bitmap.c | 1585 +++++++++++
136 fs/reiser4/plugin/space/bitmap.h | 47 +
137 fs/reiser4/plugin/space/space_allocator.h | 80 +
138 fs/reiser4/plugin/tail_policy.c | 113 +
139 fs/reiser4/pool.c | 234 ++
140 fs/reiser4/pool.h | 55 +
141 fs/reiser4/readahead.c | 138 +
142 fs/reiser4/readahead.h | 48 +
143 fs/reiser4/reiser4.h | 269 ++
144 fs/reiser4/safe_link.c | 351 +++
145 fs/reiser4/safe_link.h | 29 +
146 fs/reiser4/seal.c | 218 ++
147 fs/reiser4/seal.h | 49 +
148 fs/reiser4/search.c | 1611 +++++++++++
149 fs/reiser4/status_flags.c | 175 ++
150 fs/reiser4/status_flags.h | 43 +
151 fs/reiser4/super.c | 316 +++
152 fs/reiser4/super.h | 464 +++
153 fs/reiser4/super_ops.c | 730 +++++
154 fs/reiser4/tap.c | 377 +++
155 fs/reiser4/tap.h | 70 +
156 fs/reiser4/tree.c | 1876 ++++++++++++
157 fs/reiser4/tree.h | 577 ++++
158 fs/reiser4/tree_mod.c | 386 +++
159 fs/reiser4/tree_mod.h | 29 +
160 fs/reiser4/tree_walk.c | 927 ++++++
161 fs/reiser4/tree_walk.h | 125 +
162 fs/reiser4/txnmgr.c | 3164 +++++++++++++++++++++
163 fs/reiser4/txnmgr.h | 708 +++++
164 fs/reiser4/type_safe_hash.h | 320 +++
165 fs/reiser4/vfs_ops.c | 259 ++
166 fs/reiser4/vfs_ops.h | 53 +
167 fs/reiser4/wander.c | 1797 ++++++++++++
168 fs/reiser4/wander.h | 135 +
169 fs/reiser4/writeout.h | 21 +
170 fs/reiser4/znode.c | 1029 +++++++
171 fs/reiser4/znode.h | 434 +++
172 include/linux/fs.h | 3 +
173 lib/radix-tree.c | 1 +
174 mm/filemap.c | 5 +
175 mm/readahead.c | 1 +
176 175 files changed, 79830 insertions(+), 10 deletions(-)
177
178diff --git a/Documentation/Changes b/Documentation/Changes
179index 73a8617..49ee889 100644
180--- a/Documentation/Changes
181+++ b/Documentation/Changes
182@@ -36,6 +36,7 @@ o module-init-tools 0.9.10 # depmod -V
183 o e2fsprogs 1.29 # tune2fs
184 o jfsutils 1.1.3 # fsck.jfs -V
185 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
186+o reiser4progs 1.0.0 # fsck.reiser4 -V
187 o xfsprogs 2.6.0 # xfs_db -V
188 o pcmciautils 004 # pccardctl -V
189 o quota-tools 3.09 # quota -V
190@@ -144,6 +145,13 @@ The reiserfsprogs package should be used for reiserfs-3.6.x
191 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
192 reiserfsck. These utils work on both i386 and alpha platforms.
193
194+Reiser4progs
195+------------
196+
197+The reiser4progs package contains utilities for the reiser4 file system.
198+Detailed instructions are provided in the README file located at:
199+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
200+
201 Xfsprogs
202 --------
203
204@@ -322,6 +330,10 @@ Reiserfsprogs
205 -------------
206 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
207
208+Reiser4progs
209+------------
210+o <ftp://ftp.namesys.com/pub/reiser4progs/>
211+
212 Xfsprogs
213 --------
214 o <ftp://oss.sgi.com/projects/xfs/download/>
215diff --git a/Documentation/filesystems/reiser4.txt b/Documentation/filesystems/reiser4.txt
216new file mode 100644
217index 0000000..8e07c9e
218--- /dev/null
219+++ b/Documentation/filesystems/reiser4.txt
220@@ -0,0 +1,75 @@
221+Reiser4 filesystem
222+==================
223+Reiser4 is a file system based on dancing tree algorithms, and is
224+described at http://www.namesys.com
225+
226+
227+References
228+==========
229+web page http://namesys.com/v4/v4.html
230+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
231+userland tools ftp://ftp.namesys.com/pub/reiser4progs/
232+install page http://www.namesys.com/install_v4.html
233+
234+Compile options
235+===============
236+Enable reiser4 debug mode
237+ This checks everything imaginable while reiser4
238+ runs
239+
240+Mount options
241+=============
242+tmgr.atom_max_size=N
243+ Atoms containing more than N blocks will be forced to commit.
244+ N is decimal.
245+ Default is nr_free_pagecache_pages() / 2 at mount time.
246+
247+tmgr.atom_max_age=N
248+ Atoms older than N seconds will be forced to commit. N is decimal.
249+ Default is 600.
250+
251+tmgr.atom_max_flushers=N
252+ Limit of concurrent flushers for one atom. 0 means no limit.
253+ Default is 0.
254+
255+tree.cbk_cache.nr_slots=N
256+ Number of slots in the cbk cache.
257+
258+flush.relocate_threshold=N
259+ If flush finds more than N adjacent dirty leaf-level blocks it
260+ will force them to be relocated.
261+ Default is 64.
262+
263+flush.relocate_distance=N
 264+	If flush can find a block allocation at a distance of at most
 265+	N from the preceder it will relocate to that position.
266+ Default is 64.
267+
268+flush.scan_maxnodes=N
269+ The maximum number of nodes to scan left on a level during
270+ flush.
271+ Default is 10000.
272+
273+optimal_io_size=N
274+ Preferred IO size. This value is used to set st_blksize of
275+ struct stat.
276+ Default is 65536.
277+
278+bsdgroups
279+ Turn on BSD-style gid assignment.
280+
281+32bittimes
 282+	By default files in reiser4 have 64 bit timestamps. Files
283+ created when filesystem is mounted with 32bittimes mount
284+ option will get 32 bit timestamps.
285+
286+mtflush
287+ Turn off concurrent flushing.
288+
289+nopseudo
290+ Disable pseudo files support. See
291+ http://namesys.com/v4/pseudo.html for more about pseudo files.
292+
293+dont_load_bitmap
294+ Don't load all bitmap blocks at mount time, it is useful for
295+ machines with tiny RAM and large disks.
296diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
297index d22cfc9..bb4a75a 100644
298--- a/arch/i386/lib/usercopy.c
299+++ b/arch/i386/lib/usercopy.c
300@@ -812,6 +812,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
301 #endif
302 return n;
303 }
304+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
305
306 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
307 unsigned long n)
308@@ -827,6 +828,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
309 #endif
310 return n;
311 }
312+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
313
314 /**
315 * copy_to_user: - Copy a block of data into user space.
316diff --git a/fs/Kconfig b/fs/Kconfig
317index 8cd2417..5a97039 100644
318--- a/fs/Kconfig
319+++ b/fs/Kconfig
320@@ -272,6 +272,8 @@ config FS_MBCACHE
321 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
322 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
323
324+source "fs/reiser4/Kconfig"
325+
326 config REISERFS_FS
327 tristate "Reiserfs support"
328 help
329diff --git a/fs/Makefile b/fs/Makefile
330index b9ffa63..b4c08ce 100644
331--- a/fs/Makefile
332+++ b/fs/Makefile
333@@ -62,6 +62,7 @@ obj-$(CONFIG_DLM) += dlm/
334
335 # Do not add any filesystems before this line
336 obj-$(CONFIG_REISERFS_FS) += reiserfs/
337+obj-$(CONFIG_REISER4_FS) += reiser4/
338 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
339 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
340 obj-$(CONFIG_JBD) += jbd/
341diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
342index a4b142a..cdcff8c 100644
343--- a/fs/fs-writeback.c
344+++ b/fs/fs-writeback.c
345@@ -296,8 +296,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
346 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
347 * that it can be located for waiting on in __writeback_single_inode().
348 *
349- * Called under inode_lock.
350- *
351 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
352 * This function assumes that the blockdev superblock's inodes are backed by
353 * a variety of queues, so all inodes are searched. For other superblocks,
354@@ -313,11 +311,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
355 * on the writer throttling path, and we get decent balancing between many
356 * throttled threads: we don't want them all piling up on __wait_on_inode.
357 */
358-static void
359-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
360+void
361+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
362 {
363 const unsigned long start = jiffies; /* livelock avoidance */
364
365+ spin_lock(&inode_lock);
366+
367 if (!wbc->for_kupdate || list_empty(&sb->s_io))
368 list_splice_init(&sb->s_dirty, &sb->s_io);
369
370@@ -397,8 +397,19 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
371 if (wbc->nr_to_write <= 0)
372 break;
373 }
374+ spin_unlock(&inode_lock);
375 return; /* Leave any unwritten inodes on s_io */
376 }
377+EXPORT_SYMBOL(generic_sync_sb_inodes);
378+
379+static void
380+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
381+{
382+ if (sb->s_op->sync_inodes)
383+ sb->s_op->sync_inodes(sb, wbc);
384+ else
385+ generic_sync_sb_inodes(sb, wbc);
386+}
387
388 /*
389 * Start writeback of dirty pagecache data against all unlocked inodes.
390@@ -439,11 +450,8 @@ restart:
391 * be unmounted by the time it is released.
392 */
393 if (down_read_trylock(&sb->s_umount)) {
394- if (sb->s_root) {
395- spin_lock(&inode_lock);
396+ if (sb->s_root)
397 sync_sb_inodes(sb, wbc);
398- spin_unlock(&inode_lock);
399- }
400 up_read(&sb->s_umount);
401 }
402 spin_lock(&sb_lock);
403@@ -481,9 +489,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
404 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
405 nr_dirty + nr_unstable;
406 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
407- spin_lock(&inode_lock);
408 sync_sb_inodes(sb, &wbc);
409- spin_unlock(&inode_lock);
410 }
411
412 /*
413diff --git a/fs/reiser4/Kconfig b/fs/reiser4/Kconfig
414new file mode 100644
415index 0000000..f6e5195
416--- /dev/null
417+++ b/fs/reiser4/Kconfig
418@@ -0,0 +1,32 @@
419+config REISER4_FS
420+ tristate "Reiser4 (EXPERIMENTAL)"
421+ depends on EXPERIMENTAL
422+ select ZLIB_INFLATE
423+ select ZLIB_DEFLATE
424+ select CRYPTO
425+ help
426+ Reiser4 is a filesystem that performs all filesystem operations
427+ as atomic transactions, which means that it either performs a
428+ write, or it does not, and in the event of a crash it does not
429+ partially perform it or corrupt it.
430+
431+ It stores files in dancing trees, which are like balanced trees but
432+ faster. It packs small files together so that they share blocks
433+ without wasting space. This means you can use it to store really
434+ small files. It also means that it saves you disk space. It avoids
435+ hassling you with anachronisms like having a maximum number of
436+ inodes, and wasting space if you use less than that number.
437+
438+ Reiser4 is a distinct filesystem type from reiserfs (V3).
439+ It's therefore not possible to use reiserfs file systems
440+ with reiser4.
441+
442+ To learn more about reiser4, go to http://www.namesys.com
443+
444+config REISER4_DEBUG
445+ bool "Enable reiser4 debug mode"
446+ depends on REISER4_FS
447+ help
448+ Don't use this unless you are debugging reiser4.
449+
450+ If unsure, say N.
451diff --git a/fs/reiser4/Makefile b/fs/reiser4/Makefile
452new file mode 100644
453index 0000000..e78441e
454--- /dev/null
455+++ b/fs/reiser4/Makefile
456@@ -0,0 +1,99 @@
457+#
458+# reiser4/Makefile
459+#
460+
461+obj-$(CONFIG_REISER4_FS) += reiser4.o
462+
463+reiser4-y := \
464+ debug.o \
465+ jnode.o \
466+ znode.o \
467+ key.o \
468+ pool.o \
469+ tree_mod.o \
470+ estimate.o \
471+ carry.o \
472+ carry_ops.o \
473+ lock.o \
474+ tree.o \
475+ context.o \
476+ tap.o \
477+ coord.o \
478+ block_alloc.o \
479+ txnmgr.o \
480+ kassign.o \
481+ flush.o \
482+ wander.o \
483+ eottl.o \
484+ search.o \
485+ page_cache.o \
486+ seal.o \
487+ dscale.o \
488+ flush_queue.o \
489+ ktxnmgrd.o \
490+ blocknrset.o \
491+ super.o \
492+ super_ops.o \
493+ fsdata.o \
494+ export_ops.o \
495+ oid.o \
496+ tree_walk.o \
497+ inode.o \
498+ vfs_ops.o \
499+ as_ops.o \
500+ entd.o\
501+ readahead.o \
502+ status_flags.o \
503+ init_super.o \
504+ safe_link.o \
505+ \
506+ plugin/plugin.o \
507+ plugin/plugin_set.o \
508+ plugin/node/node.o \
509+ plugin/object.o \
510+ plugin/cluster.o \
511+ plugin/inode_ops.o \
512+ plugin/inode_ops_rename.o \
513+ plugin/file_ops.o \
514+ plugin/file_ops_readdir.o \
515+ plugin/file_plugin_common.o \
516+ plugin/file/file.o \
517+ plugin/file/tail_conversion.o \
518+ plugin/file/file_conversion.o \
519+ plugin/file/symlink.o \
520+ plugin/file/cryptcompress.o \
521+ plugin/dir_plugin_common.o \
522+ plugin/dir/hashed_dir.o \
523+ plugin/dir/seekable_dir.o \
524+ plugin/node/node40.o \
525+ \
526+ plugin/crypto/cipher.o \
527+ plugin/crypto/digest.o \
528+ \
529+ plugin/compress/minilzo.o \
530+ plugin/compress/compress.o \
531+ plugin/compress/compress_mode.o \
532+ \
533+ plugin/item/static_stat.o \
534+ plugin/item/sde.o \
535+ plugin/item/cde.o \
536+ plugin/item/blackbox.o \
537+ plugin/item/internal.o \
538+ plugin/item/tail.o \
539+ plugin/item/ctail.o \
540+ plugin/item/extent.o \
541+ plugin/item/extent_item_ops.o \
542+ plugin/item/extent_file_ops.o \
543+ plugin/item/extent_flush_ops.o \
544+ \
545+ plugin/hash.o \
546+ plugin/fibration.o \
547+ plugin/tail_policy.o \
548+ plugin/item/item.o \
549+ \
550+ plugin/security/perm.o \
551+ plugin/space/bitmap.o \
552+ \
553+ plugin/disk_format/disk_format40.o \
554+ plugin/disk_format/disk_format.o
555+
556diff --git a/fs/reiser4/README b/fs/reiser4/README
557new file mode 100644
558index 0000000..4637f59
559--- /dev/null
560+++ b/fs/reiser4/README
561@@ -0,0 +1,125 @@
562+[LICENSING]
563+
564+Reiser4 is hereby licensed under the GNU General
565+Public License version 2.
566+
567+Source code files that contain the phrase "licensing governed by
568+reiser4/README" are "governed files" throughout this file. Governed
569+files are licensed under the GPL. The portions of them owned by Hans
570+Reiser, or authorized to be licensed by him, have been in the past,
571+and likely will be in the future, licensed to other parties under
572+other licenses. If you add your code to governed files, and don't
573+want it to be owned by Hans Reiser, put your copyright label on that
574+code so the poor blight and his customers can keep things straight.
575+All portions of governed files not labeled otherwise are owned by Hans
576+Reiser, and by adding your code to it, widely distributing it to
577+others or sending us a patch, and leaving the sentence in stating that
578+licensing is governed by the statement in this file, you accept this.
579+It will be a kindness if you identify whether Hans Reiser is allowed
580+to license code labeled as owned by you on your behalf other than
581+under the GPL, because he wants to know if it is okay to do so and put
582+a check in the mail to you (for non-trivial improvements) when he
583+makes his next sale. He makes no guarantees as to the amount if any,
584+though he feels motivated to motivate contributors, and you can surely
585+discuss this with him before or after contributing. You have the
586+right to decline to allow him to license your code contribution other
587+than under the GPL.
588+
589+Further licensing options are available for commercial and/or other
590+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
591+the GPL as not allowing those additional licensing options, you read
592+it wrongly, and Richard Stallman agrees with me, when carefully read
593+you can see that those restrictions on additional terms do not apply
594+to the owner of the copyright, and my interpretation of this shall
595+govern for this license.
596+
597+[END LICENSING]
598+
599+Reiser4 is a file system based on dancing tree algorithms, and is
600+described at http://www.namesys.com
601+
602+mkfs.reiser4 and other utilities are on our webpage or wherever your
603+Linux provider put them. You really want to be running the latest
604+version off the website if you use fsck.
605+
606+Yes, if you update your reiser4 kernel module you do have to
607+recompile your kernel, most of the time. The errors you get will be
 608+quite cryptic if you forget to do so.
609+
610+Hideous Commercial Pitch: Spread your development costs across other OS
611+vendors. Select from the best in the world, not the best in your
612+building, by buying from third party OS component suppliers. Leverage
613+the software component development power of the internet. Be the most
614+aggressive in taking advantage of the commercial possibilities of
615+decentralized internet development, and add value through your branded
616+integration that you sell as an operating system. Let your competitors
617+be the ones to compete against the entire internet by themselves. Be
618+hip, get with the new economic trend, before your competitors do. Send
619+email to reiser@namesys.com
620+
621+Hans Reiser was the primary architect of Reiser4, but a whole team
622+chipped their ideas in. He invested everything he had into Namesys
623+for 5.5 dark years of no money before Reiser3 finally started to work well
624+enough to bring in money. He owns the copyright.
625+
626+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
627+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
628+opinion, unique in its willingness to invest into things more
629+theoretical than the VC community can readily understand, and more
630+longterm than allows them to be sure that they will be the ones to
631+extract the economic benefits from. DARPA also integrated us into a
632+security community that transformed our security worldview.
633+
634+Vladimir Saveliev is our lead programmer, with us from the beginning,
635+and he worked long hours writing the cleanest code. This is why he is
636+now the lead programmer after years of commitment to our work. He
637+always made the effort to be the best he could be, and to make his
638+code the best that it could be. What resulted was quite remarkable. I
639+don't think that money can ever motivate someone to work the way he
640+did, he is one of the most selfless men I know.
641+
642+Alexander Lyamin was our sysadmin, and helped to educate us in
643+security issues. Moscow State University and IMT were very generous
644+in the internet access they provided us, and in lots of other little
645+ways that a generous institution can be.
646+
647+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
648+locking code, the block allocator, and finished the flushing code.
649+His code is always crystal clean and well structured.
650+
651+Nikita Danilov wrote the core of the balancing code, the core of the
652+plugins code, and the directory code. He worked a steady pace of long
653+hours that produced a whole lot of well abstracted code. He is our
654+senior computer scientist.
655+
656+Vladimir Demidov wrote the parser. Writing an in kernel parser is
657+something very few persons have the skills for, and it is thanks to
658+him that we can say that the parser is really not so big compared to
659+various bits of our other code, and making a parser work in the kernel
660+was not so complicated as everyone would imagine mainly because it was
661+him doing it...
662+
663+Joshua McDonald wrote the transaction manager, and the flush code.
664+The flush code unexpectedly turned out be extremely hairy for reasons
665+you can read about on our web page, and he did a great job on an
666+extremely difficult task.
667+
668+Nina Reiser handled our accounting, government relations, and much
669+more.
670+
671+Ramon Reiser developed our website.
672+
673+Beverly Palmer drew our graphics.
674+
675+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
676+and worked with Umka on developing libreiser4 and userspace plugins.
677+
678+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
679+userspace tools (reiser4progs).
680+
681+Oleg Drokin (aka Green) is the release manager who fixes everything.
682+It is so nice to have someone like that on the team. He (plus Chris
683+and Jeff) make it possible for the entire rest of the Namesys team to
684+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
685+is just amazing to watch his talent for spotting bugs in action.
686+
687diff --git a/fs/reiser4/as_ops.c b/fs/reiser4/as_ops.c
688new file mode 100644
689index 0000000..b4f3375
690--- /dev/null
691+++ b/fs/reiser4/as_ops.c
692@@ -0,0 +1,339 @@
693+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
694+
695+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
696+
697+#include "forward.h"
698+#include "debug.h"
699+#include "dformat.h"
700+#include "coord.h"
701+#include "plugin/item/item.h"
702+#include "plugin/file/file.h"
703+#include "plugin/security/perm.h"
704+#include "plugin/disk_format/disk_format.h"
705+#include "plugin/plugin.h"
706+#include "plugin/plugin_set.h"
707+#include "plugin/object.h"
708+#include "txnmgr.h"
709+#include "jnode.h"
710+#include "znode.h"
711+#include "block_alloc.h"
712+#include "tree.h"
713+#include "vfs_ops.h"
714+#include "inode.h"
715+#include "page_cache.h"
716+#include "ktxnmgrd.h"
717+#include "super.h"
718+#include "reiser4.h"
719+#include "entd.h"
720+
721+#include <linux/profile.h>
722+#include <linux/types.h>
723+#include <linux/mount.h>
724+#include <linux/vfs.h>
725+#include <linux/mm.h>
726+#include <linux/buffer_head.h>
727+#include <linux/dcache.h>
728+#include <linux/list.h>
729+#include <linux/pagemap.h>
730+#include <linux/slab.h>
731+#include <linux/seq_file.h>
732+#include <linux/init.h>
733+#include <linux/module.h>
734+#include <linux/writeback.h>
735+#include <linux/backing-dev.h>
736+#include <linux/quotaops.h>
737+#include <linux/security.h>
738+
739+/* address space operations */
740+
741+/**
742+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
743+ * @page: page to be dirtied
744+ *
745+ * Operation of struct address_space_operations. This implementation is used by
746+ * unix and cryptcompress file plugins.
747+ *
748+ * This is called when reiser4 page gets dirtied outside of reiser4, for
749+ * example, when dirty bit is moved from pte to physical page.
750+ *
751+ * Tags page in the mapping's page tree with special tag so that it is possible
752+ * to do all the reiser4 specific work wrt dirty pages (jnode creation,
753+ * capturing by an atom) later because it can not be done in the contexts where
754+ * set_page_dirty is called.
755+ */
756+int reiser4_set_page_dirty(struct page *page)
757+{
758+ /* this page can be unformatted only */
759+ assert("vs-1734", (page->mapping &&
760+ page->mapping->host &&
761+ reiser4_get_super_fake(page->mapping->host->i_sb) !=
762+ page->mapping->host
763+ && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
764+ page->mapping->host
765+ && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
766+ page->mapping->host));
767+
768+ if (!TestSetPageDirty(page)) {
769+ struct address_space *mapping = page->mapping;
770+
771+ if (mapping) {
772+ write_lock_irq(&mapping->tree_lock);
773+
774+ /* check for race with truncate */
775+ if (page->mapping) {
776+ assert("vs-1652", page->mapping == mapping);
777+ if (mapping_cap_account_dirty(mapping))
778+ inc_zone_page_state(page,
779+ NR_FILE_DIRTY);
780+ radix_tree_tag_set(&mapping->page_tree,
781+ page->index,
782+ PAGECACHE_TAG_REISER4_MOVED);
783+ }
784+ write_unlock_irq(&mapping->tree_lock);
785+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
786+ }
787+ }
788+ return 0;
789+}
790+
791+/* ->invalidatepage method for reiser4 */
792+
793+/*
794+ * this is called for each truncated page from
795+ * truncate_inode_pages()->truncate_{complete,partial}_page().
796+ *
797+ * At the moment of call, page is under lock, and outstanding io (if any) has
798+ * completed.
799+ */
800+
801+/**
802+ * reiser4_invalidatepage
803+ * @page: page to invalidate
804+ * @offset: starting offset for partial invalidation
805+ *
806+ */
807+void reiser4_invalidatepage(struct page *page, unsigned long offset)
808+{
809+ int ret = 0;
810+ reiser4_context *ctx;
811+ struct inode *inode;
812+ jnode *node;
813+
814+ /*
815+ * This is called to truncate file's page.
816+ *
817+ * Originally, reiser4 implemented truncate in a standard way
818+ * (vmtruncate() calls ->invalidatepage() on all truncated pages
819+ * first, then file system ->truncate() call-back is invoked).
820+ *
821+ * This lead to the problem when ->invalidatepage() was called on a
822+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
823+ * process. That is, truncate was bypassing transactions. To avoid
824+ * this, try_capture_page_to_invalidate() call was added here.
825+ *
826+ * After many troubles with vmtruncate() based truncate (including
827+ * races with flush, tail conversion, etc.) it was re-written in the
828+ * top-to-bottom style: items are killed in reiser4_cut_tree_object()
829+ * and pages belonging to extent are invalidated in kill_hook_extent().
830+ * So probably now additional call to capture is not needed here.
831+ */
832+
833+ assert("nikita-3137", PageLocked(page));
834+ assert("nikita-3138", !PageWriteback(page));
835+ inode = page->mapping->host;
836+
837+ /*
838+ * ->invalidatepage() should only be called for the unformatted
839+ * jnodes. Destruction of all other types of jnodes is performed
840+ * separately. But, during some corner cases (like handling errors
841+ * during mount) it is simpler to let ->invalidatepage to be called on
842+ * them. Check for this, and do nothing.
843+ */
844+ if (reiser4_get_super_fake(inode->i_sb) == inode)
845+ return;
846+ if (reiser4_get_cc_fake(inode->i_sb) == inode)
847+ return;
848+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
849+ return;
850+ assert("vs-1426", PagePrivate(page));
851+ assert("vs-1427",
852+ page->mapping == jnode_get_mapping(jnode_by_page(page)));
853+ assert("", jprivate(page) != NULL);
854+ assert("", ergo(inode_file_plugin(inode) !=
855+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
856+ offset == 0));
857+
858+ ctx = reiser4_init_context(inode->i_sb);
859+ if (IS_ERR(ctx))
860+ return;
861+
862+ node = jprivate(page);
863+ spin_lock_jnode(node);
864+ if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
865+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
 866+		/* there is no need to capture */
867+ jref(node);
868+ JF_SET(node, JNODE_HEARD_BANSHEE);
869+ page_clear_jnode(page, node);
870+ reiser4_uncapture_jnode(node);
871+ unhash_unformatted_jnode(node);
872+ jput(node);
873+ reiser4_exit_context(ctx);
874+ return;
875+ }
876+ spin_unlock_jnode(node);
877+
878+ /* capture page being truncated. */
879+ ret = try_capture_page_to_invalidate(page);
880+ if (ret != 0)
881+ warning("nikita-3141", "Cannot capture: %i", ret);
882+
883+ if (offset == 0) {
884+ /* remove jnode from transaction and detach it from page. */
885+ jref(node);
886+ JF_SET(node, JNODE_HEARD_BANSHEE);
887+ /* page cannot be detached from jnode concurrently, because it
888+ * is locked */
889+ reiser4_uncapture_page(page);
890+
891+ /* this detaches page from jnode, so that jdelete will not try
892+ * to lock page which is already locked */
893+ spin_lock_jnode(node);
894+ page_clear_jnode(page, node);
895+ spin_unlock_jnode(node);
896+ unhash_unformatted_jnode(node);
897+
898+ jput(node);
899+ }
900+
901+ reiser4_exit_context(ctx);
902+}
903+
904+/* help function called from reiser4_releasepage(). It returns true if jnode
905+ * can be detached from its page and page released. */
906+int jnode_is_releasable(jnode * node /* node to check */ )
907+{
908+ assert("nikita-2781", node != NULL);
909+ assert_spin_locked(&(node->guard));
910+ assert_spin_locked(&(node->load));
911+
912+ /* is some thread is currently using jnode page, later cannot be
913+ * detached */
914+ if (atomic_read(&node->d_count) != 0) {
915+ return 0;
916+ }
917+
918+ assert("vs-1214", !jnode_is_loaded(node));
919+
920+ /*
921+ * can only release page if real block number is assigned to it. Simple
922+ * check for ->atom wouldn't do, because it is possible for node to be
923+ * clean, not it atom yet, and still having fake block number. For
924+ * example, node just created in jinit_new().
925+ */
926+ if (reiser4_blocknr_is_fake(jnode_get_block(node)))
927+ return 0;
928+
929+ /*
930+ * pages prepared for write can not be released anyway, so avoid
931+ * detaching jnode from the page
932+ */
933+ if (JF_ISSET(node, JNODE_WRITE_PREPARED))
934+ return 0;
935+
936+ /*
937+ * dirty jnode cannot be released. It can however be submitted to disk
938+ * as part of early flushing, but only after getting flush-prepped.
939+ */
940+ if (JF_ISSET(node, JNODE_DIRTY))
941+ return 0;
942+
943+ /* overwrite set is only written by log writer. */
944+ if (JF_ISSET(node, JNODE_OVRWR))
945+ return 0;
946+
947+ /* jnode is already under writeback */
948+ if (JF_ISSET(node, JNODE_WRITEBACK))
949+ return 0;
950+
951+ /* don't flush bitmaps or journal records */
952+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
953+ return 0;
954+
955+ return 1;
956+}
957+
958+/*
959+ * ->releasepage method for reiser4
960+ *
961+ * This is called by VM scanner when it comes across clean page. What we have
962+ * to do here is to check whether page can really be released (freed that is)
963+ * and if so, detach jnode from it and remove page from the page cache.
964+ *
965+ * Check for releasability is done by releasable() function.
966+ */
967+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
968+{
969+ jnode *node;
970+
971+ assert("nikita-2257", PagePrivate(page));
972+ assert("nikita-2259", PageLocked(page));
973+ assert("nikita-2892", !PageWriteback(page));
974+ assert("nikita-3019", reiser4_schedulable());
975+
976+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
977+ is not clear what to do in this case. A lot of deadlocks seems be
978+ possible. */
979+ if (page_count(page) > 3)
980+ return 0;
981+
982+ node = jnode_by_page(page);
983+ assert("nikita-2258", node != NULL);
984+ assert("reiser4-4", page->mapping != NULL);
985+ assert("reiser4-5", page->mapping->host != NULL);
986+
987+ if (PageDirty(page))
988+ return 0;
989+
990+ /* extra page reference is used by reiser4 to protect
991+ * jnode<->page link from this ->releasepage(). */
992+ if (page_count(page) > 3)
993+ return 0;
994+
995+ /* releasable() needs jnode lock, because it looks at the jnode fields
996+ * and we need jload_lock here to avoid races with jload(). */
997+ spin_lock_jnode(node);
998+ spin_lock(&(node->load));
999+ if (jnode_is_releasable(node)) {
1000+ struct address_space *mapping;
1001+
1002+ mapping = page->mapping;
1003+ jref(node);
1004+ /* there is no need to synchronize against
1005+ * jnode_extent_write() here, because pages seen by
1006+ * jnode_extent_write() are !releasable(). */
1007+ page_clear_jnode(page, node);
1008+ spin_unlock(&(node->load));
1009+ spin_unlock_jnode(node);
1010+
1011+ /* we are under memory pressure so release jnode also. */
1012+ jput(node);
1013+
1014+ return 1;
1015+ } else {
1016+ spin_unlock(&(node->load));
1017+ spin_unlock_jnode(node);
1018+ assert("nikita-3020", reiser4_schedulable());
1019+ return 0;
1020+ }
1021+}
1022+
1023+/* Make Linus happy.
1024+ Local variables:
1025+ c-indentation-style: "K&R"
1026+ mode-name: "LC"
1027+ c-basic-offset: 8
1028+ tab-width: 8
1029+ fill-column: 120
1030+ End:
1031+*/
1032diff --git a/fs/reiser4/block_alloc.c b/fs/reiser4/block_alloc.c
1033new file mode 100644
1034index 0000000..c405c5f
1035--- /dev/null
1036+++ b/fs/reiser4/block_alloc.c
1037@@ -0,0 +1,1137 @@
1038+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1039+
1040+#include "debug.h"
1041+#include "dformat.h"
1042+#include "plugin/plugin.h"
1043+#include "txnmgr.h"
1044+#include "znode.h"
1045+#include "block_alloc.h"
1046+#include "tree.h"
1047+#include "super.h"
1048+
1049+#include <linux/types.h> /* for __u?? */
1050+#include <linux/fs.h> /* for struct super_block */
1051+#include <linux/spinlock.h>
1052+
1053+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
1054+
1055+/* We need to be able to reserve enough disk space to ensure that an atomic
1056+ operation will have enough disk space to flush (see flush.c and
1057+ http://namesys.com/v4/v4.html) and commit it once it is started.
1058+
1059+ In our design a call for reserving disk space may fail but not an actual
1060+ block allocation.
1061+
1062+ All free blocks, already allocated blocks, and all kinds of reserved blocks
1063+ are counted in different per-fs block counters.
1064+
1065+ A reiser4 super block's set of block counters currently is:
1066+
1067+ free -- free blocks,
1068+ used -- already allocated blocks,
1069+
1070+ grabbed -- initially reserved for performing an fs operation, those blocks
1071+ are taken from free blocks, then grabbed disk space leaks from grabbed
1072+ blocks counter to other counters like "fake allocated", "flush
1073+ reserved", "used", the rest of not used grabbed space is returned to
1074+ free space at the end of fs operation;
1075+
1076+ fake allocated -- counts all nodes without real disk block numbers assigned,
1077+ we have separate accounting for formatted and unformatted
1078+ nodes (for easier debugging);
1079+
1080+ flush reserved -- disk space needed for flushing and committing an atom.
1081+ Each dirty already allocated block could be written as a
1082+ part of atom's overwrite set or as a part of atom's
1083+ relocate set. In both case one additional block is needed,
1084+ it is used as a wandered block if we do overwrite or as a
1085+ new location for a relocated block.
1086+
1087+ In addition, blocks in some states are counted on per-thread and per-atom
1088+ basis. A reiser4 context has a counter of blocks grabbed by this transaction
1089+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
1090+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
1091+ blocks, which are reserved for flush processing and atom commit. */
1092+
1093+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
1094+ number of blocks to grab for most expensive case of balancing when the leaf
1095+ node we insert new item to gets split and new leaf node is allocated.
1096+
1097+ So, we need to grab blocks for
1098+
1099+ 1) one block for possible dirtying the node we insert an item to. That block
1100+ would be used for node relocation at flush time or for allocating of a
1101+ wandered one, it depends what will be a result (what set, relocate or
1102+ overwrite the node gets assigned to) of the node processing by the flush
1103+ algorithm.
1104+
1105+ 2) one block for either allocating a new node, or dirtying of right or left
1106+ clean neighbor, only one case may happen.
1107+
1108+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
1109+ node, and creation of new node. have I forgotten something? email me.
1110+
1111+ These grabbed blocks are counted in both reiser4 context "grabbed blocks"
1112+ counter and in the fs-wide one (both ctx->grabbed_blocks and
1113+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
1114+ decremented by 2.
1115+
1116+ Suppose both two blocks were spent for dirtying of an already allocated clean
1117+ node (one block went from "grabbed" to "flush reserved") and for new block
1118+ allocating (one block went from "grabbed" to "fake allocated formatted").
1119+
1120+ Inserting of a child pointer to the parent node caused parent node to be
1121+ split, the balancing code takes care about this grabbing necessary space
1122+ immediately by calling reiser4_grab with BA_RESERVED flag set which means
1123+ "can use the 5% reserved disk space".
1124+
1125+ At this moment insertion completes and grabbed blocks (if they were not used)
1126+ should be returned to the free space counter.
1127+
1128+ However the atom life-cycle is not completed. The atom had one "flush
1129+ reserved" block added by our insertion and the new fake allocated node is
1130+ counted as a "fake allocated formatted" one. The atom has to be fully
1131+ processed by flush before commit. Suppose that the flush moved the first,
1132+ already allocated node to the atom's overwrite list, the new fake allocated
1133+ node, obviously, went into the atom relocate set. The reiser4 flush
1134+ allocates the new node using one unit from "fake allocated formatted"
1135+ counter, the log writer uses one from "flush reserved" for wandered block
1136+ allocation.
1137+
1138+ And, it is not the end. When the wandered block is deallocated after the
1139+ atom gets fully played (see wander.c for term description), the disk space
1140+ occupied for it is returned to free blocks. */
1141+
1142+/* BLOCK NUMBERS */
1143+
1144+/* Any reiser4 node has a block number assigned to it. We use these numbers for
1145+ indexing in hash tables, so if a block has not yet been assigned a location
1146+ on disk we need to give it a temporary fake block number.
1147+
1148+ Current implementation of reiser4 uses 64-bit integers for block numbers. We
1149+ use highest bit in 64-bit block number to distinguish fake and real block
1150+ numbers. So, only 63 bits may be used to addressing of real device
1151+ blocks. That "fake" block numbers space is divided into subspaces of fake
1152+ block numbers for data blocks and for shadow (working) bitmap blocks.
1153+
1154+ Fake block numbers for data blocks are generated by a cyclic counter, which
1155+ gets incremented after each real block allocation. We assume that it is
1156+ impossible to overload this counter during one transaction life. */
1157+
1158+/* Initialize a blocknr hint. */
1159+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
1160+{
1161+ memset(hint, 0, sizeof(reiser4_blocknr_hint));
1162+}
1163+
1164+/* Release any resources of a blocknr hint. */
1165+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
1166+{
1167+ /* No resources should be freed in current blocknr_hint implementation. */
1168+}
1169+
1170+/* see above for explanation of fake block number. */
1171+/* Audited by: green(2002.06.11) */
1172+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
1173+{
1174+ /* The reason for not simply returning result of '&' operation is that
1175+ while return value is (possibly 32bit) int, the reiser4_block_nr is
1176+ at least 64 bits long, and high bit (which is the only possible
1177+ non zero bit after the masking) would be stripped off */
1178+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
1179+}
1180+
1181+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
1182+ arithmetic. Mostly, they are isolated to not to code same assertions in
1183+ several places. */
1184+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
1185+{
1186+ BUG_ON(ctx->grabbed_blocks < count);
1187+ assert("zam-527", ctx->grabbed_blocks >= count);
1188+ ctx->grabbed_blocks -= count;
1189+}
1190+
1191+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
1192+{
1193+ ctx->grabbed_blocks += count;
1194+}
1195+
1196+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
1197+{
1198+ assert("zam-525", sbinfo->blocks_grabbed >= count);
1199+ sbinfo->blocks_grabbed -= count;
1200+}
1201+
1202+/* Decrease the counter of block reserved for flush in super block. */
1203+static void
1204+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1205+{
1206+ assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
1207+ sbinfo->blocks_flush_reserved -= count;
1208+}
1209+
1210+static void
1211+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1212+ reiser4_ba_flags_t flags)
1213+{
1214+ if (flags & BA_FORMATTED) {
1215+ assert("zam-806", sbinfo->blocks_fake_allocated >= count);
1216+ sbinfo->blocks_fake_allocated -= count;
1217+ } else {
1218+ assert("zam-528",
1219+ sbinfo->blocks_fake_allocated_unformatted >= count);
1220+ sbinfo->blocks_fake_allocated_unformatted -= count;
1221+ }
1222+}
1223+
1224+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
1225+{
1226+ assert("zam-530",
1227+ sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
1228+ sbinfo->blocks_used -= count;
1229+}
1230+
1231+static void
1232+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1233+{
1234+ assert("edward-501", sbinfo->blocks_clustered >= count);
1235+ sbinfo->blocks_clustered -= count;
1236+}
1237+
1238+/* Increase the counter of block reserved for flush in atom. */
1239+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1240+{
1241+ assert("zam-772", atom != NULL);
1242+ assert_spin_locked(&(atom->alock));
1243+ atom->flush_reserved += count;
1244+}
1245+
1246+/* Decrease the counter of block reserved for flush in atom. */
1247+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1248+{
1249+ assert("zam-774", atom != NULL);
1250+ assert_spin_locked(&(atom->alock));
1251+ assert("nikita-2790", atom->flush_reserved >= count);
1252+ atom->flush_reserved -= count;
1253+}
1254+
1255+/* super block has 6 counters: free, used, grabbed, fake allocated
1256+ (formatted and unformatted) and flush reserved. Their sum must be
1257+ number of blocks on a device. This function checks this */
1258+int reiser4_check_block_counters(const struct super_block *super)
1259+{
1260+ __u64 sum;
1261+
1262+ sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
1263+ reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
1264+ reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
1265+ reiser4_clustered_blocks(super);
1266+ if (reiser4_block_count(super) != sum) {
1267+ printk("super block counters: "
1268+ "used %llu, free %llu, "
1269+ "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
1270+ "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
1271+ (unsigned long long)reiser4_data_blocks(super),
1272+ (unsigned long long)reiser4_free_blocks(super),
1273+ (unsigned long long)reiser4_grabbed_blocks(super),
1274+ (unsigned long long)reiser4_fake_allocated(super),
1275+ (unsigned long long)
1276+ reiser4_fake_allocated_unformatted(super),
1277+ (unsigned long long)reiser4_flush_reserved(super),
1278+ (unsigned long long)reiser4_clustered_blocks(super),
1279+ (unsigned long long)sum,
1280+ (unsigned long long)reiser4_block_count(super));
1281+ return 0;
1282+ }
1283+ return 1;
1284+}
1285+
1286+/* Adjust "working" free blocks counter for number of blocks we are going to
1287+ allocate. Record number of grabbed blocks in fs-wide and per-thread
1288+ counters. This function should be called before bitmap scanning or
1289+ allocating fake block numbers
1290+
1291+ @super -- pointer to reiser4 super block;
1292+ @count -- number of blocks we reserve;
1293+
1294+ @return -- 0 if success, -ENOSPC, if all
1295+ free blocks are preserved or already allocated.
1296+*/
1297+
1298+static int
1299+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
1300+{
1301+ __u64 free_blocks;
1302+ int ret = 0, use_reserved = flags & BA_RESERVED;
1303+ reiser4_super_info_data *sbinfo;
1304+
1305+ assert("vs-1276", ctx == get_current_context());
1306+
1307+ /* Do not grab anything on ro-mounted fs. */
1308+ if (rofs_super(ctx->super)) {
1309+ ctx->grab_enabled = 0;
1310+ return 0;
1311+ }
1312+
1313+ sbinfo = get_super_private(ctx->super);
1314+
1315+ spin_lock_reiser4_super(sbinfo);
1316+
1317+ free_blocks = sbinfo->blocks_free;
1318+
1319+ if ((use_reserved && free_blocks < count) ||
1320+ (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
1321+ ret = RETERR(-ENOSPC);
1322+ goto unlock_and_ret;
1323+ }
1324+
1325+ add_to_ctx_grabbed(ctx, count);
1326+
1327+ sbinfo->blocks_grabbed += count;
1328+ sbinfo->blocks_free -= count;
1329+
1330+#if REISER4_DEBUG
1331+ if (ctx->grabbed_initially == 0)
1332+ ctx->grabbed_initially = count;
1333+#endif
1334+
1335+ assert("nikita-2986", reiser4_check_block_counters(ctx->super));
1336+
1337+ /* disable grab space in current context */
1338+ ctx->grab_enabled = 0;
1339+
1340+ unlock_and_ret:
1341+ spin_unlock_reiser4_super(sbinfo);
1342+
1343+ return ret;
1344+}
1345+
1346+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
1347+{
1348+ int ret;
1349+ reiser4_context *ctx;
1350+
1351+ assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
1352+ lock_stack_isclean(get_current_lock_stack
1353+ ())));
1354+ ctx = get_current_context();
1355+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
1356+ return 0;
1357+ }
1358+
1359+ ret = reiser4_grab(ctx, count, flags);
1360+ if (ret == -ENOSPC) {
1361+
1362+ /* Trying to commit the all transactions if BA_CAN_COMMIT flag present */
1363+ if (flags & BA_CAN_COMMIT) {
1364+ txnmgr_force_commit_all(ctx->super, 0);
1365+ ctx->grab_enabled = 1;
1366+ ret = reiser4_grab(ctx, count, flags);
1367+ }
1368+ }
1369+ /*
1370+ * allocation from reserved pool cannot fail. This is severe error.
1371+ */
1372+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
1373+ return ret;
1374+}
1375+
1376+/*
1377+ * SPACE RESERVED FOR UNLINK/TRUNCATE
1378+ *
1379+ * Unlink and truncate require space in transaction (to update stat data, at
1380+ * least). But we don't want rm(1) to fail with "No space on device" error.
1381+ *
1382+ * Solution is to reserve 5% of disk space for truncates and
1383+ * unlinks. Specifically, normal space grabbing requests don't grab space from
1384+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
1385+ * drain it. Per super block delete mutex is used to allow only one
1386+ * thread at a time to grab from reserved area.
1387+ *
1388+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
1389+ * flag.
1390+ *
1391+ */
1392+
1393+int reiser4_grab_reserved(struct super_block *super,
1394+ __u64 count, reiser4_ba_flags_t flags)
1395+{
1396+ reiser4_super_info_data *sbinfo = get_super_private(super);
1397+
1398+ assert("nikita-3175", flags & BA_CAN_COMMIT);
1399+
1400+ /* Check the delete mutex already taken by us, we assume that
1401+ * reading of machine word is atomic. */
1402+ if (sbinfo->delete_mutex_owner == current) {
1403+ if (reiser4_grab_space
1404+ (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
1405+ warning("zam-1003",
1406+ "nested call of grab_reserved fails count=(%llu)",
1407+ (unsigned long long)count);
1408+ reiser4_release_reserved(super);
1409+ return RETERR(-ENOSPC);
1410+ }
1411+ return 0;
1412+ }
1413+
1414+ if (reiser4_grab_space(count, flags)) {
1415+ mutex_lock(&sbinfo->delete_mutex);
1416+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
1417+ sbinfo->delete_mutex_owner = current;
1418+
1419+ if (reiser4_grab_space(count, flags | BA_RESERVED)) {
1420+ warning("zam-833",
1421+ "reserved space is not enough (%llu)",
1422+ (unsigned long long)count);
1423+ reiser4_release_reserved(super);
1424+ return RETERR(-ENOSPC);
1425+ }
1426+ }
1427+ return 0;
1428+}
1429+
1430+void reiser4_release_reserved(struct super_block *super)
1431+{
1432+ reiser4_super_info_data *info;
1433+
1434+ info = get_super_private(super);
1435+ if (info->delete_mutex_owner == current) {
1436+ info->delete_mutex_owner = NULL;
1437+ mutex_unlock(&info->delete_mutex);
1438+ }
1439+}
1440+
1441+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
1442+{
1443+ reiser4_context *ctx;
1444+ reiser4_super_info_data *sbinfo;
1445+
1446+ ctx = get_current_context();
1447+ sub_from_ctx_grabbed(ctx, count);
1448+
1449+ sbinfo = get_super_private(ctx->super);
1450+ spin_lock_reiser4_super(sbinfo);
1451+
1452+ sub_from_sb_grabbed(sbinfo, count);
1453+ /* return sbinfo locked */
1454+ return sbinfo;
1455+}
1456+
1457+/* is called after @count fake block numbers are allocated and pointer to
1458+ those blocks are inserted into tree. */
1459+static void grabbed2fake_allocated_formatted(void)
1460+{
1461+ reiser4_super_info_data *sbinfo;
1462+
1463+ sbinfo = grabbed2fake_allocated_head(1);
1464+ sbinfo->blocks_fake_allocated++;
1465+
1466+ assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
1467+
1468+ spin_unlock_reiser4_super(sbinfo);
1469+}
1470+
1471+/**
1472+ * grabbed2fake_allocated_unformatted
1473+ * @count:
1474+ *
1475+ */
1476+static void grabbed2fake_allocated_unformatted(int count)
1477+{
1478+ reiser4_super_info_data *sbinfo;
1479+
1480+ sbinfo = grabbed2fake_allocated_head(count);
1481+ sbinfo->blocks_fake_allocated_unformatted += count;
1482+
1483+ assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
1484+
1485+ spin_unlock_reiser4_super(sbinfo);
1486+}
1487+
1488+void grabbed2cluster_reserved(int count)
1489+{
1490+ reiser4_context *ctx;
1491+ reiser4_super_info_data *sbinfo;
1492+
1493+ ctx = get_current_context();
1494+ sub_from_ctx_grabbed(ctx, count);
1495+
1496+ sbinfo = get_super_private(ctx->super);
1497+ spin_lock_reiser4_super(sbinfo);
1498+
1499+ sub_from_sb_grabbed(sbinfo, count);
1500+ sbinfo->blocks_clustered += count;
1501+
1502+ assert("edward-504", reiser4_check_block_counters(ctx->super));
1503+
1504+ spin_unlock_reiser4_super(sbinfo);
1505+}
1506+
1507+void cluster_reserved2grabbed(int count)
1508+{
1509+ reiser4_context *ctx;
1510+ reiser4_super_info_data *sbinfo;
1511+
1512+ ctx = get_current_context();
1513+
1514+ sbinfo = get_super_private(ctx->super);
1515+ spin_lock_reiser4_super(sbinfo);
1516+
1517+ sub_from_cluster_reserved(sbinfo, count);
1518+ sbinfo->blocks_grabbed += count;
1519+
1520+ assert("edward-505", reiser4_check_block_counters(ctx->super));
1521+
1522+ spin_unlock_reiser4_super(sbinfo);
1523+ add_to_ctx_grabbed(ctx, count);
1524+}
1525+
1526+void cluster_reserved2free(int count)
1527+{
1528+ reiser4_context *ctx;
1529+ reiser4_super_info_data *sbinfo;
1530+
1531+ ctx = get_current_context();
1532+ sbinfo = get_super_private(ctx->super);
1533+
1534+ cluster_reserved2grabbed(count);
1535+ grabbed2free(ctx, sbinfo, count);
1536+}
1537+
1538+static DEFINE_SPINLOCK(fake_lock);
1539+static reiser4_block_nr fake_gen = 0;
1540+
1541+/**
1542+ * assign_fake_blocknr
1543+ * @blocknr:
1544+ * @count:
1545+ *
1546+ * Obtain a fake block number for new node which will be used to refer to
1547+ * this newly allocated node until real allocation is done.
1548+ */
1549+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1550+{
1551+ spin_lock(&fake_lock);
1552+ *blocknr = fake_gen;
1553+ fake_gen += count;
1554+ spin_unlock(&fake_lock);
1555+
1556+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1557+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1558+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1559+ assert("zam-394", zlook(current_tree, blocknr) == NULL);
1560+}
1561+
1562+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1563+{
1564+ assign_fake_blocknr(blocknr, 1);
1565+ grabbed2fake_allocated_formatted();
1566+ return 0;
1567+}
1568+
1569+/**
1570+ * fake_blocknrs_unformatted
1571+ * @count: number of fake numbers to get
1572+ *
1573+ * Allocates @count fake block numbers which will be assigned to jnodes
1574+ */
1575+reiser4_block_nr fake_blocknr_unformatted(int count)
1576+{
1577+ reiser4_block_nr blocknr;
1578+
1579+ assign_fake_blocknr(&blocknr, count);
1580+ grabbed2fake_allocated_unformatted(count);
1581+
1582+ return blocknr;
1583+}
1584+
1585+/* adjust sb block counters, if real (on-disk) block allocation immediately
1586+ follows grabbing of free disk space. */
1587+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1588+ __u64 count)
1589+{
1590+ sub_from_ctx_grabbed(ctx, count);
1591+
1592+ spin_lock_reiser4_super(sbinfo);
1593+
1594+ sub_from_sb_grabbed(sbinfo, count);
1595+ sbinfo->blocks_used += count;
1596+
1597+ assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1598+
1599+ spin_unlock_reiser4_super(sbinfo);
1600+}
1601+
1602+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1603+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1604+ reiser4_ba_flags_t flags)
1605+{
1606+ spin_lock_reiser4_super(sbinfo);
1607+
1608+ sub_from_sb_fake_allocated(sbinfo, count, flags);
1609+ sbinfo->blocks_used += count;
1610+
1611+ assert("nikita-2680",
1612+ reiser4_check_block_counters(reiser4_get_current_sb()));
1613+
1614+ spin_unlock_reiser4_super(sbinfo);
1615+}
1616+
1617+static void flush_reserved2used(txn_atom * atom, __u64 count)
1618+{
1619+ reiser4_super_info_data *sbinfo;
1620+
1621+ assert("zam-787", atom != NULL);
1622+ assert_spin_locked(&(atom->alock));
1623+
1624+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1625+
1626+ sbinfo = get_current_super_private();
1627+ spin_lock_reiser4_super(sbinfo);
1628+
1629+ sub_from_sb_flush_reserved(sbinfo, count);
1630+ sbinfo->blocks_used += count;
1631+
1632+ assert("zam-789",
1633+ reiser4_check_block_counters(reiser4_get_current_sb()));
1634+
1635+ spin_unlock_reiser4_super(sbinfo);
1636+}
1637+
1638+/* update the per fs blocknr hint default value. */
1639+void
1640+update_blocknr_hint_default(const struct super_block *s,
1641+ const reiser4_block_nr * block)
1642+{
1643+ reiser4_super_info_data *sbinfo = get_super_private(s);
1644+
1645+ assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1646+
1647+ spin_lock_reiser4_super(sbinfo);
1648+ if (*block < sbinfo->block_count) {
1649+ sbinfo->blocknr_hint_default = *block;
1650+ } else {
1651+ warning("zam-676",
1652+ "block number %llu is too large to be used in a blocknr hint\n",
1653+ (unsigned long long)*block);
1654+ dump_stack();
1655+ DEBUGON(1);
1656+ }
1657+ spin_unlock_reiser4_super(sbinfo);
1658+}
1659+
1660+/* get current value of the default blocknr hint. */
1661+void get_blocknr_hint_default(reiser4_block_nr * result)
1662+{
1663+ reiser4_super_info_data *sbinfo = get_current_super_private();
1664+
1665+ spin_lock_reiser4_super(sbinfo);
1666+ *result = sbinfo->blocknr_hint_default;
1667+ assert("zam-677", *result < sbinfo->block_count);
1668+ spin_unlock_reiser4_super(sbinfo);
1669+}
1670+
1671+/* Allocate "real" disk blocks by calling a proper space allocation plugin
1672+ * method. Blocks are allocated in one contiguous disk region. The plugin
1673+ * independent part accounts blocks by subtracting allocated amount from grabbed
1674+ * or fake block counter and add the same amount to the counter of allocated
1675+ * blocks.
1676+ *
1677+ * @hint -- a reiser4 blocknr hint object which contains further block
1678+ * allocation hints and parameters (search start, a stage of block
1679+ * which will be mapped to disk, etc.),
1680+ * @blk -- an out parameter for the beginning of the allocated region,
1681+ * @len -- in/out parameter, it should contain the maximum number of allocated
1682+ * blocks, after block allocation completes, it contains the length of
1683+ * allocated disk region.
1684+ * @flags -- see reiser4_ba_flags_t description.
1685+ *
1686+ * @return -- 0 if success, error code otherwise.
1687+ */
1688+int
1689+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1690+ reiser4_block_nr * len, reiser4_ba_flags_t flags)
1691+{
1692+ __u64 needed = *len;
1693+ reiser4_context *ctx;
1694+ reiser4_super_info_data *sbinfo;
1695+ int ret;
1696+
1697+ assert("zam-986", hint != NULL);
1698+
1699+ ctx = get_current_context();
1700+ sbinfo = get_super_private(ctx->super);
1701+
1702+ /* For write-optimized data we use default search start value, which is
1703+ * close to last write location. */
1704+ if (flags & BA_USE_DEFAULT_SEARCH_START) {
1705+ get_blocknr_hint_default(&hint->blk);
1706+ }
1707+
1708+ /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1709+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1710+ if (hint->block_stage == BLOCK_NOT_COUNTED) {
1711+ ret = reiser4_grab_space_force(*len, flags);
1712+ if (ret != 0)
1713+ return ret;
1714+ }
1715+
1716+ ret =
1717+ sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1718+ hint, (int)needed, blk, len);
1719+
1720+ if (!ret) {
1721+ assert("zam-680", *blk < reiser4_block_count(ctx->super));
1722+ assert("zam-681",
1723+ *blk + *len <= reiser4_block_count(ctx->super));
1724+
1725+ if (flags & BA_PERMANENT) {
1726+ /* we assume that current atom exists at this moment */
1727+ txn_atom *atom = get_current_atom_locked();
1728+ atom->nr_blocks_allocated += *len;
1729+ spin_unlock_atom(atom);
1730+ }
1731+
1732+ switch (hint->block_stage) {
1733+ case BLOCK_NOT_COUNTED:
1734+ case BLOCK_GRABBED:
1735+ grabbed2used(ctx, sbinfo, *len);
1736+ break;
1737+ case BLOCK_UNALLOCATED:
1738+ fake_allocated2used(sbinfo, *len, flags);
1739+ break;
1740+ case BLOCK_FLUSH_RESERVED:
1741+ {
1742+ txn_atom *atom = get_current_atom_locked();
1743+ flush_reserved2used(atom, *len);
1744+ spin_unlock_atom(atom);
1745+ }
1746+ break;
1747+ default:
1748+ impossible("zam-531", "wrong block stage");
1749+ }
1750+ } else {
1751+ assert("zam-821",
1752+ ergo(hint->max_dist == 0
1753+ && !hint->backward, ret != -ENOSPC));
1754+ if (hint->block_stage == BLOCK_NOT_COUNTED)
1755+ grabbed2free(ctx, sbinfo, needed);
1756+ }
1757+
1758+ return ret;
1759+}
1760+
1761+/* used -> fake_allocated -> grabbed -> free */
1762+
1763+/* adjust sb block counters when @count unallocated blocks get unmapped from
1764+ disk */
1765+static void
1766+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1767+ int formatted)
1768+{
1769+ spin_lock_reiser4_super(sbinfo);
1770+
1771+ if (formatted)
1772+ sbinfo->blocks_fake_allocated += count;
1773+ else
1774+ sbinfo->blocks_fake_allocated_unformatted += count;
1775+
1776+ sub_from_sb_used(sbinfo, count);
1777+
1778+ assert("nikita-2681",
1779+ reiser4_check_block_counters(reiser4_get_current_sb()));
1780+
1781+ spin_unlock_reiser4_super(sbinfo);
1782+}
1783+
1784+static void
1785+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1786+ __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1787+{
1788+ assert("nikita-2791", atom != NULL);
1789+ assert_spin_locked(&(atom->alock));
1790+
1791+ add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1792+
1793+ spin_lock_reiser4_super(sbinfo);
1794+
1795+ sbinfo->blocks_flush_reserved += count;
1796+ /*add_to_sb_flush_reserved(sbinfo, count); */
1797+ sub_from_sb_used(sbinfo, count);
1798+
1799+ assert("nikita-2681",
1800+ reiser4_check_block_counters(reiser4_get_current_sb()));
1801+
1802+ spin_unlock_reiser4_super(sbinfo);
1803+}
1804+
1805+/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
1806+static void
1807+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1808+ __u64 count, reiser4_ba_flags_t flags)
1809+{
1810+ add_to_ctx_grabbed(ctx, count);
1811+
1812+ spin_lock_reiser4_super(sbinfo);
1813+
1814+ assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1815+
1816+ sbinfo->blocks_grabbed += count;
1817+ sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1818+
1819+ assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1820+
1821+ spin_unlock_reiser4_super(sbinfo);
1822+}
1823+
1824+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1825+{
1826+ reiser4_context *ctx;
1827+ reiser4_super_info_data *sbinfo;
1828+
1829+ ctx = get_current_context();
1830+ sbinfo = get_super_private(ctx->super);
1831+
1832+ fake_allocated2grabbed(ctx, sbinfo, count, flags);
1833+ grabbed2free(ctx, sbinfo, count);
1834+}
1835+
1836+void grabbed2free_mark(__u64 mark)
1837+{
1838+ reiser4_context *ctx;
1839+ reiser4_super_info_data *sbinfo;
1840+
1841+ ctx = get_current_context();
1842+ sbinfo = get_super_private(ctx->super);
1843+
1844+ assert("nikita-3007", (__s64) mark >= 0);
1845+ assert("nikita-3006", ctx->grabbed_blocks >= mark);
1846+ grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1847+}
1848+
1849+/**
1850+ * grabbed2free - adjust grabbed and free block counters
1851+ * @ctx: context to update grabbed block counter of
1852+ * @sbinfo: super block to update grabbed and free block counters of
1853+ * @count: number of blocks to adjust counters by
1854+ *
1855+ * Decreases context's and per filesystem's counters of grabbed
1856+ * blocks. Increases per filesystem's counter of free blocks.
1857+ */
1858+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1859+ __u64 count)
1860+{
1861+ sub_from_ctx_grabbed(ctx, count);
1862+
1863+ spin_lock_reiser4_super(sbinfo);
1864+
1865+ sub_from_sb_grabbed(sbinfo, count);
1866+ sbinfo->blocks_free += count;
1867+ assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1868+
1869+ spin_unlock_reiser4_super(sbinfo);
1870+}
1871+
1872+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1873+{
1874+ reiser4_context *ctx;
1875+ reiser4_super_info_data *sbinfo;
1876+
1877+ assert("vs-1095", atom);
1878+
1879+ ctx = get_current_context();
1880+ sbinfo = get_super_private(ctx->super);
1881+
1882+ sub_from_ctx_grabbed(ctx, count);
1883+
1884+ add_to_atom_flush_reserved_nolock(atom, count);
1885+
1886+ spin_lock_reiser4_super(sbinfo);
1887+
1888+ sbinfo->blocks_flush_reserved += count;
1889+ sub_from_sb_grabbed(sbinfo, count);
1890+
1891+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
1892+
1893+ spin_unlock_reiser4_super(sbinfo);
1894+}
1895+
1896+void grabbed2flush_reserved(__u64 count)
1897+{
1898+ txn_atom *atom = get_current_atom_locked();
1899+
1900+ grabbed2flush_reserved_nolock(atom, count);
1901+
1902+ spin_unlock_atom(atom);
1903+}
1904+
1905+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1906+{
1907+ reiser4_context *ctx;
1908+ reiser4_super_info_data *sbinfo;
1909+
1910+ assert("nikita-2788", atom != NULL);
1911+ assert_spin_locked(&(atom->alock));
1912+
1913+ ctx = get_current_context();
1914+ sbinfo = get_super_private(ctx->super);
1915+
1916+ add_to_ctx_grabbed(ctx, count);
1917+
1918+ sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1919+
1920+ spin_lock_reiser4_super(sbinfo);
1921+
1922+ sbinfo->blocks_grabbed += count;
1923+ sub_from_sb_flush_reserved(sbinfo, count);
1924+
1925+ assert("vpf-292", reiser4_check_block_counters(ctx->super));
1926+
1927+ spin_unlock_reiser4_super(sbinfo);
1928+}
1929+
1930+/**
1931+ * all_grabbed2free - releases all blocks grabbed in context
1932+ *
1933+ * Decreases context's and super block's grabbed block counters by number of
1934+ * blocks grabbed by current context and increases super block's free block
1935+ * counter correspondingly.
1936+ */
1937+void all_grabbed2free(void)
1938+{
1939+ reiser4_context *ctx = get_current_context();
1940+
1941+ grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1942+}
1943+
1944+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1945+ after freeing, @count blocks become "grabbed". */
1946+static void
1947+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1948+ __u64 count)
1949+{
1950+ add_to_ctx_grabbed(ctx, count);
1951+
1952+ spin_lock_reiser4_super(sbinfo);
1953+
1954+ sbinfo->blocks_grabbed += count;
1955+ sub_from_sb_used(sbinfo, count);
1956+
1957+ assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1958+
1959+ spin_unlock_reiser4_super(sbinfo);
1960+}
1961+
1962+/* this used to be done through used2grabbed and grabbed2free*/
1963+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1964+{
1965+ spin_lock_reiser4_super(sbinfo);
1966+
1967+ sbinfo->blocks_free += count;
1968+ sub_from_sb_used(sbinfo, count);
1969+
1970+ assert("nikita-2685",
1971+ reiser4_check_block_counters(reiser4_get_current_sb()));
1972+
1973+ spin_unlock_reiser4_super(sbinfo);
1974+}
1975+
1976+#if REISER4_DEBUG
1977+
1978+/* check "allocated" state of given block range */
1979+static void
1980+reiser4_check_blocks(const reiser4_block_nr * start,
1981+ const reiser4_block_nr * len, int desired)
1982+{
1983+ sa_check_blocks(start, len, desired);
1984+}
1985+
1986+/* check "allocated" state of given block */
1987+void reiser4_check_block(const reiser4_block_nr * block, int desired)
1988+{
1989+ const reiser4_block_nr one = 1;
1990+
1991+ reiser4_check_blocks(block, &one, desired);
1992+}
1993+
1994+#endif
1995+
1996+/* Blocks deallocation function may do an actual deallocation through space
1997+ plugin allocation or store deleted block numbers in atom's delete_set data
1998+ structure, depending on the @defer parameter. */
1999+
2000+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
2001+ will be deleted from WORKING bitmap. They might be just unmapped from disk, or
2002+ freed but disk space is still grabbed by current thread, or these blocks must
2003+ not be counted in any reiser4 sb block counters, see block_stage_t comment */
2004+
2005+/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
2006+ distinguish blocks allocated for unformatted and formatted nodes */
2007+
2008+int
2009+reiser4_dealloc_blocks(const reiser4_block_nr * start,
2010+ const reiser4_block_nr * len,
2011+ block_stage_t target_stage, reiser4_ba_flags_t flags)
2012+{
2013+ txn_atom *atom = NULL;
2014+ int ret;
2015+ reiser4_context *ctx;
2016+ reiser4_super_info_data *sbinfo;
2017+
2018+ ctx = get_current_context();
2019+ sbinfo = get_super_private(ctx->super);
2020+
2021+ if (REISER4_DEBUG) {
2022+ assert("zam-431", *len != 0);
2023+ assert("zam-432", *start != 0);
2024+ assert("zam-558", !reiser4_blocknr_is_fake(start));
2025+
2026+ spin_lock_reiser4_super(sbinfo);
2027+ assert("zam-562", *start < sbinfo->block_count);
2028+ spin_unlock_reiser4_super(sbinfo);
2029+ }
2030+
2031+ if (flags & BA_DEFER) {
2032+ blocknr_set_entry *bsep = NULL;
2033+
2034+ /* storing deleted block numbers in a blocknr set
2035+ datastructure for further actual deletion */
2036+ do {
2037+ atom = get_current_atom_locked();
2038+ assert("zam-430", atom != NULL);
2039+
2040+ ret =
2041+ blocknr_set_add_extent(atom, &atom->delete_set,
2042+ &bsep, start, len);
2043+
2044+ if (ret == -ENOMEM)
2045+ return ret;
2046+
2047+ /* This loop might spin at most two times */
2048+ } while (ret == -E_REPEAT);
2049+
2050+ assert("zam-477", ret == 0);
2051+ assert("zam-433", atom != NULL);
2052+
2053+ spin_unlock_atom(atom);
2054+
2055+ } else {
2056+ assert("zam-425", get_current_super_private() != NULL);
2057+ sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
2058+ *start, *len);
2059+
2060+ if (flags & BA_PERMANENT) {
2061+ /* These blocks were counted as allocated, we have to revert it
2062+ * back if allocation is discarded. */
2063+ txn_atom *atom = get_current_atom_locked();
2064+ atom->nr_blocks_allocated -= *len;
2065+ spin_unlock_atom(atom);
2066+ }
2067+
2068+ switch (target_stage) {
2069+ case BLOCK_NOT_COUNTED:
2070+ assert("vs-960", flags & BA_FORMATTED);
2071+ /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
2072+ used2free(sbinfo, *len);
2073+ break;
2074+
2075+ case BLOCK_GRABBED:
2076+ used2grabbed(ctx, sbinfo, *len);
2077+ break;
2078+
2079+ case BLOCK_UNALLOCATED:
2080+ used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
2081+ break;
2082+
2083+ case BLOCK_FLUSH_RESERVED:{
2084+ txn_atom *atom;
2085+
2086+ atom = get_current_atom_locked();
2087+ used2flush_reserved(sbinfo, atom, *len,
2088+ flags & BA_FORMATTED);
2089+ spin_unlock_atom(atom);
2090+ break;
2091+ }
2092+ default:
2093+ impossible("zam-532", "wrong block stage");
2094+ }
2095+ }
2096+
2097+ return 0;
2098+}
2099+
2100+/* wrappers for block allocator plugin methods */
2101+int reiser4_pre_commit_hook(void)
2102+{
2103+ assert("zam-502", get_current_super_private() != NULL);
2104+ sa_pre_commit_hook();
2105+ return 0;
2106+}
2107+
2108+/* an actor which applies delete set to block allocator data */
2109+static int
2110+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
2111+ const reiser4_block_nr * b, void *data UNUSED_ARG)
2112+{
2113+ reiser4_context *ctx;
2114+ reiser4_super_info_data *sbinfo;
2115+
2116+ __u64 len = 1;
2117+
2118+ ctx = get_current_context();
2119+ sbinfo = get_super_private(ctx->super);
2120+
2121+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
2122+ assert("zam-552", sbinfo != NULL);
2123+
2124+ if (b != NULL)
2125+ len = *b;
2126+
2127+ if (REISER4_DEBUG) {
2128+ spin_lock_reiser4_super(sbinfo);
2129+
2130+ assert("zam-554", *a < reiser4_block_count(ctx->super));
2131+ assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
2132+
2133+ spin_unlock_reiser4_super(sbinfo);
2134+ }
2135+
2136+ sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
2137+ /* adjust sb block counters */
2138+ used2free(sbinfo, len);
2139+ return 0;
2140+}
2141+
2142+void reiser4_post_commit_hook(void)
2143+{
2144+ txn_atom *atom;
2145+
2146+ atom = get_current_atom_locked();
2147+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
2148+ spin_unlock_atom(atom);
2149+
2150+ /* do the block deallocation which was deferred
2151+ until commit is done */
2152+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
2153+
2154+ assert("zam-504", get_current_super_private() != NULL);
2155+ sa_post_commit_hook();
2156+}
2157+
2158+void reiser4_post_write_back_hook(void)
2159+{
2160+ assert("zam-504", get_current_super_private() != NULL);
2161+
2162+ sa_post_commit_hook();
2163+}
2164+
2165+/*
2166+ Local variables:
2167+ c-indentation-style: "K&R"
2168+ mode-name: "LC"
2169+ c-basic-offset: 8
2170+ tab-width: 8
2171+ fill-column: 120
2172+ scroll-step: 1
2173+ End:
2174+*/
2175diff --git a/fs/reiser4/block_alloc.h b/fs/reiser4/block_alloc.h
2176new file mode 100644
2177index 0000000..f4b79f8
2178--- /dev/null
2179+++ b/fs/reiser4/block_alloc.h
2180@@ -0,0 +1,175 @@
2181+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2182+
2183+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
2184+#define __FS_REISER4_BLOCK_ALLOC_H__
2185+
2186+#include "dformat.h"
2187+#include "forward.h"
2188+
2189+#include <linux/types.h> /* for __u?? */
2190+#include <linux/fs.h>
2191+
2192+/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
2193+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
2194+/* Mask which isolates a type of object this fake block number was assigned to */
2195+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
2196+
2197+/* result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
2198+ against these two values to determine whether the object is unallocated or a
2199+ bitmap shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */
2200+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
2201+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
2202+
2203+/* specification how block allocation was counted in sb block counters */
2204+typedef enum {
2205+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
2206+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
2207+ of this block */
2208+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
2209+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
2210+ ( unallocated formatted or unformatted
2211+ node) */
2212+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
2213+ number assigned */
2214+} block_stage_t;
2215+
2216+/* a hint for block allocator */
2217+struct reiser4_blocknr_hint {
2218+ /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
2219+ is to prevent jnode_flush() calls from interleaving allocations on the same
2220+ bitmap, once a hint is established. */
2221+
2222+ /* search start hint */
2223+ reiser4_block_nr blk;
2224+ /* if not zero, it is a region size we search for free blocks in */
2225+ reiser4_block_nr max_dist;
2226+ /* level for allocation; may be useful to have branch-level and higher
2227+ levels write-optimized. */
2228+ tree_level level;
2229+ /* block allocator assumes that blocks, which will be mapped to disk,
2230+ are in this specified block_stage */
2231+ block_stage_t block_stage;
2232+ /* If direction = 1 allocate blocks in backward direction from the end
2233+ * of disk to the beginning of disk. */
2234+ unsigned int backward:1;
2235+
2236+};
2237+
2238+/* These flags control block allocation/deallocation behavior */
2239+enum reiser4_ba_flags {
2240+ /* do allocatations from reserved (5%) area */
2241+ BA_RESERVED = (1 << 0),
2242+
2243+ /* block allocator can do commit trying to recover free space */
2244+ BA_CAN_COMMIT = (1 << 1),
2245+
2246+ /* if operation will be applied to formatted block */
2247+ BA_FORMATTED = (1 << 2),
2248+
2249+ /* defer actual block freeing until transaction commit */
2250+ BA_DEFER = (1 << 3),
2251+
2252+ /* allocate blocks for permanent fs objects (formatted or unformatted), not
2253+ wandered or log blocks */
2254+ BA_PERMANENT = (1 << 4),
2255+
2256+ /* grab space even if it was disabled */
2257+ BA_FORCE = (1 << 5),
2258+
2259+ /* use default start value for free blocks search. */
2260+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
2261+};
2262+
2263+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
2264+
2265+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
2266+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
2267+extern void update_blocknr_hint_default(const struct super_block *,
2268+ const reiser4_block_nr *);
2269+extern void get_blocknr_hint_default(reiser4_block_nr *);
2270+
2271+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
2272+
2273+int assign_fake_blocknr_formatted(reiser4_block_nr *);
2274+reiser4_block_nr fake_blocknr_unformatted(int);
2275+
2276+/* free -> grabbed -> fake_allocated -> used */
2277+
2278+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
2279+void all_grabbed2free(void);
2280+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
2281+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
2282+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
2283+void grabbed2flush_reserved(__u64 count);
2284+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
2285+ reiser4_block_nr * start,
2286+ reiser4_block_nr * len, reiser4_ba_flags_t flags);
2287+int reiser4_dealloc_blocks(const reiser4_block_nr *,
2288+ const reiser4_block_nr *,
2289+ block_stage_t, reiser4_ba_flags_t flags);
2290+
2291+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
2292+ reiser4_block_nr * start,
2293+ reiser4_ba_flags_t flags)
2294+{
2295+ reiser4_block_nr one = 1;
2296+ return reiser4_alloc_blocks(hint, start, &one, flags);
2297+}
2298+
2299+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
2300+ block_stage_t stage,
2301+ reiser4_ba_flags_t flags)
2302+{
2303+ const reiser4_block_nr one = 1;
2304+ return reiser4_dealloc_blocks(block, &one, stage, flags);
2305+}
2306+
2307+#define reiser4_grab_space_force(count, flags) \
2308+ reiser4_grab_space(count, flags | BA_FORCE)
2309+
2310+extern void grabbed2free_mark(__u64 mark);
2311+extern int reiser4_grab_reserved(struct super_block *,
2312+ __u64, reiser4_ba_flags_t);
2313+extern void reiser4_release_reserved(struct super_block *super);
2314+
2315+/* grabbed -> fake_allocated */
2316+
2317+/* fake_allocated -> used */
2318+
2319+/* used -> fake_allocated -> grabbed -> free */
2320+
2321+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2322+
2323+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
2324+
2325+extern void grabbed2cluster_reserved(int count);
2326+extern void cluster_reserved2grabbed(int count);
2327+extern void cluster_reserved2free(int count);
2328+
2329+extern int reiser4_check_block_counters(const struct super_block *);
2330+
2331+#if REISER4_DEBUG
2332+
2333+extern void reiser4_check_block(const reiser4_block_nr *, int);
2334+
2335+#else
2336+
2337+# define reiser4_check_block(beg, val) noop
2338+
2339+#endif
2340+
2341+extern int reiser4_pre_commit_hook(void);
2342+extern void reiser4_post_commit_hook(void);
2343+extern void reiser4_post_write_back_hook(void);
2344+
2345+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
2346+
2347+/* Make Linus happy.
2348+ Local variables:
2349+ c-indentation-style: "K&R"
2350+ mode-name: "LC"
2351+ c-basic-offset: 8
2352+ tab-width: 8
2353+ fill-column: 120
2354+ End:
2355+*/
2356diff --git a/fs/reiser4/blocknrset.c b/fs/reiser4/blocknrset.c
2357new file mode 100644
2358index 0000000..da50a5a
2359--- /dev/null
2360+++ b/fs/reiser4/blocknrset.c
2361@@ -0,0 +1,368 @@
2362+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2363+
2364+/* This file contains code for various block number sets used by the atom to
2365+ track the deleted set and wandered block mappings. */
2366+
2367+#include "debug.h"
2368+#include "dformat.h"
2369+#include "txnmgr.h"
2370+#include "context.h"
2371+
2372+#include <linux/slab.h>
2373+
2374+/* The proposed data structure for storing unordered block number sets is a
2375+ list of elements, each of which contains an array of block number or/and
2376+ array of block number pairs. That element called blocknr_set_entry is used
2377+ to store block numbers from the beginning and for extents from the end of
2378+ the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields
2379+ count numbers of blocks and extents.
2380+
2381+ +------------------- blocknr_set_entry->data ------------------+
2382+ |block1|block2| ... <free space> ... |pair3|pair2|pair1|
2383+ +------------------------------------------------------------+
2384+
2385+ When current blocknr_set_entry is full, allocate a new one. */
2386+
2387+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
2388+ * set (single blocks and block extents), in that case blocknr pair represent an
2389+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
2390+ * there represent a (real block) -> (wandered block) mapping. */
2391+
2392+/* Protection: blocknr sets belong to reiser4 atom, and
2393+ * their modifications are performed with the atom lock held */
2394+
2395+typedef struct blocknr_pair blocknr_pair;
2396+
2397+/* The total size of a blocknr_set_entry. */
2398+#define BLOCKNR_SET_ENTRY_SIZE 128
2399+
2400+/* The number of blocks that can fit the blocknr data area. */
2401+#define BLOCKNR_SET_ENTRIES_NUMBER \
2402+ ((BLOCKNR_SET_ENTRY_SIZE - \
2403+ 2 * sizeof (unsigned) - \
2404+ sizeof(struct list_head)) / \
2405+ sizeof(reiser4_block_nr))
2406+
2407+/* An entry of the blocknr_set */
2408+struct blocknr_set_entry {
2409+ unsigned nr_singles;
2410+ unsigned nr_pairs;
2411+ struct list_head link;
2412+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2413+};
2414+
2415+/* A pair of blocks as recorded in the blocknr_set_entry data. */
2416+struct blocknr_pair {
2417+ reiser4_block_nr a;
2418+ reiser4_block_nr b;
2419+};
2420+
2421+/* Return the number of blocknr slots available in a blocknr_set_entry. */
2422+/* Audited by: green(2002.06.11) */
2423+static unsigned bse_avail(blocknr_set_entry * bse)
2424+{
2425+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
2426+
2427+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
2428+ cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2429+
2430+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
2431+}
2432+
2433+/* Initialize a blocknr_set_entry. */
2434+static void bse_init(blocknr_set_entry *bse)
2435+{
2436+ bse->nr_singles = 0;
2437+ bse->nr_pairs = 0;
2438+ INIT_LIST_HEAD(&bse->link);
2439+}
2440+
2441+/* Allocate and initialize a blocknr_set_entry. */
2442+/* Audited by: green(2002.06.11) */
2443+static blocknr_set_entry *bse_alloc(void)
2444+{
2445+ blocknr_set_entry *e;
2446+
2447+ if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2448+ reiser4_ctx_gfp_mask_get())) == NULL)
2449+ return NULL;
2450+
2451+ bse_init(e);
2452+
2453+ return e;
2454+}
2455+
2456+/* Free a blocknr_set_entry. */
2457+/* Audited by: green(2002.06.11) */
2458+static void bse_free(blocknr_set_entry * bse)
2459+{
2460+ kfree(bse);
2461+}
2462+
2463+/* Add a block number to a blocknr_set_entry */
2464+/* Audited by: green(2002.06.11) */
2465+static void
2466+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2467+{
2468+ assert("jmacd-5099", bse_avail(bse) >= 1);
2469+
2470+ bse->entries[bse->nr_singles++] = *block;
2471+}
2472+
2473+/* Get a pair of block numbers */
2474+/* Audited by: green(2002.06.11) */
2475+static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2476+{
2477+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2478+
2479+ return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2480+ 2 * (pno + 1));
2481+}
2482+
2483+/* Add a pair of block numbers to a blocknr_set_entry */
2484+/* Audited by: green(2002.06.11) */
2485+static void
2486+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2487+ const reiser4_block_nr * b)
2488+{
2489+ blocknr_pair *pair;
2490+
2491+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2492+
2493+ pair = bse_get_pair(bse, bse->nr_pairs++);
2494+
2495+ pair->a = *a;
2496+ pair->b = *b;
2497+}
2498+
2499+/* Add either a block or pair of blocks to the block number set. The first
2500+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2501+ @b is non-NULL a pair is added. The block number set belongs to atom, and
2502+ the call is made with the atom lock held. There may not be enough space in
2503+ the current blocknr_set_entry. If new_bsep points to a non-NULL
2504+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2505+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
2506+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
2507+ returned with the atom unlocked for the operation to be tried again. If
2508+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2509+ used during the call, it will be freed automatically. */
2510+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2511+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2512+ const reiser4_block_nr *b)
2513+{
2514+ blocknr_set_entry *bse;
2515+ unsigned entries_needed;
2516+
2517+ assert("jmacd-5101", a != NULL);
2518+
2519+ entries_needed = (b == NULL) ? 1 : 2;
2520+ if (list_empty(bset) ||
2521+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2522+ /* See if a bse was previously allocated. */
2523+ if (*new_bsep == NULL) {
2524+ spin_unlock_atom(atom);
2525+ *new_bsep = bse_alloc();
2526+ return (*new_bsep != NULL) ? -E_REPEAT :
2527+ RETERR(-ENOMEM);
2528+ }
2529+
2530+ /* Put it on the head of the list. */
2531+ list_add(&((*new_bsep)->link), bset);
2532+
2533+ *new_bsep = NULL;
2534+ }
2535+
2536+ /* Add the single or pair. */
2537+ bse = list_entry(bset->next, blocknr_set_entry, link);
2538+ if (b == NULL) {
2539+ bse_put_single(bse, a);
2540+ } else {
2541+ bse_put_pair(bse, a, b);
2542+ }
2543+
2544+ /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2545+ if (*new_bsep != NULL) {
2546+ bse_free(*new_bsep);
2547+ *new_bsep = NULL;
2548+ }
2549+
2550+ return 0;
2551+}
2552+
2553+/* Add an extent to the block set. If the length is 1, it is treated as a
2554+ single block (e.g., reiser4_set_add_block). */
2555+/* Audited by: green(2002.06.11) */
2556+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2557+ kmalloc might schedule. The only exception is atom spinlock, which is
2558+ properly freed. */
2559+int
2560+blocknr_set_add_extent(txn_atom * atom,
2561+ struct list_head * bset,
2562+ blocknr_set_entry ** new_bsep,
2563+ const reiser4_block_nr * start,
2564+ const reiser4_block_nr * len)
2565+{
2566+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2567+ return blocknr_set_add(atom, bset, new_bsep, start,
2568+ *len == 1 ? NULL : len);
2569+}
2570+
2571+/* Add a block pair to the block set. It adds exactly a pair, which is checked
2572+ * by an assertion that both arguments are not null.*/
2573+/* Audited by: green(2002.06.11) */
2574+/* Auditor note: Entire call chain cannot hold any spinlocks, because
2575+ kmalloc might schedule. The only exception is atom spinlock, which is
2576+ properly freed. */
2577+int
2578+blocknr_set_add_pair(txn_atom * atom,
2579+ struct list_head * bset,
2580+ blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2581+ const reiser4_block_nr * b)
2582+{
2583+ assert("jmacd-5103", a != NULL && b != NULL);
2584+ return blocknr_set_add(atom, bset, new_bsep, a, b);
2585+}
2586+
2587+/* Initialize a blocknr_set. */
2588+void blocknr_set_init(struct list_head *bset)
2589+{
2590+ INIT_LIST_HEAD(bset);
2591+}
2592+
2593+/* Release the entries of a blocknr_set. */
2594+void blocknr_set_destroy(struct list_head *bset)
2595+{
2596+ blocknr_set_entry *bse;
2597+
2598+ while (!list_empty(bset)) {
2599+ bse = list_entry(bset->next, blocknr_set_entry, link);
2600+ list_del_init(&bse->link);
2601+ bse_free(bse);
2602+ }
2603+}
2604+
2605+/* Merge blocknr_set entries out of @from into @into. */
2606+/* Audited by: green(2002.06.11) */
2607+/* Auditor comments: This merge does not know if merged sets contain
2608+ blocks pairs (As for wandered sets) or extents, so it cannot really merge
2609+ overlapping ranges if there is some. So I believe it may lead to
2610+ some blocks being presented several times in one blocknr_set. To help
2611+ debugging such problems it might help to check for duplicate entries on
2612+ actual processing of this set. Testing this kind of stuff right here is
2613+ also complicated by the fact that these sets are not sorted and going
2614+ through whole set on each element addition is going to be CPU-heavy task */
2615+void blocknr_set_merge(struct list_head * from, struct list_head * into)
2616+{
2617+ blocknr_set_entry *bse_into = NULL;
2618+
2619+ /* If @from is empty, no work to perform. */
2620+ if (list_empty(from))
2621+ return;
2622+ /* If @into is not empty, try merging partial-entries. */
2623+ if (!list_empty(into)) {
2624+
2625+ /* Neither set is empty, pop the front to members and try to combine them. */
2626+ blocknr_set_entry *bse_from;
2627+ unsigned into_avail;
2628+
2629+ bse_into = list_entry(into->next, blocknr_set_entry, link);
2630+ list_del_init(&bse_into->link);
2631+ bse_from = list_entry(from->next, blocknr_set_entry, link);
2632+ list_del_init(&bse_from->link);
2633+
2634+ /* Combine singles. */
2635+ for (into_avail = bse_avail(bse_into);
2636+ into_avail != 0 && bse_from->nr_singles != 0;
2637+ into_avail -= 1) {
2638+ bse_put_single(bse_into,
2639+ &bse_from->entries[--bse_from->
2640+ nr_singles]);
2641+ }
2642+
2643+ /* Combine pairs. */
2644+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
2645+ into_avail -= 2) {
2646+ blocknr_pair *pair =
2647+ bse_get_pair(bse_from, --bse_from->nr_pairs);
2648+ bse_put_pair(bse_into, &pair->a, &pair->b);
2649+ }
2650+
2651+ /* If bse_from is empty, delete it now. */
2652+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2653+ bse_free(bse_from);
2654+ } else {
2655+ /* Otherwise, bse_into is full or nearly full (e.g.,
2656+ it could have one slot avail and bse_from has one
2657+ pair left). Push it back onto the list. bse_from
2658+ becomes bse_into, which will be the new partial. */
2659+ list_add(&bse_into->link, into);
2660+ bse_into = bse_from;
2661+ }
2662+ }
2663+
2664+ /* Splice lists together. */
2665+ list_splice_init(from, into->prev);
2666+
2667+ /* Add the partial entry back to the head of the list. */
2668+ if (bse_into != NULL)
2669+ list_add(&bse_into->link, into);
2670+}
2671+
2672+/* Iterate over all blocknr set elements. */
2673+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2674+ blocknr_set_actor_f actor, void *data, int delete)
2675+{
2676+
2677+ blocknr_set_entry *entry;
2678+
2679+ assert("zam-429", atom != NULL);
2680+ assert("zam-430", atom_is_protected(atom));
2681+ assert("zam-431", bset != 0);
2682+ assert("zam-432", actor != NULL);
2683+
2684+ entry = list_entry(bset->next, blocknr_set_entry, link);
2685+ while (bset != &entry->link) {
2686+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2687+ unsigned int i;
2688+ int ret;
2689+
2690+ for (i = 0; i < entry->nr_singles; i++) {
2691+ ret = actor(atom, &entry->entries[i], NULL, data);
2692+
2693+ /* We can't break a loop if delete flag is set. */
2694+ if (ret != 0 && !delete)
2695+ return ret;
2696+ }
2697+
2698+ for (i = 0; i < entry->nr_pairs; i++) {
2699+ struct blocknr_pair *ab;
2700+
2701+ ab = bse_get_pair(entry, i);
2702+
2703+ ret = actor(atom, &ab->a, &ab->b, data);
2704+
2705+ if (ret != 0 && !delete)
2706+ return ret;
2707+ }
2708+
2709+ if (delete) {
2710+ list_del(&entry->link);
2711+ bse_free(entry);
2712+ }
2713+
2714+ entry = tmp;
2715+ }
2716+
2717+ return 0;
2718+}
2719+
2720+/*
2721+ * Local variables:
2722+ * c-indentation-style: "K&R"
2723+ * mode-name: "LC"
2724+ * c-basic-offset: 8
2725+ * tab-width: 8
2726+ * fill-column: 79
2727+ * scroll-step: 1
2728+ * End:
2729+ */
2730diff --git a/fs/reiser4/carry.c b/fs/reiser4/carry.c
2731new file mode 100644
2732index 0000000..c90a0f0
2733--- /dev/null
2734+++ b/fs/reiser4/carry.c
2735@@ -0,0 +1,1391 @@
2736+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2737+/* Functions to "carry" tree modification(s) upward. */
2738+/* Tree is modified one level at a time. As we modify a level we accumulate a
2739+ set of changes that need to be propagated to the next level. We manage
2740+ node locking such that any searches that collide with carrying are
2741+ restarted, from the root if necessary.
2742+
2743+ Insertion of a new item may result in items being moved among nodes and
2744+ this requires the delimiting key to be updated at the least common parent
2745+ of the nodes modified to preserve search tree invariants. Also, insertion
2746+ may require allocation of a new node. A pointer to the new node has to be
2747+ inserted into some node on the parent level, etc.
2748+
2749+ Tree carrying is meant to be analogous to arithmetic carrying.
2750+
2751+ A carry operation is always associated with some node (&carry_node).
2752+
2753+ Carry process starts with some initial set of operations to be performed
2754+ and an initial set of already locked nodes. Operations are performed one
2755+ by one. Performing each single operation has following possible effects:
2756+
2757+ - content of carry node associated with operation is modified
2758+ - new carry nodes are locked and involved into carry process on this level
2759+ - new carry operations are posted to the next level
2760+
2761+ After all carry operations on this level are done, process is repeated for
2762+ the accumulated sequence on carry operations for the next level. This
2763+ starts by trying to lock (in left to right order) all carry nodes
2764+ associated with carry operations on the parent level. After this, we decide
2765+ whether more nodes are required on the left of already locked set. If so,
2766+ all locks taken on the parent level are released, new carry nodes are
2767+ added, and locking process repeats.
2768+
2769+ It may happen that balancing process fails owing to unrecoverable error on
2770+ some of upper levels of a tree (possible causes are io error, failure to
2771+ allocate new node, etc.). In this case we should unmount the filesystem,
2772+ rebooting if it is the root, and possibly advise the use of fsck.
2773+
2774+ USAGE:
2775+
2776+ int some_tree_operation( znode *node, ... )
2777+ {
2778+ // Allocate on a stack pool of carry objects: operations and nodes.
2779+ // Most carry processes will only take objects from here, without
2780+ // dynamic allocation.
2781+
2782+I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2783+
2784+ carry_pool pool;
2785+ carry_level lowest_level;
2786+ carry_op *op;
2787+
2788+ init_carry_pool( &pool );
2789+ init_carry_level( &lowest_level, &pool );
2790+
2791+ // operation may be one of:
2792+ // COP_INSERT --- insert new item into node
2793+ // COP_CUT --- remove part of or whole node
2794+ // COP_PASTE --- increase size of item
2795+ // COP_DELETE --- delete pointer from parent node
2796+ // COP_UPDATE --- update delimiting key in least
2797+ // common ancestor of two
2798+
2799+ op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2800+ if( IS_ERR( op ) || ( op == NULL ) ) {
2801+ handle error
2802+ } else {
2803+ // fill in remaining fields in @op, according to carry.h:carry_op
2804+ result = carry( &lowest_level, NULL );
2805+ }
2806+ done_carry_pool( &pool );
2807+ }
2808+
2809+ When you are implementing node plugin method that participates in carry
2810+ (shifting, insertion, deletion, etc.), do the following:
2811+
2812+ int foo_node_method( znode *node, ..., carry_level *todo )
2813+ {
2814+ carry_op *op;
2815+
2816+ ....
2817+
2818+ // note, that last argument to reiser4_post_carry() is non-null
2819+ // here, because @op is to be applied to the parent of @node, rather
2820+ // than to the @node itself as in the previous case.
2821+
2822+ op = node_post_carry( todo, operation, node, 1 );
2823+ // fill in remaining fields in @op, according to carry.h:carry_op
2824+
2825+ ....
2826+
2827+ }
2828+
2829+ BATCHING:
2830+
2831+ One of the main advantages of level-by-level balancing implemented here is
2832+ ability to batch updates on a parent level and to perform them more
2833+ efficiently as a result.
2834+
2835+ Description To Be Done (TBD).
2836+
2837+ DIFFICULTIES AND SUBTLE POINTS:
2838+
2839+ 1. complex plumbing is required, because:
2840+
2841+ a. effective allocation through pools is needed
2842+
2843+ b. target of operation is not exactly known when operation is
2844+ posted. This is worked around through bitfields in &carry_node and
2845+ logic in lock_carry_node()
2846+
2847+ c. of interaction with locking code: node should be added into sibling
2848+ list when pointer to it is inserted into its parent, which is some time
2849+ after node was created. Between these moments, node is somewhat in
2850+ suspended state and is only registered in the carry lists
2851+
2852+ 2. whole balancing logic is implemented here, in particular, insertion
2853+ logic is coded in make_space().
2854+
2855+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2856+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2857+ (insert_paste()) have to be handled.
2858+
2859+ 4. there is non-trivial interdependency between allocation of new nodes
2860+ and almost everything else. This is mainly due to the (1.c) above. I shall
2861+ write about this later.
2862+
2863+*/
2864+
2865+#include "forward.h"
2866+#include "debug.h"
2867+#include "key.h"
2868+#include "coord.h"
2869+#include "plugin/item/item.h"
2870+#include "plugin/item/extent.h"
2871+#include "plugin/node/node.h"
2872+#include "jnode.h"
2873+#include "znode.h"
2874+#include "tree_mod.h"
2875+#include "tree_walk.h"
2876+#include "block_alloc.h"
2877+#include "pool.h"
2878+#include "tree.h"
2879+#include "carry.h"
2880+#include "carry_ops.h"
2881+#include "super.h"
2882+#include "reiser4.h"
2883+
2884+#include <linux/types.h>
2885+
2886+/* level locking/unlocking */
2887+static int lock_carry_level(carry_level * level);
2888+static void unlock_carry_level(carry_level * level, int failure);
2889+static void done_carry_level(carry_level * level);
2890+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2891+
2892+int lock_carry_node(carry_level * level, carry_node * node);
2893+int lock_carry_node_tail(carry_node * node);
2894+
2895+/* carry processing proper */
2896+static int carry_on_level(carry_level * doing, carry_level * todo);
2897+
2898+static carry_op *add_op(carry_level * level, pool_ordering order,
2899+ carry_op * reference);
2900+
2901+/* handlers for carry operations. */
2902+
2903+static void fatal_carry_error(carry_level * doing, int ecode);
2904+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2905+
2906+static void print_level(const char *prefix, carry_level * level);
2907+
2908+#if REISER4_DEBUG
2909+typedef enum {
2910+ CARRY_TODO,
2911+ CARRY_DOING
2912+} carry_queue_state;
2913+static int carry_level_invariant(carry_level * level, carry_queue_state state);
2914+#endif
2915+
2916+/* main entry point for tree balancing.
2917+
2918+ Tree carry performs operations from @doing and while doing so accumulates
2919+ information about operations to be performed on the next level ("carried"
2920+ to the parent level). Carried operations are performed, causing possibly
2921+ more operations to be carried upward etc. carry() takes care about
2922+ locking and pinning znodes while operating on them.
2923+
2924+ For usage, see comment at the top of fs/reiser4/carry.c
2925+
2926+*/
2927+int reiser4_carry(carry_level * doing /* set of carry operations to be
2928+ * performed */ ,
2929+ carry_level * done /* set of nodes, already performed
2930+ * at the previous level.
2931+ * NULL in most cases */)
2932+{
2933+ int result = 0;
2934+ /* queue of new requests */
2935+ carry_level *todo;
2936+ ON_DEBUG(STORE_COUNTERS);
2937+
2938+ assert("nikita-888", doing != NULL);
2939+ BUG_ON(done != NULL);
2940+
2941+ todo = doing + 1;
2942+ init_carry_level(todo, doing->pool);
2943+
2944+ /* queue of requests performed on the previous level */
2945+ done = todo + 1;
2946+ init_carry_level(done, doing->pool);
2947+
2948+ /* iterate until there is nothing more to do */
2949+ while (result == 0 && doing->ops_num > 0) {
2950+ carry_level *tmp;
2951+
2952+ /* at this point @done is locked. */
2953+ /* repeat lock/do/unlock while
2954+
2955+ (1) lock_carry_level() fails due to deadlock avoidance, or
2956+
2957+ (2) carry_on_level() decides that more nodes have to
2958+ be involved.
2959+
2960+ (3) some unexpected error occurred while balancing on the
2961+ upper levels. In this case all changes are rolled back.
2962+
2963+ */
2964+ while (1) {
2965+ result = lock_carry_level(doing);
2966+ if (result == 0) {
2967+ /* perform operations from @doing and
2968+ accumulate new requests in @todo */
2969+ result = carry_on_level(doing, todo);
2970+ if (result == 0)
2971+ break;
2972+ else if (result != -E_REPEAT ||
2973+ !doing->restartable) {
2974+ warning("nikita-1043",
2975+ "Fatal error during carry: %i",
2976+ result);
2977+ print_level("done", done);
2978+ print_level("doing", doing);
2979+ print_level("todo", todo);
2980+ /* do some rough stuff like aborting
2981+ all pending transcrashes and thus
2982+ pushing tree back to the consistent
2983+ state. Alternatively, just panic.
2984+ */
2985+ fatal_carry_error(doing, result);
2986+ return result;
2987+ }
2988+ } else if (result != -E_REPEAT) {
2989+ fatal_carry_error(doing, result);
2990+ return result;
2991+ }
2992+ unlock_carry_level(doing, 1);
2993+ }
2994+ /* at this point @done can be safely unlocked */
2995+ done_carry_level(done);
2996+
2997+ /* cyclically shift queues */
2998+ tmp = done;
2999+ done = doing;
3000+ doing = todo;
3001+ todo = tmp;
3002+ init_carry_level(todo, doing->pool);
3003+
3004+ /* give other threads chance to run */
3005+ reiser4_preempt_point();
3006+ }
3007+ done_carry_level(done);
3008+
3009+ /* all counters, but x_refs should remain the same. x_refs can change
3010+ owing to transaction manager */
3011+ ON_DEBUG(CHECK_COUNTERS);
3012+ return result;
3013+}
3014+
3015+/* perform carry operations on given level.
3016+
3017+ Optimizations proposed by pooh:
3018+
3019+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
3020+ required;
3021+
3022+ (2) unlock node if there are no more operations to be performed upon it and
3023+ node didn't add any operation to @todo. This can be implemented by
3024+ attaching to each node two counters: counter of operations working on this
3025+ node and counter of operations carried upward from this node.
3026+
3027+*/
3028+static int carry_on_level(carry_level * doing /* queue of carry operations to
3029+ * do on this level */ ,
3030+ carry_level * todo /* queue where new carry
3031+ * operations to be performed on
3032+ * the * parent level are
3033+ * accumulated during @doing
3034+ * processing. */ )
3035+{
3036+ int result;
3037+ int (*f) (carry_op *, carry_level *, carry_level *);
3038+ carry_op *op;
3039+ carry_op *tmp_op;
3040+
3041+ assert("nikita-1034", doing != NULL);
3042+ assert("nikita-1035", todo != NULL);
3043+
3044+ /* @doing->nodes are locked. */
3045+
3046+ /* This function can be split into two phases: analysis and modification.
3047+
3048+ Analysis calculates precisely what items should be moved between
3049+ nodes. This information is gathered in some structures attached to
3050+ each carry_node in a @doing queue. Analysis also determines whether
3051+ new nodes are to be allocated etc.
3052+
3053+ After analysis is completed, actual modification is performed. Here
3054+ we can take advantage of "batch modification": if there are several
3055+ operations acting on the same node, modifications can be performed
3056+ more efficiently when batched together.
3057+
3058+ Above is an optimization left for the future.
3059+ */
3060+ /* Important, but delayed optimization: it's possible to batch
3061+ operations together and perform them more efficiently as a
3062+ result. For example, deletion of several neighboring items from a
3063+ node can be converted to a single ->cut() operation.
3064+
3065+ Before processing queue, it should be scanned and "mergeable"
3066+ operations merged.
3067+ */
3068+ result = 0;
3069+ for_all_ops(doing, op, tmp_op) {
3070+ carry_opcode opcode;
3071+
3072+ assert("nikita-1041", op != NULL);
3073+ opcode = op->op;
3074+ assert("nikita-1042", op->op < COP_LAST_OP);
3075+ f = op_dispatch_table[op->op].handler;
3076+ result = f(op, doing, todo);
3077+ /* locking can fail with -E_REPEAT. Any different error is fatal
3078+ and will be handled by fatal_carry_error() sledgehammer.
3079+ */
3080+ if (result != 0)
3081+ break;
3082+ }
3083+ if (result == 0) {
3084+ carry_plugin_info info;
3085+ carry_node *scan;
3086+ carry_node *tmp_scan;
3087+
3088+ info.doing = doing;
3089+ info.todo = todo;
3090+
3091+ assert("nikita-3002",
3092+ carry_level_invariant(doing, CARRY_DOING));
3093+ for_all_nodes(doing, scan, tmp_scan) {
3094+ znode *node;
3095+
3096+ node = reiser4_carry_real(scan);
3097+ assert("nikita-2547", node != NULL);
3098+ if (node_is_empty(node)) {
3099+ result =
3100+ node_plugin_by_node(node)->
3101+ prepare_removal(node, &info);
3102+ if (result != 0)
3103+ break;
3104+ }
3105+ }
3106+ }
3107+ return result;
3108+}
3109+
3110+/* post carry operation
3111+
3112+ This is main function used by external carry clients: node layout plugins
3113+ and tree operations to create new carry operation to be performed on some
3114+ level.
3115+
3116+ New operation will be included in the @level queue. To actually perform it,
3117+ call carry( level, ... ). This function takes write lock on @node. Carry
3118+ manages all its locks by itself, don't worry about this.
3119+
3120+ This function adds operation and node at the end of the queue. It is up to
3121+ caller to guarantee proper ordering of node queue.
3122+
3123+*/
3124+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
3125+ * is to be posted at */ ,
3126+ carry_opcode op /* opcode of operation */ ,
3127+ znode * node /* node on which this operation
3128+ * will operate */ ,
3129+ int apply_to_parent_p /* whether operation will
3130+ * operate directly on @node
3131+ * or on it parent. */)
3132+{
3133+ carry_op *result;
3134+ carry_node *child;
3135+
3136+ assert("nikita-1046", level != NULL);
3137+ assert("nikita-1788", znode_is_write_locked(node));
3138+
3139+ result = add_op(level, POOLO_LAST, NULL);
3140+ if (IS_ERR(result))
3141+ return result;
3142+ child = reiser4_add_carry(level, POOLO_LAST, NULL);
3143+ if (IS_ERR(child)) {
3144+ reiser4_pool_free(&level->pool->op_pool, &result->header);
3145+ return (carry_op *) child;
3146+ }
3147+ result->node = child;
3148+ result->op = op;
3149+ child->parent = apply_to_parent_p;
3150+ if (ZF_ISSET(node, JNODE_ORPHAN))
3151+ child->left_before = 1;
3152+ child->node = node;
3153+ return result;
3154+}
3155+
3156+/* initialize carry queue */
3157+void init_carry_level(carry_level * level /* level to initialize */ ,
3158+ carry_pool * pool /* pool @level will allocate objects
3159+ * from */ )
3160+{
3161+ assert("nikita-1045", level != NULL);
3162+ assert("nikita-967", pool != NULL);
3163+
3164+ memset(level, 0, sizeof *level);
3165+ level->pool = pool;
3166+
3167+ INIT_LIST_HEAD(&level->nodes);
3168+ INIT_LIST_HEAD(&level->ops);
3169+}
3170+
3171+/* allocate carry pool and initialize pools within queue */
3172+carry_pool *init_carry_pool(int size)
3173+{
3174+ carry_pool *pool;
3175+
3176+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
3177+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
3178+ if (pool == NULL)
3179+ return ERR_PTR(RETERR(-ENOMEM));
3180+
3181+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
3182+ (char *)pool->op);
3183+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
3184+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
3185+ return pool;
3186+}
3187+
3188+/* finish with queue pools */
3189+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
3190+{
3191+ reiser4_done_pool(&pool->op_pool);
3192+ reiser4_done_pool(&pool->node_pool);
3193+ kfree(pool);
3194+}
3195+
3196+/* add new carry node to the @level.
3197+
3198+ Returns pointer to the new carry node allocated from pool. It's up to
3199+ callers to maintain proper order in the @level. Assumption is that if carry
3200+ nodes on one level are already sorted and modifications are performed from
3201+ left to right, carry nodes added on the parent level will be ordered
3202+ automatically. To control ordering use @order and @reference parameters.
3203+
3204+*/
3205+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
3206+ * node to */ ,
3207+ pool_ordering order /* where to insert:
3208+ * at the beginning of
3209+ * @level,
3210+ * before @reference,
3211+ * after @reference,
3212+ * at the end of @level
3213+ */ ,
3214+ carry_node * reference/* reference node for
3215+ * insertion */)
3216+{
3217+ ON_DEBUG(carry_node * orig_ref = reference);
3218+
3219+ if (order == POOLO_BEFORE) {
3220+ reference = find_left_carry(reference, level);
3221+ if (reference == NULL)
3222+ reference = list_entry(level->nodes.next, carry_node,
3223+ header.level_linkage);
3224+ else
3225+ reference = list_entry(reference->header.level_linkage.next,
3226+ carry_node, header.level_linkage);
3227+ } else if (order == POOLO_AFTER) {
3228+ reference = find_right_carry(reference, level);
3229+ if (reference == NULL)
3230+ reference = list_entry(level->nodes.prev, carry_node,
3231+ header.level_linkage);
3232+ else
3233+ reference = list_entry(reference->header.level_linkage.prev,
3234+ carry_node, header.level_linkage);
3235+ }
3236+ assert("nikita-2209",
3237+ ergo(orig_ref != NULL,
3238+ reiser4_carry_real(reference) ==
3239+ reiser4_carry_real(orig_ref)));
3240+ return reiser4_add_carry(level, order, reference);
3241+}
3242+
3243+carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
3244+ * to */ ,
3245+ pool_ordering order /* where to insert: at the
3246+ * beginning of @level, before
3247+ * @reference, after @reference,
3248+ * at the end of @level */ ,
3249+ carry_node * reference /* reference node for
3250+ * insertion */ )
3251+{
3252+ carry_node *result;
3253+
3254+ result =
3255+ (carry_node *) reiser4_add_obj(&level->pool->node_pool,
3256+ &level->nodes,
3257+ order, &reference->header);
3258+ if (!IS_ERR(result) && (result != NULL))
3259+ ++level->nodes_num;
3260+ return result;
3261+}
3262+
3263+/* add new carry operation to the @level.
3264+
3265+ Returns pointer to the new carry operations allocated from pool. It's up to
3266+ callers to maintain proper order in the @level. To control ordering use
3267+ @order and @reference parameters.
3268+
3269+*/
3270+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
3271+ pool_ordering order /* where to insert: at the beginning of
3272+ * @level, before @reference, after
3273+ * @reference, at the end of @level */ ,
3274+ carry_op *
3275+ reference /* reference node for insertion */ )
3276+{
3277+ carry_op *result;
3278+
3279+ result =
3280+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
3281+ order, &reference->header);
3282+ if (!IS_ERR(result) && (result != NULL))
3283+ ++level->ops_num;
3284+ return result;
3285+}
3286+
3287+/* Return node on the right of which @node was created.
3288+
3289+ Each node is created on the right of some existing node (or it is new root,
3290+ which is special case not handled here).
3291+
3292+ @node is new node created on some level, but not yet inserted into its
3293+ parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
3294+
3295+*/
3296+static carry_node *find_begetting_brother(carry_node * node /* node to start search
3297+ * from */ ,
3298+ carry_level * kin UNUSED_ARG /* level to
3299+ * scan */ )
3300+{
3301+ carry_node *scan;
3302+
3303+ assert("nikita-1614", node != NULL);
3304+ assert("nikita-1615", kin != NULL);
3305+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3306+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
3307+ ZF_ISSET(reiser4_carry_real(node),
3308+ JNODE_ORPHAN)));
3309+ for (scan = node;;
3310+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
3311+ header.level_linkage)) {
3312+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
3313+ if ((scan->node != node->node) &&
3314+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3315+ assert("nikita-1618", reiser4_carry_real(scan) != NULL);
3316+ break;
3317+ }
3318+ }
3319+ return scan;
3320+}
3321+
3322+static cmp_t
3323+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3324+{
3325+ assert("nikita-2199", n1 != NULL);
3326+ assert("nikita-2200", n2 != NULL);
3327+
3328+ if (n1 == n2)
3329+ return EQUAL_TO;
3330+ while (1) {
3331+ n1 = carry_node_next(n1);
3332+ if (carry_node_end(level, n1))
3333+ return GREATER_THAN;
3334+ if (n1 == n2)
3335+ return LESS_THAN;
3336+ }
3337+ impossible("nikita-2201", "End of level reached");
3338+}
3339+
3340+carry_node *find_carry_node(carry_level * level, const znode * node)
3341+{
3342+ carry_node *scan;
3343+ carry_node *tmp_scan;
3344+
3345+ assert("nikita-2202", level != NULL);
3346+ assert("nikita-2203", node != NULL);
3347+
3348+ for_all_nodes(level, scan, tmp_scan) {
3349+ if (reiser4_carry_real(scan) == node)
3350+ return scan;
3351+ }
3352+ return NULL;
3353+}
3354+
3355+znode *reiser4_carry_real(const carry_node * node)
3356+{
3357+ assert("nikita-3061", node != NULL);
3358+
3359+ return node->lock_handle.node;
3360+}
3361+
3362+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
3363+ const znode * node)
3364+{
3365+ carry_node *base;
3366+ carry_node *scan;
3367+ carry_node *tmp_scan;
3368+ carry_node *proj;
3369+
3370+ base = find_carry_node(doing, node);
3371+ assert("nikita-2204", base != NULL);
3372+
3373+ for_all_nodes(todo, scan, tmp_scan) {
3374+ proj = find_carry_node(doing, scan->node);
3375+ assert("nikita-2205", proj != NULL);
3376+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3377+ break;
3378+ }
3379+ return scan;
3380+}
3381+
3382+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
3383+ znode * node)
3384+{
3385+ carry_node *reference;
3386+
3387+ assert("nikita-2994", doing != NULL);
3388+ assert("nikita-2995", todo != NULL);
3389+ assert("nikita-2996", node != NULL);
3390+
3391+ reference = insert_carry_node(doing, todo, node);
3392+ assert("nikita-2997", reference != NULL);
3393+
3394+ return reiser4_add_carry(todo, POOLO_BEFORE, reference);
3395+}
3396+
3397+/* like reiser4_post_carry(), but designed to be called from node plugin methods.
3398+ This function is different from reiser4_post_carry() in that it finds proper
3399+ place to insert node in the queue. */
3400+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
3401+ * passed down to node
3402+ * plugin */ ,
3403+ carry_opcode op /* opcode of operation */ ,
3404+ znode * node /* node on which this
3405+ * operation will operate */ ,
3406+ int apply_to_parent_p /* whether operation will
3407+ * operate directly on @node
3408+ * or on it parent. */ )
3409+{
3410+ carry_op *result;
3411+ carry_node *child;
3412+
3413+ assert("nikita-2207", info != NULL);
3414+ assert("nikita-2208", info->todo != NULL);
3415+
3416+ if (info->doing == NULL)
3417+ return reiser4_post_carry(info->todo, op, node,
3418+ apply_to_parent_p);
3419+
3420+ result = add_op(info->todo, POOLO_LAST, NULL);
3421+ if (IS_ERR(result))
3422+ return result;
3423+ child = add_carry_atplace(info->doing, info->todo, node);
3424+ if (IS_ERR(child)) {
3425+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3426+ return (carry_op *) child;
3427+ }
3428+ result->node = child;
3429+ result->op = op;
3430+ child->parent = apply_to_parent_p;
3431+ if (ZF_ISSET(node, JNODE_ORPHAN))
3432+ child->left_before = 1;
3433+ child->node = node;
3434+ return result;
3435+}
3436+
3437+/* lock all carry nodes in @level */
3438+static int lock_carry_level(carry_level * level /* level to lock */ )
3439+{
3440+ int result;
3441+ carry_node *node;
3442+ carry_node *tmp_node;
3443+
3444+ assert("nikita-881", level != NULL);
3445+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3446+
3447+ /* lock nodes from left to right */
3448+ result = 0;
3449+ for_all_nodes(level, node, tmp_node) {
3450+ result = lock_carry_node(level, node);
3451+ if (result != 0)
3452+ break;
3453+ }
3454+ return result;
3455+}
3456+
3457+/* Synchronize delimiting keys between @node and its left neighbor.
3458+
3459+ To reduce contention on dk key and simplify carry code, we synchronize
3460+ delimiting keys only when carry ultimately leaves tree level (carrying
3461+ changes upward) and unlocks nodes at this level.
3462+
3463+ This function first finds left neighbor of @node and then updates left
3464+ neighbor's right delimiting key to coincide with least key in @node.
3465+
3466+*/
3467+
3468+ON_DEBUG(extern atomic_t delim_key_version;
3469+ )
3470+
3471+static void sync_dkeys(znode * spot /* node to update */ )
3472+{
3473+ reiser4_key pivot;
3474+ reiser4_tree *tree;
3475+
3476+ assert("nikita-1610", spot != NULL);
3477+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3478+
3479+ tree = znode_get_tree(spot);
3480+ read_lock_tree(tree);
3481+ write_lock_dk(tree);
3482+
3483+ assert("nikita-2192", znode_is_loaded(spot));
3484+
3485+ /* sync left delimiting key of @spot with key in its leftmost item */
3486+ if (node_is_empty(spot))
3487+ pivot = *znode_get_rd_key(spot);
3488+ else
3489+ leftmost_key_in_node(spot, &pivot);
3490+
3491+ znode_set_ld_key(spot, &pivot);
3492+
3493+ /* there can be sequence of empty nodes pending removal on the left of
3494+ @spot. Scan them and update their left and right delimiting keys to
3495+ match left delimiting key of @spot. Also, update right delimiting
3496+ key of first non-empty left neighbor.
3497+ */
3498+ while (1) {
3499+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3500+ break;
3501+
3502+ spot = spot->left;
3503+ if (spot == NULL)
3504+ break;
3505+
3506+ znode_set_rd_key(spot, &pivot);
3507+ /* don't sink into the domain of another balancing */
3508+ if (!znode_is_write_locked(spot))
3509+ break;
3510+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3511+ znode_set_ld_key(spot, &pivot);
3512+ else
3513+ break;
3514+ }
3515+
3516+ write_unlock_dk(tree);
3517+ read_unlock_tree(tree);
3518+}
3519+
3520+/* unlock all carry nodes in @level */
3521+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3522+ int failure /* true if unlocking owing to
3523+ * failure */ )
3524+{
3525+ carry_node *node;
3526+ carry_node *tmp_node;
3527+
3528+ assert("nikita-889", level != NULL);
3529+
3530+ if (!failure) {
3531+ znode *spot;
3532+
3533+ spot = NULL;
3534+ /* update delimiting keys */
3535+ for_all_nodes(level, node, tmp_node) {
3536+ if (reiser4_carry_real(node) != spot) {
3537+ spot = reiser4_carry_real(node);
3538+ sync_dkeys(spot);
3539+ }
3540+ }
3541+ }
3542+
3543+ /* nodes can be unlocked in arbitrary order. In preemptible
3544+ environment it's better to unlock in reverse order of locking,
3545+ though.
3546+ */
3547+ for_all_nodes_back(level, node, tmp_node) {
3548+ /* all allocated nodes should be already linked to their
3549+ parents at this moment. */
3550+ assert("nikita-1631",
3551+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3552+ JNODE_ORPHAN)));
3553+ ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3554+ unlock_carry_node(level, node, failure);
3555+ }
3556+ level->new_root = NULL;
3557+}
3558+
3559+/* finish with @level
3560+
3561+ Unlock nodes and release all allocated resources */
3562+static void done_carry_level(carry_level * level /* level to finish */ )
3563+{
3564+ carry_node *node;
3565+ carry_node *tmp_node;
3566+ carry_op *op;
3567+ carry_op *tmp_op;
3568+
3569+ assert("nikita-1076", level != NULL);
3570+
3571+ unlock_carry_level(level, 0);
3572+ for_all_nodes(level, node, tmp_node) {
3573+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3574+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3575+ reiser4_pool_free(&level->pool->node_pool, &node->header);
3576+ }
3577+ for_all_ops(level, op, tmp_op)
3578+ reiser4_pool_free(&level->pool->op_pool, &op->header);
3579+}
3580+
3581+/* helper function to complete locking of carry node
3582+
3583+ Finish locking of carry node. There are several ways in which new carry
3584+ node can be added into carry level and locked. Normal is through
3585+ lock_carry_node(), but also from find_{left|right}_neighbor(). This
3586+ function factors out common final part of all locking scenarios. It
3587+ supposes that @node -> lock_handle is lock handle for lock just taken and
3588+ fills ->real_node from this lock handle.
3589+
3590+*/
3591+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3592+{
3593+ assert("nikita-1052", node != NULL);
3594+ assert("nikita-1187", reiser4_carry_real(node) != NULL);
3595+ assert("nikita-1188", !node->unlock);
3596+
3597+ node->unlock = 1;
3598+ /* Load node content into memory and install node plugin by
3599+ looking at the node header.
3600+
3601+ Most of the time this call is cheap because the node is
3602+ already in memory.
3603+
3604+ Corresponding zrelse() is in unlock_carry_node()
3605+ */
3606+ return zload(reiser4_carry_real(node));
3607+}
3608+
3609+/* lock carry node
3610+
3611+ "Resolve" node to real znode, lock it and mark as locked.
3612+ This requires recursive locking of znodes.
3613+
3614+ When operation is posted to the parent level, node it will be applied to is
3615+ not yet known. For example, when shifting data between two nodes,
3616+ delimiting keys have to be updated in parent or parents of nodes involved. But
3617+ their parents are not yet locked and, moreover, said nodes can be reparented
3618+ by concurrent balancing.
3619+
3620+ To work around this, carry operation is applied to special "carry node"
3621+ rather than to the znode itself. Carry node consists of some "base" or
3622+ "reference" znode and flags indicating how to get to the target of carry
3623+ operation (->real_node field of carry_node) from base.
3624+
3625+*/
3626+int lock_carry_node(carry_level * level /* level @node is in */ ,
3627+ carry_node * node /* node to lock */ )
3628+{
3629+ int result;
3630+ znode *reference_point;
3631+ lock_handle lh;
3632+ lock_handle tmp_lh;
3633+ reiser4_tree *tree;
3634+
3635+ assert("nikita-887", level != NULL);
3636+ assert("nikita-882", node != NULL);
3637+
3638+ result = 0;
3639+ reference_point = node->node;
3640+ init_lh(&lh);
3641+ init_lh(&tmp_lh);
3642+ if (node->left_before) {
3643+ /* handling of new nodes, allocated on the previous level:
3644+
3645+ some carry ops were probably posted from the new node, but
3646+ this node neither has parent pointer set, nor is
3647+ connected. This will be done in ->create_hook() for
3648+ internal item.
3649+
3650+ Nonetheless, parent of new node has to be locked. To do
3651+ this, first go to the "left" in the carry order. This
3652+ depends on the decision to always allocate new node on the
3653+ right of existing one.
3654+
3655+ Loop handles case when multiple nodes, all orphans, were
3656+ inserted.
3657+
3658+ Strictly speaking, taking tree lock is not necessary here,
3659+ because all nodes scanned by loop in
3660+ find_begetting_brother() are write-locked by this thread,
3661+ and thus, their sibling linkage cannot change.
3662+
3663+ */
3664+ tree = znode_get_tree(reference_point);
3665+ read_lock_tree(tree);
3666+ reference_point = find_begetting_brother(node, level)->node;
3667+ read_unlock_tree(tree);
3668+ assert("nikita-1186", reference_point != NULL);
3669+ }
3670+ if (node->parent && (result == 0)) {
3671+ result =
3672+ reiser4_get_parent(&tmp_lh, reference_point,
3673+ ZNODE_WRITE_LOCK);
3674+ if (result != 0) {
3675+ ; /* nothing */
3676+ } else if (znode_get_level(tmp_lh.node) == 0) {
3677+ assert("nikita-1347", znode_above_root(tmp_lh.node));
3678+ result = add_new_root(level, node, tmp_lh.node);
3679+ if (result == 0) {
3680+ reference_point = level->new_root;
3681+ move_lh(&lh, &node->lock_handle);
3682+ }
3683+ } else if ((level->new_root != NULL)
3684+ && (level->new_root !=
3685+ znode_parent_nolock(reference_point))) {
3686+ /* parent of node exists, but this level already
3687+ created different new root, so */
3688+ warning("nikita-1109",
3689+ /* it should be "radicis", but tradition is
3690+ tradition. do banshees read latin? */
3691+ "hodie natus est radici frater");
3692+ result = -EIO;
3693+ } else {
3694+ move_lh(&lh, &tmp_lh);
3695+ reference_point = lh.node;
3696+ }
3697+ }
3698+ if (node->left && (result == 0)) {
3699+ assert("nikita-1183", node->parent);
3700+ assert("nikita-883", reference_point != NULL);
3701+ result =
3702+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
3703+ ZNODE_WRITE_LOCK,
3704+ GN_CAN_USE_UPPER_LEVELS);
3705+ if (result == 0) {
3706+ done_lh(&lh);
3707+ move_lh(&lh, &tmp_lh);
3708+ reference_point = lh.node;
3709+ }
3710+ }
3711+ if (!node->parent && !node->left && !node->left_before) {
3712+ result =
3713+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3714+ ZNODE_LOCK_HIPRI);
3715+ }
3716+ if (result == 0) {
3717+ move_lh(&node->lock_handle, &lh);
3718+ result = lock_carry_node_tail(node);
3719+ }
3720+ done_lh(&tmp_lh);
3721+ done_lh(&lh);
3722+ return result;
3723+}
3724+
/* release a lock on &carry_node.

   Release, if necessary, the lock on @node. This operation is the pair of
   lock_carry_node() and is idempotent: you can call it more than once on the
   same node.

   On the failure path it additionally undoes carry's bookkeeping: marks a
   carry-allocated node for removal and returns a carry-allocated carry_node
   to its pool.
*/
static void
unlock_carry_node(carry_level * level /* level @node belongs to (supplies
				       * the pool @node may be returned to) */ ,
		  carry_node * node /* node to be released */ ,
		  int failure /* non-0 when the level is being unwound
			       * due to some error */ )
{
	znode *real_node;

	assert("nikita-884", node != NULL);

	/* resolve carry node to its underlying znode; may be NULL if the
	   node was never resolved/locked */
	real_node = reiser4_carry_real(node);
	/* pair to zload() in lock_carry_node_tail() (zrelse() is presumably
	   NULL-safe — confirm) */
	zrelse(real_node);
	if (node->unlock && (real_node != NULL)) {
		assert("nikita-899", real_node == node->lock_handle.node);
		longterm_unlock_znode(&node->lock_handle);
	}
	if (failure) {
		if (node->deallocate && (real_node != NULL)) {
			/* free node in bitmap

			   Prepare node for removal. Last zput() will finish
			   with it.
			 */
			ZF_SET(real_node, JNODE_HEARD_BANSHEE);
		}
		if (node->free) {
			/* carry_node was pool-allocated by carry itself;
			   it must no longer be on any lock lists */
			assert("nikita-2177",
			       list_empty_careful(&node->lock_handle.locks_link));
			assert("nikita-2112",
			       list_empty_careful(&node->lock_handle.owners_link));
			reiser4_pool_free(&level->pool->node_pool,
					  &node->header);
		}
	}
}
3768+
3769+/* fatal_carry_error() - all-catching error handling function
3770+
 It is possible that carry faces an unrecoverable error, like the inability to
 insert a pointer at the internal level. Our simple solution is just to panic
 in this situation. More sophisticated things, like an attempt to remount the
 file-system as read-only, can be implemented without much difficulty.

 It is believed that:

 1. instead of panicking, all current transactions can be aborted, rolling the
 system back to a consistent state.
3780+
3781+Umm, if you simply panic without doing anything more at all, then all current
3782+transactions are aborted and the system is rolled back to a consistent state,
3783+by virtue of the design of the transactional mechanism. Well, wait, let's be
3784+precise. If an internal node is corrupted on disk due to hardware failure,
3785+then there may be no consistent state that can be rolled back to, so instead
3786+we should say that it will rollback the transactions, which barring other
3787+factors means rolling back to a consistent state.
3788+
3789+# Nikita: there is a subtle difference between panic and aborting
3790+# transactions: machine doesn't reboot. Processes aren't killed. Processes
3791+# don't using reiser4 (not that we care about such processes), or using other
3792+# reiser4 mounts (about them we do care) will simply continue to run. With
3793+# some luck, even application using aborted file system can survive: it will
3794+# get some error, like EBADF, from each file descriptor on failed file system,
3795+# but applications that do care about tolerance will cope with this (squid
3796+# will).
3797+
3798+It would be a nice feature though to support rollback without rebooting
3799+followed by remount, but this can wait for later versions.
3800+
3801+ 2. once isolated transactions will be implemented it will be possible to
3802+ roll back offending transaction.
3803+
3804+2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
3805+it more before deciding if it should be done. -Hans
3806+
3807+*/
3808+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3809+ * where
3810+ * unrecoverable
3811+ * error
3812+ * occurred */ ,
3813+ int ecode /* error code */ )
3814+{
3815+ assert("nikita-1230", doing != NULL);
3816+ assert("nikita-1231", ecode < 0);
3817+
3818+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3819+}
3820+
/* add new root to the tree

   This function itself only manages changes in carry structures and delegates
   all hard work (allocation of znode for new root, changes of parent and
   sibling pointers) to reiser4_add_tree_root().

   Locking: old tree root is locked by carry at this point. Fake znode is also
   locked.

   Returns 0 on success, negative error code otherwise.
*/
static int add_new_root(carry_level * level /* carry level in context of which
					     * operation is performed */ ,
			carry_node * node /* carry node for existing root */ ,
			znode * fake /* "fake" znode already locked by
				      * us */ )
{
	int result;

	assert("nikita-1104", level != NULL);
	assert("nikita-1105", node != NULL);

	assert("nikita-1403", znode_is_write_locked(node->node));
	assert("nikita-1404", znode_is_write_locked(fake));

	/* trying to create new root. */
	/* @node is root and it's already locked by us. This
	   means that nobody else can be trying to add/remove
	   tree root right now.
	 */
	if (level->new_root == NULL)
		level->new_root = reiser4_add_tree_root(node->node, fake);
	if (!IS_ERR(level->new_root)) {
		assert("nikita-1210", znode_is_root(level->new_root));
		/* if carry later fails, unlock_carry_node() will use this
		   flag to mark the resolved node with JNODE_HEARD_BANSHEE */
		node->deallocate = 1;
		result =
		    longterm_lock_znode(&node->lock_handle, level->new_root,
					ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
		if (result == 0)
			/* drop the extra reference (presumably acquired by
			   reiser4_add_tree_root() — confirm) now that we hold
			   a long term lock on the new root */
			zput(level->new_root);
	} else {
		/* reiser4_add_tree_root() failed: propagate the error and
		   clear ->new_root so nobody dereferences an ERR_PTR */
		result = PTR_ERR(level->new_root);
		level->new_root = NULL;
	}
	return result;
}
3866+
/* allocate new znode and add the operation that inserts the
   pointer to it into the parent node into the todo level

   Allocate new znode, add it into carry queue and post into @todo queue
   request to add pointer to new node into its parent.

   This is carry related routine that calls reiser4_new_node() to allocate new
   node.

   Returns the new carry node on success, an ERR_PTR() otherwise.
*/
carry_node *add_new_znode(znode * brother /* existing left neighbor of new
					   * node */ ,
			  carry_node * ref /* carry node after which new
					    * carry node is to be inserted
					    * into queue. This affects
					    * locking. */ ,
			  carry_level * doing /* carry queue where new node is
					       * to be added */ ,
			  carry_level * todo /* carry queue where COP_INSERT
					      * operation to add pointer to
					      * new node will be added */ )
{
	carry_node *fresh;
	znode *new_znode;
	carry_op *add_pointer;
	carry_plugin_info info;

	assert("nikita-1048", brother != NULL);
	assert("nikita-1049", todo != NULL);

	/* There is a lot of possible variations here: to what parent
	   new node will be attached and where. For simplicity, always
	   do the following:

	   (1) new node and @brother will have the same parent.

	   (2) new node is added on the right of @brother

	 */

	fresh = reiser4_add_carry_skip(doing,
				       ref ? POOLO_AFTER : POOLO_LAST, ref);
	if (IS_ERR(fresh))
		return fresh;

	/* both the disk block and the carry node itself belong to carry and
	   must be reclaimed if the level is unwound on error */
	fresh->deallocate = 1;
	fresh->free = 1;

	new_znode = reiser4_new_node(brother, znode_get_level(brother));
	if (IS_ERR(new_znode))
		/* @fresh will be deallocated automatically by error
		   handling code in the caller. */
		return (carry_node *) new_znode;

	/* new_znode returned znode with x_count 1. Caller has to decrease
	   it. make_space() does. */

	ZF_SET(new_znode, JNODE_ORPHAN);
	fresh->node = new_znode;

	/* walk left past freshly allocated (orphan) nodes: they have no
	   parent pointer yet, so COP_INSERT must be posted against the first
	   non-orphan node */
	while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
		ref = carry_node_prev(ref);
		assert("nikita-1606", !carry_node_end(doing, ref));
	}

	info.todo = todo;
	info.doing = doing;
	add_pointer = node_post_carry(&info, COP_INSERT,
				      reiser4_carry_real(ref), 1);
	if (IS_ERR(add_pointer)) {
		/* no need to deallocate @new_znode here: it will be
		   deallocated during carry error handling. */
		return (carry_node *) add_pointer;
	}

	add_pointer->u.insert.type = COPT_CHILD;
	add_pointer->u.insert.child = fresh;
	add_pointer->u.insert.brother = brother;
	/* initially new node spawns empty key range */
	write_lock_dk(znode_get_tree(brother));
	znode_set_ld_key(new_znode,
			 znode_set_rd_key(new_znode,
					  znode_get_rd_key(brother)));
	write_unlock_dk(znode_get_tree(brother));
	return fresh;
}
3952+
/* DEBUGGING FUNCTIONS.

   Probably we also should leave them on even when
   debugging is turned off to print dumps at errors.
*/
#if REISER4_DEBUG
/* consistency check for a carry level: the track type must be valid and
   carry nodes must be ordered by non-decreasing leftmost key.
   Returns 1 when the invariant holds, 0 otherwise. */
static int carry_level_invariant(carry_level * level, carry_queue_state state)
{
	carry_node *node;
	carry_node *tmp_node;

	if (level == NULL)
		return 0;

	if (level->track_type != 0 &&
	    level->track_type != CARRY_TRACK_NODE &&
	    level->track_type != CARRY_TRACK_CHANGE)
		return 0;

	/* check that nodes are in ascending order */
	for_all_nodes(level, node, tmp_node) {
		znode *left;
		znode *right;

		reiser4_key lkey;
		reiser4_key rkey;

		if (node != carry_node_front(level)) {
			if (state == CARRY_TODO) {
				/* TODO queue: compare base znodes directly */
				right = node->node;
				left = carry_node_prev(node)->node;
			} else {
				/* otherwise compare resolved znodes */
				right = reiser4_carry_real(node);
				left = reiser4_carry_real(carry_node_prev(node));
			}
			/* skip pairs that cannot be meaningfully compared */
			if (right == NULL || left == NULL)
				continue;
			if (node_is_empty(right) || node_is_empty(left))
				continue;
			if (!keyle(leftmost_key_in_node(left, &lkey),
				   leftmost_key_in_node(right, &rkey))) {
				warning("", "wrong key order");
				return 0;
			}
		}
	}
	return 1;
}
#endif
4002+
/* map a truth value to its one-letter symbolic name: "t" or "f" */
static const char *tf(int boolean /* truth value */ )
{
	if (boolean)
		return "t";
	return "f";
}
4008+
4009+/* symbolic name for carry operation */
4010+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
4011+{
4012+ switch (op) {
4013+ case COP_INSERT:
4014+ return "COP_INSERT";
4015+ case COP_DELETE:
4016+ return "COP_DELETE";
4017+ case COP_CUT:
4018+ return "COP_CUT";
4019+ case COP_PASTE:
4020+ return "COP_PASTE";
4021+ case COP_UPDATE:
4022+ return "COP_UPDATE";
4023+ case COP_EXTENT:
4024+ return "COP_EXTENT";
4025+ case COP_INSERT_FLOW:
4026+ return "COP_INSERT_FLOW";
4027+ default:{
4028+ /* not mt safe, but who cares? */
4029+ static char buf[20];
4030+
4031+ sprintf(buf, "unknown op: %x", op);
4032+ return buf;
4033+ }
4034+ }
4035+}
4036+
4037+/* dump information about carry node */
4038+static void print_carry(const char *prefix /* prefix to print */ ,
4039+ carry_node * node /* node to print */ )
4040+{
4041+ if (node == NULL) {
4042+ printk("%s: null\n", prefix);
4043+ return;
4044+ }
4045+ printk
4046+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
4047+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
4048+ tf(node->free), tf(node->deallocate));
4049+}
4050+
4051+/* dump information about carry operation */
4052+static void print_op(const char *prefix /* prefix to print */ ,
4053+ carry_op * op /* operation to print */ )
4054+{
4055+ if (op == NULL) {
4056+ printk("%s: null\n", prefix);
4057+ return;
4058+ }
4059+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
4060+ print_carry("\tnode", op->node);
4061+ switch (op->op) {
4062+ case COP_INSERT:
4063+ case COP_PASTE:
4064+ print_coord("\tcoord",
4065+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
4066+ reiser4_print_key("\tkey",
4067+ op->u.insert.d ? op->u.insert.d->key : NULL);
4068+ print_carry("\tchild", op->u.insert.child);
4069+ break;
4070+ case COP_DELETE:
4071+ print_carry("\tchild", op->u.delete.child);
4072+ break;
4073+ case COP_CUT:
4074+ if (op->u.cut_or_kill.is_cut) {
4075+ print_coord("\tfrom",
4076+ op->u.cut_or_kill.u.kill->params.from, 0);
4077+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
4078+ 0);
4079+ } else {
4080+ print_coord("\tfrom",
4081+ op->u.cut_or_kill.u.cut->params.from, 0);
4082+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
4083+ 0);
4084+ }
4085+ break;
4086+ case COP_UPDATE:
4087+ print_carry("\tleft", op->u.update.left);
4088+ break;
4089+ default:
4090+ /* do nothing */
4091+ break;
4092+ }
4093+}
4094+
4095+/* dump information about all nodes and operations in a @level */
4096+static void print_level(const char *prefix /* prefix to print */ ,
4097+ carry_level * level /* level to print */ )
4098+{
4099+ carry_node *node;
4100+ carry_node *tmp_node;
4101+ carry_op *op;
4102+ carry_op *tmp_op;
4103+
4104+ if (level == NULL) {
4105+ printk("%s: null\n", prefix);
4106+ return;
4107+ }
4108+ printk("%s: %p, restartable: %s\n",
4109+ prefix, level, tf(level->restartable));
4110+
4111+ for_all_nodes(level, node, tmp_node)
4112+ print_carry("\tcarry node", node);
4113+ for_all_ops(level, op, tmp_op)
4114+ print_op("\tcarry op", op);
4115+}
4116+
4117+/* Make Linus happy.
4118+ Local variables:
4119+ c-indentation-style: "K&R"
4120+ mode-name: "LC"
4121+ c-basic-offset: 8
4122+ tab-width: 8
4123+ fill-column: 120
4124+ scroll-step: 1
4125+ End:
4126+*/
4127diff --git a/fs/reiser4/carry.h b/fs/reiser4/carry.h
4128new file mode 100644
4129index 0000000..6341d73
4130--- /dev/null
4131+++ b/fs/reiser4/carry.h
4132@@ -0,0 +1,442 @@
4133+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4134+
4135+/* Functions and data types to "carry" tree modification(s) upward.
4136+ See fs/reiser4/carry.c for details. */
4137+
4138+#if !defined( __FS_REISER4_CARRY_H__ )
4139+#define __FS_REISER4_CARRY_H__
4140+
4141+#include "forward.h"
4142+#include "debug.h"
4143+#include "pool.h"
4144+#include "znode.h"
4145+
4146+#include <linux/types.h>
4147+
/* &carry_node - "location" of carry node.

   "location" of node that is involved or going to be involved into
   carry process. Node where operation will be carried to on the
   parent level cannot be recorded explicitly. Operation will be carried
   usually to the parent of some node (where changes are performed at
   the current level) or, to the left neighbor of its parent. But while
   modifications are performed at the current level, parent may
   change. So, we have to allow some indirection (or, positively,
   flexibility) in locating carry nodes.

*/
typedef struct carry_node {
	/* pool linkage */
	reiser4_pool_header header;

	/* base node from which real_node is calculated. See
	   fs/reiser4/carry.c:lock_carry_node(). */
	znode *node;

	/* how to get ->real_node */
	/* to get ->real_node obtain parent of ->node */
	__u32 parent:1;
	/* to get ->real_node obtain left neighbor of parent of
	   ->node */
	__u32 left:1;
	/* to get ->real_node walk left in carry order; used for new
	   nodes allocated on the previous level, which are not yet
	   connected (see lock_carry_node() in carry.c) */
	__u32 left_before:1;

	/* locking */

	/* this node was locked by carry process and should be
	   unlocked when carry leaves a level */
	__u32 unlock:1;

	/* disk block for this node was allocated by carry process and
	   should be deallocated when carry leaves a level */
	__u32 deallocate:1;
	/* this carry node was allocated by carry process and should be
	   freed when carry leaves a level */
	__u32 free:1;

	/* long term lock carry holds on the node (see unlock/free flags
	   above) */
	lock_handle lock_handle;
} carry_node;
4192+
/* &carry_opcode - elementary operations that can be carried upward

   Operations that carry() can handle. This list is supposed to be
   expanded.

   Each carry operation (cop) is handled by appropriate function defined
   in fs/reiser4/carry.c. For example COP_INSERT is handled by
   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
   call plugins of nodes affected by operation to modify nodes' content
   and to gather operations to be performed on the next level.

*/
typedef enum {
	/* insert new item into node. */
	COP_INSERT,
	/* delete pointer from parent node */
	COP_DELETE,
	/* remove part of or whole node. */
	COP_CUT,
	/* increase size of item. */
	COP_PASTE,
	/* insert extent (that is sequence of unformatted nodes). */
	COP_EXTENT,
	/* update delimiting key in least common ancestor of two
	   nodes. This is performed when items are moved between two
	   nodes.
	 */
	COP_UPDATE,
	/* insert flow */
	COP_INSERT_FLOW,
	/* sentinel: number of opcodes, not a real operation */
	COP_LAST_OP,
} carry_opcode;

/* upper bound on the number of new nodes one flow insertion
   (COP_INSERT_FLOW) may add; see carry_op.u.insert_flow.new_nodes */
#define CARRY_FLOW_NEW_NODES_LIMIT 20

/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
   item is determined. */
typedef enum {
	/* target item is one containing pointer to the ->child node */
	COPT_CHILD,
	/* target item is given explicitly by @coord */
	COPT_ITEM_DATA,
	/* target item is given by key */
	COPT_KEY,
	/* see insert_paste_common() for more comments on this. */
	COPT_PASTE_RESTARTED,
} cop_insert_pos_type;

/* flags to cut and delete */
typedef enum {
	/* don't kill node even if it became completely empty as results of
	 * cut. This is needed for eottl handling. See carry_extent() for
	 * details. */
	DELETE_RETAIN_EMPTY = (1 << 0)
} cop_delete_flag;
4248+
/*
 * carry() implements "lock handle tracking" feature.
 *
 * Callers supply carry with node where to perform initial operation and lock
 * handle on this node. Trying to optimize node utilization carry may actually
 * move insertion point to different node. Callers expect that lock handle
 * will be transferred to the new node also.
 *
 */
typedef enum {
	/* transfer lock handle along with insertion point */
	CARRY_TRACK_CHANGE = 1,
	/* acquire new lock handle to the node where insertion point is. This
	 * is used when carry() client doesn't initially possess lock handle
	 * on the insertion point node, for example, by extent insertion
	 * code. See carry_extent(). */
	CARRY_TRACK_NODE = 2
} carry_track_type;

/* data supplied to COP_{INSERT|PASTE} by callers */
typedef struct carry_insert_data {
	/* position where new item is to be inserted */
	coord_t *coord;
	/* new item description */
	reiser4_item_data *data;
	/* key of new item */
	const reiser4_key *key;
} carry_insert_data;

/* cut and kill are similar, so carry_cut_data and carry_kill_data share
   the below structure of parameters (it is their common initial member) */
struct cut_kill_params {
	/* coord where cut starts (inclusive) */
	coord_t *from;
	/* coord where cut stops (inclusive, this item/unit will also be
	 * cut) */
	coord_t *to;
	/* starting key. This is necessary when item and unit pos don't
	 * uniquely identify what portion or tree to remove. For example, this
	 * indicates what portion of extent unit will be affected. */
	const reiser4_key *from_key;
	/* exclusive stop key */
	const reiser4_key *to_key;
	/* if this is not NULL, smallest actually removed key is stored
	 * here. */
	reiser4_key *smallest_removed;
	/* kill_node_content() is called for file truncate */
	int truncate;
};

struct carry_cut_data {
	struct cut_kill_params params;
};

struct carry_kill_data {
	struct cut_kill_params params;
	/* parameter to be passed to the ->kill_hook() method of item
	 * plugin */
	/*void *iplug_params; *//* FIXME: unused currently */
	/* if not NULL---inode whose items are being removed. This is needed
	 * for ->kill_hook() of extent item to update VM structures when
	 * removing pages. */
	struct inode *inode;
	/* sibling list maintenance is complicated by existence of eottl. When
	 * eottl whose left and right neighbors are formatted leaves is
	 * removed, one has to connect said leaves in the sibling list. This
	 * cannot be done when extent removal is just started as locking rules
	 * require sibling list update to happen atomically with removal of
	 * extent item. Therefore: 1. pointers to left and right neighbors
	 * have to be passed down to the ->kill_hook() of extent item, and
	 * 2. said neighbors have to be locked. */
	lock_handle *left;
	lock_handle *right;
	/* flags modifying behavior of kill. Currently, it may have
	 * DELETE_RETAIN_EMPTY set. */
	unsigned flags;
	/* scratch buffer; exact use is not visible in this header */
	char *buf;
};
4325+
/* &carry_tree_op - operation to "carry" upward.

   Description of an operation we want to "carry" to the upper level of
   a tree: e.g, when we insert something and there is not enough space
   we allocate a new node and "carry" the operation of inserting a
   pointer to the new node to the upper level, on removal of empty node,
   we carry up operation of removing appropriate entry from parent.

   There are two types of carry ops: when adding or deleting node, the
   node at the parent level where appropriate modification has to be
   performed is known in advance. When shifting items between nodes
   (split, merge), delimiting key should be changed in the least common
   parent of the nodes involved that is not known in advance.

   For the operations of the first type we store in &carry_op pointer to
   the &carry_node at the parent level. For the operation of the second
   type we store &carry_node of parents of the left and right nodes
   modified and keep track of them upward until they coincide.

*/
typedef struct carry_op {
	/* pool linkage */
	reiser4_pool_header header;
	/* operation code; see carry_opcode */
	carry_opcode op;
	/* node on which operation is to be performed:

	   for insert, paste: node where new item is to be inserted

	   for delete: node where pointer is to be deleted

	   for cut: node to cut from

	   for update: node where delimiting key is to be modified

	   for modify: parent of modified node

	 */
	carry_node *node;
	/* opcode-specific parameters; only the member matching ->op is
	   valid */
	union {
		struct {
			/* (sub-)type of insertion/paste. Taken from
			   cop_insert_pos_type. */
			__u8 type;
			/* various operation flags. Taken from
			   cop_insert_flag. */
			__u8 flags;
			/* coord, item data and key of the new item */
			carry_insert_data *d;
			/* carry node of the child being inserted (used with
			   COPT_CHILD; see add_new_znode()) */
			carry_node *child;
			/* existing left neighbor of the new node */
			znode *brother;
		} insert, paste, extent;

		struct {
			/* non-0 for cut, 0 for kill; selects the union
			   member below */
			int is_cut;
			union {
				carry_kill_data *kill;
				carry_cut_data *cut;
			} u;
		} cut_or_kill;

		struct {
			/* left node of the pair whose delimiting key is to
			   be updated (see COP_UPDATE) */
			carry_node *left;
		} update;
		struct {
			/* changed child */
			carry_node *child;
			/* bitmask of changes. See &cop_modify_flag */
			__u32 flag;
		} modify;
		struct {
			/* flags to deletion operation. Are taken from
			   cop_delete_flag */
			__u32 flags;
			/* child to delete from parent. If this is
			   NULL, delete op->node. */
			carry_node *child;
		} delete;
		struct {
			/* various operation flags. Taken from
			   cop_insert_flag. */
			__u32 flags;
			flow_t *flow;
			coord_t *insert_point;
			reiser4_item_data *data;
			/* flow insertion is limited by number of new blocks
			   added in that operation which do not get any data
			   but part of flow. This limit is set by macro
			   CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
			   of nodes added already during one carry_flow */
			int new_nodes;
		} insert_flow;
	} u;
} carry_op;

/* &carry_op_pool - preallocated pool of carry operations, and nodes */
typedef struct carry_pool {
	/* preallocated carry operations and their pool descriptor */
	carry_op op[CARRIES_POOL_SIZE];
	reiser4_pool op_pool;
	/* preallocated carry nodes and their pool descriptor */
	carry_node node[NODES_LOCKED_POOL_SIZE];
	reiser4_pool node_pool;
} carry_pool;
4426+
/* &carry_tree_level - carry process on given level

   Description of balancing process on the given level.

   No need for locking here, as carry_tree_level is essentially per
   thread thing (for now).

*/
struct carry_level {
	/* this level may be restarted */
	__u32 restartable:1;
	/* list of carry nodes on this level, ordered by key order */
	struct list_head nodes;
	/* list of carry operations queued for this level */
	struct list_head ops;
	/* pool where new objects are allocated from */
	carry_pool *pool;
	/* presumably the number of entries on ->ops (maintained in
	   carry.c — confirm) */
	int ops_num;
	/* presumably the number of entries on ->nodes (maintained in
	   carry.c — confirm) */
	int nodes_num;
	/* new root created on this level, if any */
	znode *new_root;
	/* This is set by caller (insert_by_key(), reiser4_resize_item(),
	   etc.) when they want ->tracked to automagically wander to the node
	   where insertion point moved after insert or paste.
	 */
	carry_track_type track_type;
	/* lock handle supplied by user that we are tracking. See
	   above. */
	lock_handle *tracked;
};

/* information carry passes to plugin methods that may add new operations to
   the @todo queue */
struct carry_plugin_info {
	/* level being processed now */
	carry_level *doing;
	/* level where new operations are to be posted */
	carry_level *todo;
};
4463+
4464+int reiser4_carry(carry_level * doing, carry_level * done);
4465+
4466+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
4467+ carry_node * reference);
4468+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4469+ carry_node * reference);
4470+
4471+extern carry_node *insert_carry_node(carry_level * doing,
4472+ carry_level * todo, const znode * node);
4473+
4474+extern carry_pool *init_carry_pool(int);
4475+extern void done_carry_pool(carry_pool * pool);
4476+
4477+extern void init_carry_level(carry_level * level, carry_pool * pool);
4478+
4479+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4480+ znode * node, int apply_to_parent);
4481+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4482+ znode * node, int apply_to_parent_p);
4483+
4484+carry_node *add_new_znode(znode * brother, carry_node * reference,
4485+ carry_level * doing, carry_level * todo);
4486+
4487+carry_node *find_carry_node(carry_level * level, const znode * node);
4488+
4489+extern znode *reiser4_carry_real(const carry_node * node);
4490+
/* helper macros to iterate over carry queues */

/* carry node following @node in its level list */
#define carry_node_next( node ) \
	list_entry((node)->header.level_linkage.next, carry_node, \
		   header.level_linkage)

/* carry node preceding @node in its level list */
#define carry_node_prev( node ) \
	list_entry((node)->header.level_linkage.prev, carry_node, \
		   header.level_linkage)

/* first carry node of @level */
#define carry_node_front( level ) \
	list_entry((level)->nodes.next, carry_node, header.level_linkage)

/* last carry node of @level */
#define carry_node_back( level ) \
	list_entry((level)->nodes.prev, carry_node, header.level_linkage)

/* non-zero when iteration over @level's nodes reached the list head */
#define carry_node_end( level, node ) \
	(&(level)->nodes == &(node)->header.level_linkage)

/* macro to iterate over all operations in a @level.
   (A dead "#if 0" remnant of the pre-list_head iterator was removed.) */
#define for_all_ops( level /* carry level (of type carry_level *) */, \
		     op /* pointer to carry operation, modified by loop (of \
			 * type carry_op *) */, \
		     tmp /* pointer to carry operation (of type carry_op *), \
			  * used to make iterator stable in the face of \
			  * deletions from the level */ ) \
for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
     tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
     &op->header.level_linkage != &level->ops; \
     op = tmp, \
     tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4529+
/* macro to iterate over all nodes in a @level.
   (Fixed: a stray trailing backslash after this comment used to splice it
   onto the #define line; a dead "#if 0" remnant of the pre-list_head
   iterator was removed.) */
#define for_all_nodes( level /* carry level (of type carry_level *) */, \
		       node /* pointer to carry node, modified by loop (of \
			     * type carry_node *) */, \
		       tmp /* pointer to carry node (of type carry_node *), \
			    * used to make iterator stable in the face of \
			    * deletions from the level */ ) \
for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
     &node->header.level_linkage != &level->nodes; \
     node = tmp, \
     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))

/* macro to iterate over all nodes in a @level in reverse order

   This is used, because nodes are unlocked in reversed order of locking */
#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
			    node /* pointer to carry node, modified by loop \
				  * (of type carry_node *) */, \
			    tmp /* pointer to carry node (of type carry_node \
				  * *), used to make iterator stable in the \
				  * face of deletions from the level */ ) \
for( node = carry_node_back( level ), \
     tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
     node = tmp, tmp = carry_node_prev( node ) )
4561+
4562+/* __FS_REISER4_CARRY_H__ */
4563+#endif
4564+
4565+/* Make Linus happy.
4566+ Local variables:
4567+ c-indentation-style: "K&R"
4568+ mode-name: "LC"
4569+ c-basic-offset: 8
4570+ tab-width: 8
4571+ fill-column: 120
4572+ scroll-step: 1
4573+ End:
4574+*/
4575diff --git a/fs/reiser4/carry_ops.c b/fs/reiser4/carry_ops.c
4576new file mode 100644
4577index 0000000..8ce8e95
4578--- /dev/null
4579+++ b/fs/reiser4/carry_ops.c
4580@@ -0,0 +1,2131 @@
4581+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4582+
4583+/* implementation of carry operations */
4584+
4585+#include "forward.h"
4586+#include "debug.h"
4587+#include "key.h"
4588+#include "coord.h"
4589+#include "plugin/item/item.h"
4590+#include "plugin/node/node.h"
4591+#include "jnode.h"
4592+#include "znode.h"
4593+#include "block_alloc.h"
4594+#include "tree_walk.h"
4595+#include "pool.h"
4596+#include "tree_mod.h"
4597+#include "carry.h"
4598+#include "carry_ops.h"
4599+#include "tree.h"
4600+#include "super.h"
4601+#include "reiser4.h"
4602+
4603+#include <linux/types.h>
4604+#include <linux/err.h>
4605+
4606+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4607+ carry_level * doing, carry_level * todo,
4608+ unsigned int including_insert_coord_p);
4609+
4610+extern int lock_carry_node(carry_level * level, carry_node * node);
4611+extern int lock_carry_node_tail(carry_node * node);
4612+
4613+/* find left neighbor of a carry node
4614+
4615+   Look for left neighbor of @node and add it to the @doing queue. See
4616+   comments in the body.
4617+
4618+*/
4619+static carry_node *find_left_neighbor(carry_op * op	/* node to find left
4620+							 * neighbor of */ ,
4621+				      carry_level * doing /* level to scan */ )
4622+{
4623+	int result;
4624+	carry_node *node;
4625+	carry_node *left;
4626+	int flags;
4627+	reiser4_tree *tree;
4628+
4629+	node = op->node;
4630+
4631+	tree = current_tree;
4632+	read_lock_tree(tree);
4633+	/* first, check whether left neighbor is already in a @doing queue */
4634+	if (reiser4_carry_real(node)->left != NULL) {
4635+		/* NOTE: there is locking subtlety here. Look into
4636+		 * find_right_neighbor() for more info */
4637+		if (find_carry_node(doing,
4638+				    reiser4_carry_real(node)->left) != NULL) {
4639+			read_unlock_tree(tree);
4640+			left = node;
4641+			do {
4642+				left = list_entry(left->header.level_linkage.prev,
4643+						  carry_node, header.level_linkage);
4644+				assert("nikita-3408", !carry_node_end(doing,
4645+								      left));
4646+			} while (reiser4_carry_real(left) ==
4647+				 reiser4_carry_real(node));
4648+			return left;
4649+		}
4650+	}
4651+	read_unlock_tree(tree);
4652+
4653+	left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4654+	if (IS_ERR(left))
4655+		return left;
4656+
4657+	left->node = node->node;
4658+	left->free = 1;
4659+
4660+	flags = GN_TRY_LOCK;
4661+	if (!(op->u.insert.flags & COPI_LOAD_LEFT)) /* fix: "!" binds before "&" */
4662+		flags |= GN_NO_ALLOC;
4663+
4664+	/* then, feeling lucky, peek left neighbor in the cache. */
4665+	result = reiser4_get_left_neighbor(&left->lock_handle,
4666+					   reiser4_carry_real(node),
4667+					   ZNODE_WRITE_LOCK, flags);
4668+	if (result == 0) {
4669+		/* ok, node found and locked. */
4670+		result = lock_carry_node_tail(left);
4671+		if (result != 0)
4672+			left = ERR_PTR(result);
4673+	} else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4674+		/* node is leftmost node in a tree, or neighbor wasn't in
4675+		   cache, or there is an extent on the left. */
4676+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
4677+		left = NULL;
4678+	} else if (doing->restartable) {
4679+		/* if left neighbor is locked, and level is restartable, add
4680+		   new node to @doing and restart. */
4681+		assert("nikita-913", node->parent != 0);
4682+		assert("nikita-914", node->node != NULL);
4683+		left->left = 1;
4684+		left->free = 0;
4685+		left = ERR_PTR(-E_REPEAT);
4686+	} else {
4687+		/* left neighbor is locked, level cannot be restarted. Just
4688+		   ignore left neighbor. */
4689+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
4690+		left = NULL;
4691+	}
4692+	return left;
4693+}
4694+
4695+/* find right neighbor of a carry node
4696+
4697+   Look for right neighbor of @node and add it to the @doing queue. See
4698+   comments in the body.
4699+
4700+*/
4701+static carry_node *find_right_neighbor(carry_op * op	/* node to find right
4702+							 * neighbor of */ ,
4703+				       carry_level * doing /* level to scan */ )
4704+{
4705+	int result;
4706+	carry_node *node;
4707+	carry_node *right;
4708+	lock_handle lh;
4709+	int flags;
4710+	reiser4_tree *tree;
4711+
4712+	init_lh(&lh);
4713+
4714+	node = op->node;
4715+
4716+	tree = current_tree;
4717+	read_lock_tree(tree);
4718+	/* first, check whether right neighbor is already in a @doing queue */
4719+	if (reiser4_carry_real(node)->right != NULL) {
4720+		/*
4721+		 * Tree lock is taken here anyway, because, even if _outcome_
4722+		 * of (find_carry_node() != NULL) doesn't depends on
4723+		 * concurrent updates to ->right, find_carry_node() cannot
4724+		 * work with second argument NULL. Hence, following comment is
4725+		 * of historic importance only.
4726+		 *
4727+		 * Subtle:
4728+		 *
4729+		 * Q: why don't we need tree lock here, looking for the right
4730+		 * neighbor?
4731+		 *
4732+		 * A: even if value of node->real_node->right were changed
4733+		 * during find_carry_node() execution, outcome of execution
4734+		 * wouldn't change, because (in short) other thread cannot add
4735+		 * elements to the @doing, and if node->real_node->right
4736+		 * already was in @doing, value of node->real_node->right
4737+		 * couldn't change, because node cannot be inserted between
4738+		 * locked neighbors.
4739+		 */
4740+		if (find_carry_node(doing,
4741+				    reiser4_carry_real(node)->right) != NULL) {
4742+			read_unlock_tree(tree);
4743+			/*
4744+			 * What we are doing here (this is also applicable to
4745+			 * the find_left_neighbor()).
4746+			 *
4747+			 * tree_walk.c code requires that insertion of a
4748+			 * pointer to a child, modification of parent pointer
4749+			 * in the child, and insertion of the child into
4750+			 * sibling list are atomic (see
4751+			 * plugin/item/internal.c:create_hook_internal()).
4752+			 *
4753+			 * carry allocates new node long before pointer to it
4754+			 * is inserted into parent and, actually, long before
4755+			 * parent is even known. Such allocated-but-orphaned
4756+			 * nodes are only trackable through carry level lists.
4757+			 *
4758+			 * Situation that is handled here is following: @node
4759+			 * has valid ->right pointer, but there is
4760+			 * allocated-but-orphaned node in the carry queue that
4761+			 * is logically between @node and @node->right. Here
4762+			 * we are searching for it. Critical point is that
4763+			 * this is only possible if @node->right is also in
4764+			 * the carry queue (this is checked above), because
4765+			 * this is the only way new orphaned node could be
4766+			 * inserted between them (before inserting new node,
4767+			 * make_space() first tries to shift to the right, so,
4768+			 * right neighbor will be locked and queued).
4769+			 *
4770+			 */
4771+			right = node;
4772+			do {
4773+				right = list_entry(right->header.level_linkage.next,
4774+						   carry_node, header.level_linkage);
4775+				assert("nikita-3408", !carry_node_end(doing,
4776+								      right));
4777+			} while (reiser4_carry_real(right) ==
4778+				 reiser4_carry_real(node));
4779+			return right;
4780+		}
4781+	}
4782+	read_unlock_tree(tree);
4783+
4784+	flags = GN_CAN_USE_UPPER_LEVELS;
4785+	if (!(op->u.insert.flags & COPI_LOAD_RIGHT)) /* fix: "!" binds before "&" */
4786+		flags = GN_NO_ALLOC; /* NOTE(review): '=' (not '|=') drops GN_CAN_USE_UPPER_LEVELS, unlike the left-side path -- confirm intended */
4787+
4788+	/* then, try to lock right neighbor */
4789+	init_lh(&lh);
4790+	result = reiser4_get_right_neighbor(&lh,
4791+					    reiser4_carry_real(node),
4792+					    ZNODE_WRITE_LOCK, flags);
4793+	if (result == 0) {
4794+		/* ok, node found and locked. */
4795+		right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4796+		if (!IS_ERR(right)) {
4797+			right->node = lh.node;
4798+			move_lh(&right->lock_handle, &lh);
4799+			right->free = 1;
4800+			result = lock_carry_node_tail(right);
4801+			if (result != 0)
4802+				right = ERR_PTR(result);
4803+		}
4804+	} else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4805+		/* node is rightmost node in a tree, or neighbor wasn't in
4806+		   cache, or there is an extent on the right. */
4807+		right = NULL;
4808+	} else
4809+		right = ERR_PTR(result);
4810+	done_lh(&lh);
4811+	return right;
4812+}
4813+
4814+/* how much free space in a @node is needed for @op
4815+
4816+   How much space in @node is required for completion of @op, where @op is
4817+   insert or paste operation.
4818+*/
4819+static unsigned int space_needed_for_op(znode * node	/* znode data are
4820+							 * inserted or
4821+							 * pasted in */ ,
4822+					carry_op * op	/* carry
4823+							   operation */ )
4824+{
4825+	assert("nikita-919", op != NULL);
4826+
4827+	switch (op->op) {
4828+	default:
4829+		impossible("nikita-1701", "Wrong opcode"); /* fall through */
4830+	case COP_INSERT:
4831+		return space_needed(node, NULL, op->u.insert.d->data, 1);
4832+	case COP_PASTE:
4833+		return space_needed(node, op->u.insert.d->coord,
4834+				    op->u.insert.d->data, 0);
4835+	}
4836+}
4837+
4838+/* how much space in @node is required to insert or paste @data at
4839+   @coord; includes node's per-item overhead for insertions (in bytes) */
4840+unsigned int space_needed(const znode * node	/* node data are inserted or
4841+						 * pasted in */ ,
4842+			  const coord_t * coord	/* coord where data are
4843+						 * inserted or pasted
4844+						 * at */ ,
4845+			  const reiser4_item_data * data	/* data to insert or
4846+								 * paste */ ,
4847+			  int insertion /* non-0 is inserting, 0---paste */ )
4848+{
4849+	int result;
4850+	item_plugin *iplug;
4851+
4852+	assert("nikita-917", node != NULL);
4853+	assert("nikita-918", node_plugin_by_node(node) != NULL);
4854+	assert("vs-230", !insertion || (coord == NULL));
4855+
4856+	result = 0;
4857+	iplug = data->iplug;
4858+	if (iplug->b.estimate != NULL) {
4859+		/* ask item plugin how much space is needed to insert this
4860+		   item */
4861+		result += iplug->b.estimate(insertion ? NULL : coord, data);
4862+	} else {
4863+		/* reasonable default */
4864+		result += data->length;
4865+	}
4866+	if (insertion) {
4867+		node_plugin *nplug;
4868+
4869+		nplug = node->nplug;
4870+		/* and add node overhead */
4871+		if (nplug->item_overhead != NULL) {
4872+			result += nplug->item_overhead(node, NULL);
4873+		}
4874+	}
4875+	return result;
4876+}
4877+
4878+/* find &coord in parent where pointer to new child is to be stored. */
4879+static int find_new_child_coord(carry_op * op	/* COP_INSERT carry operation to
4880+						 * insert pointer to new
4881+						 * child */ )
4882+{
4883+	int result;
4884+	znode *node;
4885+	znode *child;
4886+
4887+	assert("nikita-941", op != NULL);
4888+	assert("nikita-942", op->op == COP_INSERT);
4889+
4890+	node = reiser4_carry_real(op->node);
4891+	assert("nikita-943", node != NULL);
4892+	assert("nikita-944", node_plugin_by_node(node) != NULL);
4893+
4894+	child = reiser4_carry_real(op->u.insert.child);
4895+	result =
4896+	    find_new_child_ptr(node, child, op->u.insert.brother,
4897+			       op->u.insert.d->coord);
4898+
4899+	build_child_ptr_data(child, op->u.insert.d->data); /* fill item data describing the pointer to @child */
4900+	return result;
4901+}
4902+
4903+/* additional amount of free space in @node required to complete @op */
4904+static int free_space_shortage(znode * node /* node to check */ ,
4905+			       carry_op * op /* operation being performed */ )
4906+{
4907+	assert("nikita-1061", node != NULL);
4908+	assert("nikita-1062", op != NULL);
4909+
4910+	switch (op->op) {
4911+	default:
4912+		impossible("nikita-1702", "Wrong opcode"); /* fall through */
4913+	case COP_INSERT:
4914+	case COP_PASTE:
4915+		return space_needed_for_op(node, op) - znode_free_space(node);
4916+	case COP_EXTENT:
4917+		/* when inserting extent shift data around until insertion
4918+		   point is utmost in the node. */
4919+		if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4920+			return +1;	/* shortage: keep shifting */
4921+		else
4922+			return -1;	/* done: insertion point already utmost */
4923+	}
4924+}
4925+
4926+/* helper function: update node pointer in operation after insertion
4927+   point was probably shifted into @target. */
4928+static znode *sync_op(carry_op * op, carry_node * target)
4929+{
4930+	znode *insertion_node;
4931+
4932+	/* reget node from coord: shift might move insertion coord to
4933+	   the neighbor */
4934+	insertion_node = op->u.insert.d->coord->node;
4935+	/* if insertion point was actually moved into new node,
4936+	   update carry node pointer in operation. */
4937+	if (insertion_node != reiser4_carry_real(op->node)) {
4938+		op->node = target;
4939+		assert("nikita-2540",
4940+		       reiser4_carry_real(target) == insertion_node);
4941+	}
4942+	assert("nikita-2541",
4943+	       reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4944+	return insertion_node;	/* node now holding the insertion point */
4945+}
4946+
4947+/*
4948+ * complete make_space() call: update tracked lock handle if necessary. See
4949+ * comments for fs/reiser4/carry.h:carry_track_type
4950+ */
4951+static int
4952+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4953+{
4954+	int result;
4955+	carry_track_type tracking;
4956+	znode *node;
4957+
4958+	tracking = doing->track_type;
4959+	node = op->u.insert.d->coord->node;
4960+
4961+	if (tracking == CARRY_TRACK_NODE ||
4962+	    (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4963+		/* inserting or pasting into node different from
4964+		   original. Update lock handle supplied by caller. */
4965+		assert("nikita-1417", doing->tracked != NULL);
4966+		done_lh(doing->tracked);	/* swap tracked lock from old node to @node */
4967+		init_lh(doing->tracked);
4968+		result = longterm_lock_znode(doing->tracked, node,
4969+					     ZNODE_WRITE_LOCK,
4970+					     ZNODE_LOCK_HIPRI);
4971+	} else
4972+		result = 0;
4973+	return result;
4974+}
4975+
4976+/* This is insertion policy function. It shifts data to the left and right
4977+   neighbors of insertion coord and allocates new nodes until there is enough
4978+   free space to complete @op.
4979+
4980+   See comments in the body.
4981+
4982+   Assumes that the node format favors insertions at the right end of the node
4983+   as node40 does.
4984+
4985+   See carry_flow() on detail about flow insertion
4986+*/
4987+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4988+		      carry_level * doing /* current carry queue */ ,
4989+		      carry_level * todo /* carry queue on the parent level */ )
4990+{
4991+	znode *node;
4992+	int result;
4993+	int not_enough_space;
4994+	int blk_alloc;
4995+	znode *orig_node;
4996+	__u32 flags;
4997+
4998+	coord_t *coord;
4999+
5000+	assert("nikita-890", op != NULL);
5001+	assert("nikita-891", todo != NULL);
5002+	assert("nikita-892",
5003+	       op->op == COP_INSERT ||
5004+	       op->op == COP_PASTE || op->op == COP_EXTENT);
5005+	assert("nikita-1607",
5006+	       reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
5007+
5008+	flags = op->u.insert.flags;
5009+
5010+	/* NOTE check that new node can only be allocated after checking left
5011+	 * and right neighbors. This is necessary for proper work of
5012+	 * find_{left,right}_neighbor(). */
5013+	assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
5014+				   flags & COPI_DONT_SHIFT_LEFT));
5015+	assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
5016+				   flags & COPI_DONT_SHIFT_RIGHT));
5017+
5018+	coord = op->u.insert.d->coord;
5019+	orig_node = node = coord->node;
5020+
5021+	assert("nikita-908", node != NULL);
5022+	assert("nikita-909", node_plugin_by_node(node) != NULL);
5023+
5024+	result = 0;
5025+	/* If there is not enough space in a node, try to shift something to
5026+	   the left neighbor. This is a bit tricky, as locking to the left is
5027+	   low priority. This is handled by restart logic in carry().
5028+	 */
5029+	not_enough_space = free_space_shortage(node, op);
5030+	if (not_enough_space <= 0)
5031+		/* it is possible that carry was called when there actually
5032+		   was enough space in the node. For example, when inserting
5033+		   leftmost item so that delimiting keys have to be updated.
5034+		 */
5035+		return make_space_tail(op, doing, orig_node);
5036+	if (!(flags & COPI_DONT_SHIFT_LEFT)) {
5037+		carry_node *left;
5038+		/* make note in statistics of an attempt to move
5039+		   something into the left neighbor */
5040+		left = find_left_neighbor(op, doing);
5041+		if (unlikely(IS_ERR(left))) {
5042+			if (PTR_ERR(left) == -E_REPEAT)
5043+				return -E_REPEAT;
5044+			else {
5045+				/* some error other than restart request
5046+				   occurred. This shouldn't happen. Issue a
5047+				   warning and continue as if left neighbor
5048+				   weren't existing.
5049+				 */
5050+				warning("nikita-924",
5051+					"Error accessing left neighbor: %li",
5052+					PTR_ERR(left));
5053+			}
5054+		} else if (left != NULL) {
5055+
5056+			/* shift everything possible on the left of and
5057+			   including insertion coord into the left neighbor */
5058+			result = carry_shift_data(LEFT_SIDE, coord,
5059+						  reiser4_carry_real(left),
5060+						  doing, todo,
5061+						  flags & COPI_GO_LEFT);
5062+
5063+			/* reget node from coord: shift_left() might move
5064+			   insertion coord to the left neighbor */
5065+			node = sync_op(op, left);
5066+
5067+			not_enough_space = free_space_shortage(node, op);
5068+			/* There is not enough free space in @node, but
5069+			   may be, there is enough free space in
5070+			   @left. Various balancing decisions are valid here.
5071+			   The same for the shifiting to the right.
5072+			 */
5073+		}
5074+	}
5075+	/* If there still is not enough space, shift to the right */
5076+	if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
5077+		carry_node *right;
5078+
5079+		right = find_right_neighbor(op, doing);
5080+		if (IS_ERR(right)) {
5081+			warning("nikita-1065",
5082+				"Error accessing right neighbor: %li",
5083+				PTR_ERR(right));
5084+		} else if (right != NULL) {
5085+			/* node containing insertion point, and its right
5086+			   neighbor node are write locked by now.
5087+
5088+			   shift everything possible on the right of but
5089+			   excluding insertion coord into the right neighbor
5090+			 */
5091+			result = carry_shift_data(RIGHT_SIDE, coord,
5092+						  reiser4_carry_real(right),
5093+						  doing, todo,
5094+						  flags & COPI_GO_RIGHT);
5095+			/* reget node from coord: shift_right() might move
5096+			   insertion coord to the right neighbor */
5097+			node = sync_op(op, right);
5098+			not_enough_space = free_space_shortage(node, op);
5099+		}
5100+	}
5101+	/* If there is still not enough space, allocate new node(s).
5102+
5103+	   We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
5104+	   the carry operation flags (currently this is needed during flush
5105+	   only).
5106+	 */
5107+	for (blk_alloc = 0;
5108+	     not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
5109+	     !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) { /* at most two fresh nodes per attempt */
5110+		carry_node *fresh;	/* new node we are allocating */
5111+		coord_t coord_shadow;	/* remembered insertion point before
5112+					 * shifting data into new node */
5113+		carry_node *node_shadow;	/* remembered insertion node before
5114+						 * shifting */
5115+		unsigned int gointo;	/* whether insertion point should move
5116+					 * into newly allocated node */
5117+
5118+		/* allocate new node on the right of @node. Znode and disk
5119+		   fake block number for new node are allocated.
5120+
5121+		   add_new_znode() posts carry operation COP_INSERT with
5122+		   COPT_CHILD option to the parent level to add
5123+		   pointer to newly created node to its parent.
5124+
5125+		   Subtle point: if several new nodes are required to complete
5126+		   insertion operation at this level, they will be inserted
5127+		   into their parents in the order of creation, which means
5128+		   that @node will be valid "cookie" at the time of insertion.
5129+
5130+		 */
5131+		fresh = add_new_znode(node, op->node, doing, todo);
5132+		if (IS_ERR(fresh))
5133+			return PTR_ERR(fresh);
5134+
5135+		/* Try to shift into new node. */
5136+		result = lock_carry_node(doing, fresh);
5137+		zput(reiser4_carry_real(fresh));
5138+		if (result != 0) {
5139+			warning("nikita-947",
5140+				"Cannot lock new node: %i", result);
5141+			return result;
5142+		}
5143+
5144+		/* both nodes are write locked by now.
5145+
5146+		   shift everything possible on the right of and
5147+		   including insertion coord into the right neighbor.
5148+		 */
5149+		coord_dup(&coord_shadow, op->u.insert.d->coord);
5150+		node_shadow = op->node;
5151+		/* move insertion point into newly created node if:
5152+
5153+		   . insertion point is rightmost in the source node, or
5154+		   . this is not the first node we are allocating in a row.
5155+		 */
5156+		gointo =
5157+		    (blk_alloc > 0) ||
5158+		    coord_is_after_rightmost(op->u.insert.d->coord);
5159+
5160+		if (gointo &&
5161+		    op->op == COP_PASTE &&
5162+		    coord_is_existing_item(op->u.insert.d->coord) &&
5163+		    is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
5164+			/* paste into solid (atomic) item, which can contain
5165+			   only one unit, so we need to shift it right, where
5166+			   insertion point supposed to be */
5167+
5168+			assert("edward-1444", op->u.insert.d->data->iplug ==
5169+			       item_plugin_by_id(STATIC_STAT_DATA_ID));
5170+			assert("edward-1445",
5171+			       op->u.insert.d->data->length >
5172+			       node_plugin_by_node(coord->node)->free_space
5173+			       (coord->node));
5174+
5175+			op->u.insert.d->coord->between = BEFORE_UNIT;
5176+		}
5177+
5178+		result = carry_shift_data(RIGHT_SIDE, coord,
5179+					  reiser4_carry_real(fresh),
5180+					  doing, todo, gointo);
5181+		/* if insertion point was actually moved into new node,
5182+		   update carry node pointer in operation. */
5183+		node = sync_op(op, fresh);
5184+		not_enough_space = free_space_shortage(node, op);
5185+		if ((not_enough_space > 0) && (node != coord_shadow.node)) {
5186+			/* there is not enough free in new node. Shift
5187+			   insertion point back to the @shadow_node so that
5188+			   next new node would be inserted between
5189+			   @shadow_node and @fresh.
5190+			 */
5191+			coord_normalize(&coord_shadow);
5192+			coord_dup(coord, &coord_shadow);
5193+			node = coord->node;
5194+			op->node = node_shadow;
5195+			if (1 || (flags & COPI_STEP_BACK)) { /* NOTE(review): "1 ||" makes the COPI_STEP_BACK test dead code -- confirm */
5196+				/* still not enough space?! Maybe there is
5197+				   enough space in the source node (i.e., node
5198+				   data are moved from) now.
5199+				 */
5200+				not_enough_space =
5201+				    free_space_shortage(node, op);
5202+			}
5203+		}
5204+	}
5205+	if (not_enough_space > 0) {
5206+		if (!(flags & COPI_DONT_ALLOCATE))
5207+			warning("nikita-948", "Cannot insert new item");
5208+		result = -E_NODE_FULL;
5209+	}
5210+	assert("nikita-1622", ergo(result == 0,
5211+				   reiser4_carry_real(op->node) == coord->node));
5212+	assert("nikita-2616", coord == op->u.insert.d->coord);
5213+	if (result == 0)
5214+		result = make_space_tail(op, doing, orig_node);
5215+	return result;
5216+}
5217+
5218+/* insert_paste_common() - common part of insert and paste operations
5219+
5220+   This function performs common part of COP_INSERT and COP_PASTE.
5221+
5222+   There are two ways in which insertion/paste can be requested:
5223+
5224+    . by directly supplying reiser4_item_data. In this case, op ->
5225+    u.insert.type is set to COPT_ITEM_DATA.
5226+
5227+    . by supplying child pointer to which is to inserted into parent. In this
5228+    case op -> u.insert.type == COPT_CHILD.
5229+
5230+    . by supplying key of new item/unit. This is currently only used during
5231+    extent insertion
5232+
5233+   This is required, because when new node is allocated we don't know at what
5234+   position pointer to it is to be stored in the parent. Actually, we don't
5235+   even know what its parent will be, because parent can be re-balanced
5236+   concurrently and new node re-parented, and because parent can be full and
5237+   pointer to the new node will go into some other node.
5238+
5239+   insert_paste_common() resolves pointer to child node into position in the
5240+   parent by calling find_new_child_coord(), that fills
5241+   reiser4_item_data. After this, insertion/paste proceeds uniformly.
5242+
5243+   Another complication is with finding free space during pasting. It may
5244+   happen that while shifting items to the neighbors and newly allocated
5245+   nodes, insertion coord can no longer be in the item we wanted to paste
5246+   into. At this point, paste becomes (morphs) into insert. Moreover free
5247+   space analysis has to be repeated, because amount of space required for
5248+   insertion is different from that of paste (item header overhead, etc).
5249+
5250+   This function "unifies" different insertion modes (by resolving child
5251+   pointer or key into insertion coord), and then calls make_space() to free
5252+   enough space in the node by shifting data to the left and right and by
5253+   allocating new nodes if necessary. Carry operation knows amount of space
5254+   required for its completion. After enough free space is obtained, caller of
5255+   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
5256+   by calling item plugin method.
5257+
5258+*/
5259+static int insert_paste_common(carry_op * op	/* carry operation being
5260+						 * performed */ ,
5261+			       carry_level * doing /* current carry level */ ,
5262+			       carry_level * todo /* next carry level */ ,
5263+			       carry_insert_data * cdata	/* pointer to
5264+								 * cdata */ ,
5265+			       coord_t * coord /* insertion/paste coord */ ,
5266+			       reiser4_item_data * data	/* data to be
5267+							 * inserted/pasted */ )
5268+{
5269+	assert("nikita-981", op != NULL);
5270+	assert("nikita-980", todo != NULL);
5271+	assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
5272+	       || (op->op == COP_EXTENT));
5273+
5274+	if (op->u.insert.type == COPT_PASTE_RESTARTED) {
5275+		/* nothing to do. Fall through to make_space(). */
5276+		;
5277+	} else if (op->u.insert.type == COPT_KEY) {
5278+		node_search_result intra_node;
5279+		znode *node;
5280+		/* Problem with doing batching at the lowest level, is that
5281+		   operations here are given by coords where modification is
5282+		   to be performed, and one modification can invalidate coords
5283+		   of all following operations.
5284+
5285+		   So, we are implementing yet another type for operation that
5286+		   will use (the only) "locator" stable across shifting of
5287+		   data between nodes, etc.: key (COPT_KEY).
5288+
5289+		   This clause resolves key to the coord in the node.
5290+
5291+		   But node can change also. Probably some pieces have to be
5292+		   added to the lock_carry_node(), to lock node by its key.
5293+
5294+		 */
5295+		/* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
5296+		   if you need something else. */
5297+		op->u.insert.d->coord = coord;
5298+		node = reiser4_carry_real(op->node);
5299+		intra_node = node_plugin_by_node(node)->lookup
5300+		    (node, op->u.insert.d->key, FIND_EXACT,
5301+		     op->u.insert.d->coord);
5302+		if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
5303+			warning("nikita-1715", "Intra node lookup failure: %i",
5304+				intra_node);
5305+			return intra_node;
5306+		}
5307+	} else if (op->u.insert.type == COPT_CHILD) {
5308+		/* if we are asked to insert pointer to the child into
5309+		   internal node, first convert pointer to the child into
5310+		   coord within parent node.
5311+		 */
5312+		znode *child;
5313+		int result;
5314+
5315+		op->u.insert.d = cdata;
5316+		op->u.insert.d->coord = coord;
5317+		op->u.insert.d->data = data;
5318+		op->u.insert.d->coord->node = reiser4_carry_real(op->node);
5319+		result = find_new_child_coord(op);
5320+		child = reiser4_carry_real(op->u.insert.child);
5321+		if (result != NS_NOT_FOUND) { /* success is "not found": the new pointer must not already exist */
5322+			warning("nikita-993",
5323+				"Cannot find a place for child pointer: %i",
5324+				result);
5325+			return result;
5326+		}
5327+		/* This only happens when we did multiple insertions at
5328+		   the previous level, trying to insert single item and
5329+		   it so happened, that insertion of pointers to all new
5330+		   nodes before this one already caused parent node to
5331+		   split (may be several times).
5332+
5333+		   I am going to come up with better solution.
5334+
5335+		   You are not expected to understand this.
5336+		   -- v6root/usr/sys/ken/slp.c
5337+
5338+		   Basically, what happens here is the following: carry came
5339+		   to the parent level and is about to insert internal item
5340+		   pointing to the child node that it just inserted in the
5341+		   level below. Position where internal item is to be inserted
5342+		   was found by find_new_child_coord() above, but node of the
5343+		   current carry operation (that is, parent node of child
5344+		   inserted on the previous level), was determined earlier in
5345+		   the lock_carry_level/lock_carry_node. It could so happen
5346+		   that other carry operations already performed on the parent
5347+		   level already split parent node, so that insertion point
5348+		   moved into another node. Handle this by creating new carry
5349+		   node for insertion point if necessary.
5350+		 */
5351+		if (reiser4_carry_real(op->node) !=
5352+		    op->u.insert.d->coord->node) {
5353+			pool_ordering direction;
5354+			znode *z1;
5355+			znode *z2;
5356+			reiser4_key k1;
5357+			reiser4_key k2;
5358+
5359+			/*
5360+			 * determine in what direction insertion point
5361+			 * moved. Do this by comparing delimiting keys.
5362+			 */
5363+			z1 = op->u.insert.d->coord->node;
5364+			z2 = reiser4_carry_real(op->node);
5365+			if (keyle(leftmost_key_in_node(z1, &k1),
5366+				  leftmost_key_in_node(z2, &k2)))
5367+				/* insertion point moved to the left */
5368+				direction = POOLO_BEFORE;
5369+			else
5370+				/* insertion point moved to the right */
5371+				direction = POOLO_AFTER;
5372+
5373+			op->node = reiser4_add_carry_skip(doing,
5374+							  direction, op->node);
5375+			if (IS_ERR(op->node))
5376+				return PTR_ERR(op->node);
5377+			op->node->node = op->u.insert.d->coord->node;
5378+			op->node->free = 1;
5379+			result = lock_carry_node(doing, op->node);
5380+			if (result != 0)
5381+				return result;
5382+		}
5383+
5384+		/*
5385+		 * set up key of an item being inserted: we are inserting
5386+		 * internal item and its key is (by the very definition of
5387+		 * search tree) is leftmost key in the child node.
5388+		 */
5389+		write_lock_dk(znode_get_tree(child));
5390+		op->u.insert.d->key = leftmost_key_in_node(child,
5391+							   znode_get_ld_key(child));
5392+		write_unlock_dk(znode_get_tree(child));
5393+		op->u.insert.d->data->arg = op->u.insert.brother;
5394+	} else {
5395+		assert("vs-243", op->u.insert.d->coord != NULL);
5396+		op->u.insert.d->coord->node = reiser4_carry_real(op->node);
5397+	}
5398+
5399+	/* find free space. */
5400+	return make_space(op, doing, todo);
5401+}
5402+
5403+/* handle carry COP_INSERT operation.
5404+
5405+   Insert new item into node. New item can be given in one of two ways:
5406+
5407+   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
5408+   only applicable at the leaf/twig level.
5409+
5410+   - by passing a child node pointer to which is to be inserted by this
5411+   operation.
5412+
5413+*/
5414+static int carry_insert(carry_op * op /* operation to perform */ ,
5415+			carry_level * doing	/* queue of operations @op
5416+						 * is part of */ ,
5417+			carry_level * todo	/* queue where new operations
5418+						 * are accumulated */ )
5419+{
5420+	znode *node;
5421+	carry_insert_data cdata;
5422+	coord_t coord;
5423+	reiser4_item_data data;
5424+	carry_plugin_info info;
5425+	int result;
5426+
5427+	assert("nikita-1036", op != NULL);
5428+	assert("nikita-1037", todo != NULL);
5429+	assert("nikita-1038", op->op == COP_INSERT);
5430+
5431+	coord_init_zero(&coord);
5432+
5433+	/* perform common functionality of insert and paste. */
5434+	result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5435+	if (result != 0)
5436+		return result;
5437+
5438+	node = op->u.insert.d->coord->node;
5439+	assert("nikita-1039", node != NULL);
5440+	assert("nikita-1040", node_plugin_by_node(node) != NULL);
5441+
5442+	assert("nikita-949",
5443+	       space_needed_for_op(node, op) <= znode_free_space(node));
5444+
5445+	/* ask node layout to create new item. */
5446+	info.doing = doing;
5447+	info.todo = todo;
5448+	result = node_plugin_by_node(node)->create_item
5449+	    (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5450+	     &info);
5451+	doing->restartable = 0;	/* node is modified: this carry level can no longer restart */
5452+	znode_make_dirty(node);
5453+
5454+	return result;
5455+}
5456+
5457+/*
5458+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
5459+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree
5460+ * by slicing into multiple items.
5461+ */
5462+
4463+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
4464+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
4465+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
4466+/* bytes the item plugin's estimate() adds beyond the raw data length */
4467+static size_t item_data_overhead(carry_op * op)
4468+{
4469+	if (flow_insert_data(op)->iplug->b.estimate == NULL)
4470+		return 0;
4471+	return (flow_insert_data(op)->iplug->b.
4472+		estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
4473+		flow_insert_data(op)->length);
4474+}
5475+
5476+/* FIXME-VS: this is called several times during one make_flow_for_insertion
5477+ and it will always return the same result. Some optimization could be made
5478+ by calculating this value once at the beginning and passing it around. That
5479+ would reduce some flexibility in future changes
5480+*/
5481+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
4482+static size_t flow_insertion_overhead(carry_op * op) /* node overhead of inserting the flow at the insert point */
4483+{
4484+	znode *node;
4485+	size_t insertion_overhead;
4486+
4487+	node = flow_insert_point(op)->node;
4488+	insertion_overhead = 0;
4489+	if (node->nplug->item_overhead && /* overhead applies only when a new item must be created */
4490+	    !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
4491+		       flow_insert_data(op)))
4492+		insertion_overhead =
4493+		    node->nplug->item_overhead(node, NULL) +
4494+		    item_data_overhead(op);
4495+	return insertion_overhead;
4496+}
5497+
4498+/* how many bytes of flow does fit to the node */
4499+static int what_can_fit_into_node(carry_op * op)
4500+{
4501+	size_t free, overhead;
4502+
4503+	overhead = flow_insertion_overhead(op);
4504+	free = znode_free_space(flow_insert_point(op)->node);
4505+	if (free <= overhead)
4506+		return 0;
4507+	free -= overhead;
4508+	/* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */
4509+	if (free < op->u.insert_flow.flow->length)
4510+		return free;	/* only a prefix of the flow fits */
4511+	return (int)op->u.insert_flow.flow->length;
4512+}
5513+
5514+/* in make_space_for_flow_insertion we need to check either whether whole flow
5515+ fits into a node or whether minimal fraction of flow fits into a node */
5516+static int enough_space_for_whole_flow(carry_op * op)
5517+{
5518+ return (unsigned)what_can_fit_into_node(op) ==
5519+ op->u.insert_flow.flow->length;
5520+}
5521+
5522+#define MIN_FLOW_FRACTION 1
5523+static int enough_space_for_min_flow_fraction(carry_op * op)
5524+{
5525+ assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5526+
5527+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5528+}
5529+
5530+/* this returns 0 if left neighbor was obtained successfully and everything
5531+ upto insertion point including it were shifted and left neighbor still has
5532+ some free space to put minimal fraction of flow into it */
5533+static int
5534+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5535+{
5536+ carry_node *left;
5537+ znode *orig;
5538+
5539+ left = find_left_neighbor(op, doing);
5540+ if (unlikely(IS_ERR(left))) {
5541+ warning("vs-899",
5542+ "make_space_by_shift_left: "
5543+ "error accessing left neighbor: %li", PTR_ERR(left));
5544+ return 1;
5545+ }
5546+ if (left == NULL)
5547+ /* left neighbor either does not exist or is unformatted
5548+ node */
5549+ return 1;
5550+
5551+ orig = flow_insert_point(op)->node;
5552+ /* try to shift content of node @orig from its head upto insert point
5553+ including insertion point into the left neighbor */
5554+ carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5555+ reiser4_carry_real(left), doing, todo,
5556+ 1 /* including insert point */);
5557+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5558+ /* insertion point did not move */
5559+ return 1;
5560+ }
5561+
5562+ /* insertion point is set after last item in the node */
5563+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5564+
5565+ if (!enough_space_for_min_flow_fraction(op)) {
5566+ /* insertion point node does not have enough free space to put
5567+ even minimal portion of flow into it, therefore, move
5568+ insertion point back to orig node (before first item) */
5569+ coord_init_before_first_item(flow_insert_point(op), orig);
5570+ return 1;
5571+ }
5572+
5573+ /* part of flow is to be written to the end of node */
5574+ op->node = left;
5575+ return 0;
5576+}
5577+
5578+/* this returns 0 if right neighbor was obtained successfully and everything to
5579+ the right of insertion point was shifted to it and node got enough free
5580+ space to put minimal fraction of flow into it */
5581+static int
5582+make_space_by_shift_right(carry_op * op, carry_level * doing,
5583+ carry_level * todo)
5584+{
5585+ carry_node *right;
5586+
5587+ right = find_right_neighbor(op, doing);
5588+ if (unlikely(IS_ERR(right))) {
5589+ warning("nikita-1065", "shift_right_excluding_insert_point: "
5590+ "error accessing right neighbor: %li", PTR_ERR(right));
5591+ return 1;
5592+ }
5593+ if (right) {
5594+ /* shift everything possible on the right of but excluding
5595+ insertion coord into the right neighbor */
5596+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5597+ reiser4_carry_real(right), doing, todo,
5598+ 0 /* not including insert point */);
5599+ } else {
5600+ /* right neighbor either does not exist or is unformatted
5601+ node */
5602+ ;
5603+ }
5604+ if (coord_is_after_rightmost(flow_insert_point(op))) {
5605+ if (enough_space_for_min_flow_fraction(op)) {
5606+ /* part of flow is to be written to the end of node */
5607+ return 0;
5608+ }
5609+ }
5610+
5611+ /* new node is to be added if insert point node did not get enough
5612+ space for whole flow */
5613+ return 1;
5614+}
5615+
5616+/* this returns 0 when insert coord is set at the node end and fraction of flow
5617+ fits into that node */
5618+static int
5619+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5620+{
5621+ int result;
5622+ znode *node;
5623+ carry_node *new;
5624+
5625+ node = flow_insert_point(op)->node;
5626+
5627+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5628+ return RETERR(-E_NODE_FULL);
5629+ /* add new node after insert point node */
5630+ new = add_new_znode(node, op->node, doing, todo);
5631+ if (unlikely(IS_ERR(new))) {
5632+ return PTR_ERR(new);
5633+ }
5634+ result = lock_carry_node(doing, new);
5635+ zput(reiser4_carry_real(new));
5636+ if (unlikely(result)) {
5637+ return result;
5638+ }
5639+ op->u.insert_flow.new_nodes++;
5640+ if (!coord_is_after_rightmost(flow_insert_point(op))) {
5641+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5642+ reiser4_carry_real(new), doing, todo,
5643+ 0 /* not including insert point */);
5644+ assert("vs-901",
5645+ coord_is_after_rightmost(flow_insert_point(op)));
5646+
5647+ if (enough_space_for_min_flow_fraction(op)) {
5648+ return 0;
5649+ }
5650+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5651+ return RETERR(-E_NODE_FULL);
5652+
5653+ /* add one more new node */
5654+ new = add_new_znode(node, op->node, doing, todo);
5655+ if (unlikely(IS_ERR(new))) {
5656+ return PTR_ERR(new);
5657+ }
5658+ result = lock_carry_node(doing, new);
5659+ zput(reiser4_carry_real(new));
5660+ if (unlikely(result)) {
5661+ return result;
5662+ }
5663+ op->u.insert_flow.new_nodes++;
5664+ }
5665+
5666+ /* move insertion point to new node */
5667+ coord_init_before_first_item(flow_insert_point(op),
5668+ reiser4_carry_real(new));
5669+ op->node = new;
5670+ return 0;
5671+}
5672+
5673+static int
5674+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5675+ carry_level * todo)
5676+{
5677+ __u32 flags = op->u.insert_flow.flags;
5678+
5679+ if (enough_space_for_whole_flow(op)) {
5680+ /* whole flow fits into insert point node */
5681+ return 0;
5682+ }
5683+
5684+ if (!(flags & COPI_DONT_SHIFT_LEFT)
5685+ && (make_space_by_shift_left(op, doing, todo) == 0)) {
5686+ /* insert point is shifted to left neighbor of original insert
5687+ point node and is set after last unit in that node. It has
5688+ enough space to fit at least minimal fraction of flow. */
5689+ return 0;
5690+ }
5691+
5692+ if (enough_space_for_whole_flow(op)) {
5693+ /* whole flow fits into insert point node */
5694+ return 0;
5695+ }
5696+
5697+ if (!(flags & COPI_DONT_SHIFT_RIGHT)
5698+ && (make_space_by_shift_right(op, doing, todo) == 0)) {
5699+ /* insert point is still set to the same node, but there is
5700+ nothing to the right of insert point. */
5701+ return 0;
5702+ }
5703+
5704+ if (enough_space_for_whole_flow(op)) {
5705+ /* whole flow fits into insert point node */
5706+ return 0;
5707+ }
5708+
5709+ return make_space_by_new_nodes(op, doing, todo);
5710+}
5711+
5712+/* implements COP_INSERT_FLOW operation */
5713+static int
5714+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5715+{
5716+ int result;
5717+ flow_t *f;
5718+ coord_t *insert_point;
5719+ node_plugin *nplug;
5720+ carry_plugin_info info;
5721+ znode *orig_node;
5722+ lock_handle *orig_lh;
5723+
5724+ f = op->u.insert_flow.flow;
5725+ result = 0;
5726+
5727+ /* carry system needs this to work */
5728+ info.doing = doing;
5729+ info.todo = todo;
5730+
5731+ orig_node = flow_insert_point(op)->node;
5732+ orig_lh = doing->tracked;
5733+
5734+ while (f->length) {
5735+ result = make_space_for_flow_insertion(op, doing, todo);
5736+ if (result)
5737+ break;
5738+
5739+ insert_point = flow_insert_point(op);
5740+ nplug = node_plugin_by_node(insert_point->node);
5741+
5742+ /* compose item data for insertion/pasting */
5743+ flow_insert_data(op)->data = f->data;
5744+ flow_insert_data(op)->length = what_can_fit_into_node(op);
5745+
5746+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5747+ /* insert point is set to item of file we are writing to and we have to append to it */
5748+ assert("vs-903", insert_point->between == AFTER_UNIT);
5749+ nplug->change_item_size(insert_point,
5750+ flow_insert_data(op)->length);
5751+ flow_insert_data(op)->iplug->b.paste(insert_point,
5752+ flow_insert_data
5753+ (op), &info);
5754+ } else {
5755+ /* new item must be inserted */
5756+ pos_in_node_t new_pos;
5757+ flow_insert_data(op)->length += item_data_overhead(op);
5758+
5759+ /* FIXME-VS: this is because node40_create_item changes
5760+ insert_point for obscure reasons */
5761+ switch (insert_point->between) {
5762+ case AFTER_ITEM:
5763+ new_pos = insert_point->item_pos + 1;
5764+ break;
5765+ case EMPTY_NODE:
5766+ new_pos = 0;
5767+ break;
5768+ case BEFORE_ITEM:
5769+ assert("vs-905", insert_point->item_pos == 0);
5770+ new_pos = 0;
5771+ break;
5772+ default:
5773+ impossible("vs-906",
5774+ "carry_insert_flow: invalid coord");
5775+ new_pos = 0;
5776+ break;
5777+ }
5778+
5779+ nplug->create_item(insert_point, &f->key,
5780+ flow_insert_data(op), &info);
5781+ coord_set_item_pos(insert_point, new_pos);
5782+ }
5783+ coord_init_after_item_end(insert_point);
5784+ doing->restartable = 0;
5785+ znode_make_dirty(insert_point->node);
5786+
5787+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5788+ }
5789+
5790+ if (orig_node != flow_insert_point(op)->node) {
5791+ /* move lock to new insert point */
5792+ done_lh(orig_lh);
5793+ init_lh(orig_lh);
5794+ result =
5795+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5796+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5797+ }
5798+
5799+ return result;
5800+}
5801+
5802+/* implements COP_DELETE operation
5803+
5804+ Remove pointer to @op -> u.delete.child from it's parent.
5805+
5806+ This function also handles killing of a tree root is last pointer from it
5807+ was removed. This is complicated by our handling of "twig" level: root on
5808+ twig level is never killed.
5809+
5810+*/
5811+static int carry_delete(carry_op * op /* operation to be performed */ ,
5812+ carry_level * doing UNUSED_ARG /* current carry
5813+ * level */ ,
5814+ carry_level * todo /* next carry level */ )
5815+{
5816+ int result;
5817+ coord_t coord;
5818+ coord_t coord2;
5819+ znode *parent;
5820+ znode *child;
5821+ carry_plugin_info info;
5822+ reiser4_tree *tree;
5823+
5824+ /*
5825+ * This operation is called to delete internal item pointing to the
5826+ * child node that was removed by carry from the tree on the previous
5827+ * tree level.
5828+ */
5829+
5830+ assert("nikita-893", op != NULL);
5831+ assert("nikita-894", todo != NULL);
5832+ assert("nikita-895", op->op == COP_DELETE);
5833+
5834+ coord_init_zero(&coord);
5835+ coord_init_zero(&coord2);
5836+
5837+ parent = reiser4_carry_real(op->node);
5838+ child = op->u.delete.child ?
5839+ reiser4_carry_real(op->u.delete.child) : op->node->node;
5840+ tree = znode_get_tree(child);
5841+ read_lock_tree(tree);
5842+
5843+ /*
5844+ * @parent was determined when carry entered parent level
5845+ * (lock_carry_level/lock_carry_node). Since then, actual parent of
5846+ * @child node could change due to other carry operations performed on
5847+ * the parent level. Check for this.
5848+ */
5849+
5850+ if (znode_parent(child) != parent) {
5851+ /* NOTE-NIKITA add stat counter for this. */
5852+ parent = znode_parent(child);
5853+ assert("nikita-2581", find_carry_node(doing, parent));
5854+ }
5855+ read_unlock_tree(tree);
5856+
5857+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5858+
5859+ /* Twig level horrors: tree should be of height at least 2. So, last
5860+ pointer from the root at twig level is preserved even if child is
5861+ empty. This is ugly, but so it was architectured.
5862+ */
5863+
5864+ if (znode_is_root(parent) &&
5865+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5866+ node_num_items(parent) == 1) {
5867+ /* Delimiting key manipulations. */
5868+ write_lock_dk(tree);
5869+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5870+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5871+ ZF_SET(child, JNODE_DKSET);
5872+ write_unlock_dk(tree);
5873+
5874+ /* @child escaped imminent death! */
5875+ ZF_CLR(child, JNODE_HEARD_BANSHEE);
5876+ return 0;
5877+ }
5878+
5879+ /* convert child pointer to the coord_t */
5880+ result = find_child_ptr(parent, child, &coord);
5881+ if (result != NS_FOUND) {
5882+ warning("nikita-994", "Cannot find child pointer: %i", result);
5883+ print_coord_content("coord", &coord);
5884+ return result;
5885+ }
5886+
5887+ coord_dup(&coord2, &coord);
5888+ info.doing = doing;
5889+ info.todo = todo;
5890+ {
5891+ /*
5892+ * Actually kill internal item: prepare structure with
5893+ * arguments for ->cut_and_kill() method...
5894+ */
5895+
5896+ struct carry_kill_data kdata;
5897+ kdata.params.from = &coord;
5898+ kdata.params.to = &coord2;
5899+ kdata.params.from_key = NULL;
5900+ kdata.params.to_key = NULL;
5901+ kdata.params.smallest_removed = NULL;
5902+ kdata.params.truncate = 1;
5903+ kdata.flags = op->u.delete.flags;
5904+ kdata.inode = NULL;
5905+ kdata.left = NULL;
5906+ kdata.right = NULL;
5907+ kdata.buf = NULL;
5908+ /* ... and call it. */
5909+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5910+ &info);
5911+ }
5912+ doing->restartable = 0;
5913+
5914+ /* check whether root should be killed violently */
5915+ if (znode_is_root(parent) &&
5916+ /* don't kill roots at and lower than twig level */
5917+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5918+ node_num_items(parent) == 1) {
5919+ result = reiser4_kill_tree_root(coord.node);
5920+ }
5921+
5922+ return result < 0 ? : 0;
5923+}
5924+
5925+/* implements COP_CUT opration
5926+
5927+ Cuts part or whole content of node.
5928+
5929+*/
5930+static int carry_cut(carry_op * op /* operation to be performed */ ,
5931+ carry_level * doing /* current carry level */ ,
5932+ carry_level * todo /* next carry level */ )
5933+{
5934+ int result;
5935+ carry_plugin_info info;
5936+ node_plugin *nplug;
5937+
5938+ assert("nikita-896", op != NULL);
5939+ assert("nikita-897", todo != NULL);
5940+ assert("nikita-898", op->op == COP_CUT);
5941+
5942+ info.doing = doing;
5943+ info.todo = todo;
5944+
5945+ nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5946+ if (op->u.cut_or_kill.is_cut)
5947+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5948+ else
5949+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5950+
5951+ doing->restartable = 0;
5952+ return result < 0 ? : 0;
5953+}
5954+
5955+/* helper function for carry_paste(): returns true if @op can be continued as
5956+ paste */
5957+static int
5958+can_paste(coord_t * icoord, const reiser4_key * key,
5959+ const reiser4_item_data * data)
5960+{
5961+ coord_t circa;
5962+ item_plugin *new_iplug;
5963+ item_plugin *old_iplug;
5964+ int result = 0; /* to keep gcc shut */
5965+
5966+ assert("", icoord->between != AT_UNIT);
5967+
5968+ /* obviously, one cannot paste when node is empty---there is nothing
5969+ to paste into. */
5970+ if (node_is_empty(icoord->node))
5971+ return 0;
5972+ /* if insertion point is at the middle of the item, then paste */
5973+ if (!coord_is_between_items(icoord))
5974+ return 1;
5975+ coord_dup(&circa, icoord);
5976+ circa.between = AT_UNIT;
5977+
5978+ old_iplug = item_plugin_by_coord(&circa);
5979+ new_iplug = data->iplug;
5980+
5981+ /* check whether we can paste to the item @icoord is "at" when we
5982+ ignore ->between field */
5983+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5984+ result = 1;
5985+ } else if (icoord->between == BEFORE_UNIT
5986+ || icoord->between == BEFORE_ITEM) {
5987+ /* otherwise, try to glue to the item at the left, if any */
5988+ coord_dup(&circa, icoord);
5989+ if (coord_set_to_left(&circa)) {
5990+ result = 0;
5991+ coord_init_before_item(icoord);
5992+ } else {
5993+ old_iplug = item_plugin_by_coord(&circa);
5994+ result = (old_iplug == new_iplug)
5995+ && item_can_contain_key(icoord, key, data);
5996+ if (result) {
5997+ coord_dup(icoord, &circa);
5998+ icoord->between = AFTER_UNIT;
5999+ }
6000+ }
6001+ } else if (icoord->between == AFTER_UNIT
6002+ || icoord->between == AFTER_ITEM) {
6003+ coord_dup(&circa, icoord);
6004+ /* otherwise, try to glue to the item at the right, if any */
6005+ if (coord_set_to_right(&circa)) {
6006+ result = 0;
6007+ coord_init_after_item(icoord);
6008+ } else {
6009+ int (*cck) (const coord_t *, const reiser4_key *,
6010+ const reiser4_item_data *);
6011+
6012+ old_iplug = item_plugin_by_coord(&circa);
6013+
6014+ cck = old_iplug->b.can_contain_key;
6015+ if (cck == NULL)
6016+ /* item doesn't define ->can_contain_key
6017+ method? So it is not expandable. */
6018+ result = 0;
6019+ else {
6020+ result = (old_iplug == new_iplug)
6021+ && cck(&circa /*icoord */ , key, data);
6022+ if (result) {
6023+ coord_dup(icoord, &circa);
6024+ icoord->between = BEFORE_UNIT;
6025+ }
6026+ }
6027+ }
6028+ } else
6029+ impossible("nikita-2513", "Nothing works");
6030+ if (result) {
6031+ if (icoord->between == BEFORE_ITEM) {
6032+ assert("vs-912", icoord->unit_pos == 0);
6033+ icoord->between = BEFORE_UNIT;
6034+ } else if (icoord->between == AFTER_ITEM) {
6035+ coord_init_after_item_end(icoord);
6036+ }
6037+ }
6038+ return result;
6039+}
6040+
6041+/* implements COP_PASTE operation
6042+
6043+ Paste data into existing item. This is complicated by the fact that after
6044+ we shifted something to the left or right neighbors trying to free some
6045+ space, item we were supposed to paste into can be in different node than
6046+ insertion coord. If so, we are no longer doing paste, but insert. See
6047+ comments in insert_paste_common().
6048+
6049+*/
6050+static int carry_paste(carry_op * op /* operation to be performed */ ,
6051+ carry_level * doing UNUSED_ARG /* current carry
6052+ * level */ ,
6053+ carry_level * todo /* next carry level */ )
6054+{
6055+ znode *node;
6056+ carry_insert_data cdata;
6057+ coord_t dcoord;
6058+ reiser4_item_data data;
6059+ int result;
6060+ int real_size;
6061+ item_plugin *iplug;
6062+ carry_plugin_info info;
6063+ coord_t *coord;
6064+
6065+ assert("nikita-982", op != NULL);
6066+ assert("nikita-983", todo != NULL);
6067+ assert("nikita-984", op->op == COP_PASTE);
6068+
6069+ coord_init_zero(&dcoord);
6070+
6071+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
6072+ if (result != 0)
6073+ return result;
6074+
6075+ coord = op->u.insert.d->coord;
6076+
6077+ /* handle case when op -> u.insert.coord doesn't point to the item
6078+ of required type. restart as insert. */
6079+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
6080+ op->op = COP_INSERT;
6081+ op->u.insert.type = COPT_PASTE_RESTARTED;
6082+ result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
6083+
6084+ return result;
6085+ }
6086+
6087+ node = coord->node;
6088+ iplug = item_plugin_by_coord(coord);
6089+ assert("nikita-992", iplug != NULL);
6090+
6091+ assert("nikita-985", node != NULL);
6092+ assert("nikita-986", node_plugin_by_node(node) != NULL);
6093+
6094+ assert("nikita-987",
6095+ space_needed_for_op(node, op) <= znode_free_space(node));
6096+
6097+ assert("nikita-1286", coord_is_existing_item(coord));
6098+
6099+ /*
6100+ * if item is expanded as a result of this operation, we should first
6101+ * change item size, than call ->b.paste item method. If item is
6102+ * shrunk, it should be done other way around: first call ->b.paste
6103+ * method, then reduce item size.
6104+ */
6105+
6106+ real_size = space_needed_for_op(node, op);
6107+ if (real_size > 0)
6108+ node->nplug->change_item_size(coord, real_size);
6109+
6110+ doing->restartable = 0;
6111+ info.doing = doing;
6112+ info.todo = todo;
6113+
6114+ result = iplug->b.paste(coord, op->u.insert.d->data, &info);
6115+
6116+ if (real_size < 0)
6117+ node->nplug->change_item_size(coord, real_size);
6118+
6119+ /* if we pasted at the beginning of the item, update item's key. */
6120+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
6121+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
6122+
6123+ znode_make_dirty(node);
6124+ return result;
6125+}
6126+
6127+/* handle carry COP_EXTENT operation. */
6128+static int carry_extent(carry_op * op /* operation to perform */ ,
6129+ carry_level * doing /* queue of operations @op
6130+ * is part of */ ,
6131+ carry_level * todo /* queue where new operations
6132+ * are accumulated */ )
6133+{
6134+ znode *node;
6135+ carry_insert_data cdata;
6136+ coord_t coord;
6137+ reiser4_item_data data;
6138+ carry_op *delete_dummy;
6139+ carry_op *insert_extent;
6140+ int result;
6141+ carry_plugin_info info;
6142+
6143+ assert("nikita-1751", op != NULL);
6144+ assert("nikita-1752", todo != NULL);
6145+ assert("nikita-1753", op->op == COP_EXTENT);
6146+
6147+ /* extent insertion overview:
6148+
6149+ extents live on the TWIG LEVEL, which is level one above the leaf
6150+ one. This complicates extent insertion logic somewhat: it may
6151+ happen (and going to happen all the time) that in logical key
6152+ ordering extent has to be placed between items I1 and I2, located
6153+ at the leaf level, but I1 and I2 are in the same formatted leaf
6154+ node N1. To insert extent one has to
6155+
6156+ (1) reach node N1 and shift data between N1, its neighbors and
6157+ possibly newly allocated nodes until I1 and I2 fall into different
6158+ nodes. Since I1 and I2 are still neighboring items in logical key
6159+ order, they will be necessary utmost items in their respective
6160+ nodes.
6161+
6162+ (2) After this new extent item is inserted into node on the twig
6163+ level.
6164+
6165+ Fortunately this process can reuse almost all code from standard
6166+ insertion procedure (viz. make_space() and insert_paste_common()),
6167+ due to the following observation: make_space() only shifts data up
6168+ to and excluding or including insertion point. It never
6169+ "over-moves" through insertion point. Thus, one can use
6170+ make_space() to perform step (1). All required for this is just to
6171+ instruct free_space_shortage() to keep make_space() shifting data
6172+ until insertion point is at the node border.
6173+
6174+ */
6175+
6176+ /* perform common functionality of insert and paste. */
6177+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
6178+ if (result != 0)
6179+ return result;
6180+
6181+ node = op->u.extent.d->coord->node;
6182+ assert("nikita-1754", node != NULL);
6183+ assert("nikita-1755", node_plugin_by_node(node) != NULL);
6184+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
6185+
6186+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
6187+ extent fits between items. */
6188+
6189+ info.doing = doing;
6190+ info.todo = todo;
6191+
6192+ /* there is another complication due to placement of extents on the
6193+ twig level: extents are "rigid" in the sense that key-range
6194+ occupied by extent cannot grow indefinitely to the right as it is
6195+ for the formatted leaf nodes. Because of this when search finds two
6196+ adjacent extents on the twig level, it has to "drill" to the leaf
6197+ level, creating new node. Here we are removing this node.
6198+ */
6199+ if (node_is_empty(node)) {
6200+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
6201+ if (IS_ERR(delete_dummy))
6202+ return PTR_ERR(delete_dummy);
6203+ delete_dummy->u.delete.child = NULL;
6204+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
6205+ ZF_SET(node, JNODE_HEARD_BANSHEE);
6206+ }
6207+
6208+ /* proceed with inserting extent item into parent. We are definitely
6209+ inserting rather than pasting if we get that far. */
6210+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
6211+ if (IS_ERR(insert_extent))
6212+ /* @delete_dummy will be automatically destroyed on the level
6213+ exiting */
6214+ return PTR_ERR(insert_extent);
6215+ /* NOTE-NIKITA insertion by key is simplest option here. Another
6216+ possibility is to insert on the left or right of already existing
6217+ item.
6218+ */
6219+ insert_extent->u.insert.type = COPT_KEY;
6220+ insert_extent->u.insert.d = op->u.extent.d;
6221+ assert("nikita-1719", op->u.extent.d->key != NULL);
6222+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
6223+ insert_extent->u.insert.flags =
6224+ znode_get_tree(node)->carry.new_extent_flags;
6225+
6226+ /*
6227+ * if carry was asked to track lock handle we should actually track
6228+ * lock handle on the twig node rather than on the leaf where
6229+ * operation was started from. Transfer tracked lock handle.
6230+ */
6231+ if (doing->track_type) {
6232+ assert("nikita-3242", doing->tracked != NULL);
6233+ assert("nikita-3244", todo->tracked == NULL);
6234+ todo->tracked = doing->tracked;
6235+ todo->track_type = CARRY_TRACK_NODE;
6236+ doing->tracked = NULL;
6237+ doing->track_type = 0;
6238+ }
6239+
6240+ return 0;
6241+}
6242+
6243+/* update key in @parent between pointers to @left and @right.
6244+
6245+ Find coords of @left and @right and update delimiting key between them.
6246+ This is helper function called by carry_update(). Finds position of
6247+ internal item involved. Updates item key. Updates delimiting keys of child
6248+ nodes involved.
6249+*/
6250+static int update_delimiting_key(znode * parent /* node key is updated
6251+ * in */ ,
6252+ znode * left /* child of @parent */ ,
6253+ znode * right /* child of @parent */ ,
6254+ carry_level * doing /* current carry
6255+ * level */ ,
6256+ carry_level * todo /* parent carry
6257+ * level */ ,
6258+ const char **error_msg /* place to
6259+ * store error
6260+ * message */ )
6261+{
6262+ coord_t left_pos;
6263+ coord_t right_pos;
6264+ int result;
6265+ reiser4_key ldkey;
6266+ carry_plugin_info info;
6267+
6268+ assert("nikita-1177", right != NULL);
6269+ /* find position of right left child in a parent */
6270+ result = find_child_ptr(parent, right, &right_pos);
6271+ if (result != NS_FOUND) {
6272+ *error_msg = "Cannot find position of right child";
6273+ return result;
6274+ }
6275+
6276+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
6277+ /* find position of the left child in a parent */
6278+ result = find_child_ptr(parent, left, &left_pos);
6279+ if (result != NS_FOUND) {
6280+ *error_msg = "Cannot find position of left child";
6281+ return result;
6282+ }
6283+ assert("nikita-1355", left_pos.node != NULL);
6284+ } else
6285+ left_pos.node = NULL;
6286+
6287+ /* check that they are separated by exactly one key and are basically
6288+ sane */
6289+ if (REISER4_DEBUG) {
6290+ if ((left_pos.node != NULL)
6291+ && !coord_is_existing_unit(&left_pos)) {
6292+ *error_msg = "Left child is bastard";
6293+ return RETERR(-EIO);
6294+ }
6295+ if (!coord_is_existing_unit(&right_pos)) {
6296+ *error_msg = "Right child is bastard";
6297+ return RETERR(-EIO);
6298+ }
6299+ if (left_pos.node != NULL &&
6300+ !coord_are_neighbors(&left_pos, &right_pos)) {
6301+ *error_msg = "Children are not direct siblings";
6302+ return RETERR(-EIO);
6303+ }
6304+ }
6305+ *error_msg = NULL;
6306+
6307+ info.doing = doing;
6308+ info.todo = todo;
6309+
6310+ /*
6311+ * If child node is not empty, new key of internal item is a key of
6312+ * leftmost item in the child node. If the child is empty, take its
6313+ * right delimiting key as a new key of the internal item. Precise key
6314+ * in the latter case is not important per se, because the child (and
6315+ * the internal item) are going to be killed shortly anyway, but we
6316+ * have to preserve correct order of keys in the parent node.
6317+ */
6318+
6319+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
6320+ leftmost_key_in_node(right, &ldkey);
6321+ else {
6322+ read_lock_dk(znode_get_tree(parent));
6323+ ldkey = *znode_get_rd_key(right);
6324+ read_unlock_dk(znode_get_tree(parent));
6325+ }
6326+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
6327+ doing->restartable = 0;
6328+ znode_make_dirty(parent);
6329+ return 0;
6330+}
6331+
6332+/* implements COP_UPDATE opration
6333+
6334+ Update delimiting keys.
6335+
6336+*/
6337+static int carry_update(carry_op * op /* operation to be performed */ ,
6338+ carry_level * doing /* current carry level */ ,
6339+ carry_level * todo /* next carry level */ )
6340+{
6341+ int result;
6342+ carry_node *missing UNUSED_ARG;
6343+ znode *left;
6344+ znode *right;
6345+ carry_node *lchild;
6346+ carry_node *rchild;
6347+ const char *error_msg;
6348+ reiser4_tree *tree;
6349+
6350+ /*
6351+ * This operation is called to update key of internal item. This is
6352+ * necessary when carry shifted of cut data on the child
6353+ * level. Arguments of this operation are:
6354+ *
6355+ * @right --- child node. Operation should update key of internal
6356+ * item pointing to @right.
6357+ *
6358+ * @left --- left neighbor of @right. This parameter is optional.
6359+ */
6360+
6361+ assert("nikita-902", op != NULL);
6362+ assert("nikita-903", todo != NULL);
6363+ assert("nikita-904", op->op == COP_UPDATE);
6364+
6365+ lchild = op->u.update.left;
6366+ rchild = op->node;
6367+
6368+ if (lchild != NULL) {
6369+ assert("nikita-1001", lchild->parent);
6370+ assert("nikita-1003", !lchild->left);
6371+ left = reiser4_carry_real(lchild);
6372+ } else
6373+ left = NULL;
6374+
6375+ tree = znode_get_tree(rchild->node);
6376+ read_lock_tree(tree);
6377+ right = znode_parent(rchild->node);
6378+ read_unlock_tree(tree);
6379+
6380+ if (right != NULL) {
6381+ result = update_delimiting_key(right,
6382+ lchild ? lchild->node : NULL,
6383+ rchild->node,
6384+ doing, todo, &error_msg);
6385+ } else {
6386+ error_msg = "Cannot find node to update key in";
6387+ result = RETERR(-EIO);
6388+ }
6389+ /* operation will be reposted to the next level by the
6390+ ->update_item_key() method of node plugin, if necessary. */
6391+
6392+ if (result != 0) {
6393+ warning("nikita-999", "Error updating delimiting key: %s (%i)",
6394+ error_msg ? : "", result);
6395+ }
6396+ return result;
6397+}
6398+
6399+/* move items from @node during carry */
6400+static int carry_shift_data(sideof side /* in what direction to move data */ ,
6401+ coord_t * insert_coord /* coord where new item
6402+ * is to be inserted */ ,
6403+ znode * node /* node which data are moved from */ ,
6404+ carry_level * doing /* active carry queue */ ,
6405+ carry_level * todo /* carry queue where new
6406+ * operations are to be put
6407+ * in */ ,
6408+ unsigned int including_insert_coord_p /* true if
6409+ * @insertion_coord
6410+ * can be moved */ )
6411+{
6412+ int result;
6413+ znode *source;
6414+ carry_plugin_info info;
6415+ node_plugin *nplug;
6416+
6417+ source = insert_coord->node;
6418+
6419+ info.doing = doing;
6420+ info.todo = todo;
6421+
6422+ nplug = node_plugin_by_node(node);
6423+ result = nplug->shift(insert_coord, node,
6424+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
6425+ (int)including_insert_coord_p, &info);
6426+ /* the only error ->shift() method of node plugin can return is
6427+ -ENOMEM due to carry node/operation allocation. */
6428+ assert("nikita-915", result >= 0 || result == -ENOMEM);
6429+ if (result > 0) {
6430+ /*
6431+ * if some number of bytes was actually shifted, mark nodes
6432+ * dirty, and carry level as non-restartable.
6433+ */
6434+ doing->restartable = 0;
6435+ znode_make_dirty(source);
6436+ znode_make_dirty(node);
6437+ }
6438+
6439+ assert("nikita-2077", coord_check(insert_coord));
6440+ return 0;
6441+}
6442+
6443+typedef carry_node *(*carry_iterator) (carry_node * node);
6444+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6445+ carry_iterator iterator);
6446+
6447+static carry_node *pool_level_list_prev(carry_node *node)
6448+{
6449+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6450+}
6451+
6452+/* look for the left neighbor of given carry node in a carry queue.
6453+
6454+ This is used by find_left_neighbor(), but I am not sure that this
6455+ really gives any advantage. More statistics required.
6456+
6457+*/
6458+carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6459+ * of */ ,
6460+ carry_level * level /* level to scan */ )
6461+{
6462+ return find_dir_carry(node, level,
6463+ (carry_iterator) pool_level_list_prev);
6464+}
6465+
6466+static carry_node *pool_level_list_next(carry_node *node)
6467+{
6468+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6469+}
6470+
6471+/* look for the right neighbor of given carry node in a
6472+ carry queue.
6473+
6474+ This is used by find_right_neighbor(), but I am not sure that this
6475+ really gives any advantage. More statistics required.
6476+
6477+*/
6478+carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6479+ * of */ ,
6480+ carry_level * level /* level to scan */ )
6481+{
6482+ return find_dir_carry(node, level,
6483+ (carry_iterator) pool_level_list_next);
6484+}
6485+
6486+/* look for the left or right neighbor of given carry node in a carry
6487+ queue.
6488+
6489+ Helper function used by find_{left|right}_carry().
6490+*/
6491+static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6492+ * from */ ,
6493+ carry_level * level /* level to scan */ ,
6494+ carry_iterator iterator /* operation to
6495+ * move to the next
6496+ * node */ )
6497+{
6498+ carry_node *neighbor;
6499+
6500+ assert("nikita-1059", node != NULL);
6501+ assert("nikita-1060", level != NULL);
6502+
6503+ /* scan list of carry nodes on this list dir-ward, skipping all
6504+ carry nodes referencing the same znode. */
6505+ neighbor = node;
6506+ while (1) {
6507+ neighbor = iterator(neighbor);
6508+ if (carry_node_end(level, neighbor))
6509+ /* list head is reached */
6510+ return NULL;
6511+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6512+ return neighbor;
6513+ }
6514+}
6515+
6516+/*
6517+ * Memory reservation estimation.
6518+ *
6519+ * Carry process proceeds through tree levels upwards. Carry assumes that it
6520+ * takes tree in consistent state (e.g., that search tree invariants hold),
6521+ * and leaves tree consistent after it finishes. This means that when some
6522+ * error occurs carry cannot simply return if there are pending carry
6523+ * operations. Generic solution for this problem is carry-undo either as
6524+ * transaction manager feature (requiring checkpoints and isolation), or
6525+ * through some carry specific mechanism.
6526+ *
6527+ * Our current approach is to panic if carry hits an error while tree is
6528+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6529+ * this "memory reservation" mechanism was added.
6530+ *
6531+ * Memory reservation is implemented by perthread-pages.diff patch from
6532+ * core-patches. Its API is defined in <linux/gfp.h>
6533+ *
6534+ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6535+ * void perthread_pages_release(int nrpages);
6536+ * int perthread_pages_count(void);
6537+ *
6538+ * carry estimates its worst case memory requirements at the entry, reserved
6539+ * enough memory, and released unused pages before returning.
6540+ *
6541+ * Code below estimates worst case memory requirements for a given carry
6542+ * queue. This is dome by summing worst case memory requirements for each
6543+ * operation in the queue.
6544+ *
6545+ */
6546+
6547+/*
6548+ * Memory memory requirements of many operations depends on the tree
6549+ * height. For example, item insertion requires new node to be inserted at
6550+ * each tree level in the worst case. What tree height should be used for
6551+ * estimation? Current tree height is wrong, because tree height can change
6552+ * between the time when estimation was done and the time when operation is
6553+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6554+ * is also not desirable, because it would lead to the huge over-estimation
6555+ * all the time. Plausible solution is "capped tree height": if current tree
6556+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6557+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6558+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6559+ * to be increased even more during short interval of time.
6560+ */
6561+#define TREE_HEIGHT_CAP (5)
6562+
6563+/* return capped tree height for the @tree. See comment above. */
6564+static int cap_tree_height(reiser4_tree * tree)
6565+{
6566+ return max_t(int, tree->height, TREE_HEIGHT_CAP);
6567+}
6568+
6569+/* return capped tree height for the current tree. */
6570+static int capped_height(void)
6571+{
6572+ return cap_tree_height(current_tree);
6573+}
6574+
6575+/* return number of pages required to store given number of bytes */
6576+static int bytes_to_pages(int bytes)
6577+{
6578+ return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6579+}
6580+
6581+/* how many pages are required to allocate znodes during item insertion. */
6582+static int carry_estimate_znodes(void)
6583+{
6584+ /*
6585+ * Note, that there we have some problem here: there is no way to
6586+ * reserve pages specifically for the given slab. This means that
6587+ * these pages can be hijacked for some other end.
6588+ */
6589+
6590+ /* in the worst case we need 3 new znode on each tree level */
6591+ return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6592+}
6593+
6594+/*
6595+ * how many pages are required to load bitmaps. One bitmap per level.
6596+ */
6597+static int carry_estimate_bitmaps(void)
6598+{
6599+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6600+ int bytes;
6601+
6602+ bytes = capped_height() * (0 + /* bnode should be added, but its is private to
6603+ * bitmap.c, skip for now. */
6604+ 2 * sizeof(jnode)); /* working and commit jnodes */
6605+ return bytes_to_pages(bytes) + 2; /* and their contents */
6606+ } else
6607+ /* bitmaps were pre-loaded during mount */
6608+ return 0;
6609+}
6610+
6611+/* worst case item insertion memory requirements */
6612+static int carry_estimate_insert(carry_op * op, carry_level * level)
6613+{
6614+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6615+ capped_height() + /* new block on each level */
6616+ 1 + /* and possibly extra new block at the leaf level */
6617+ 3; /* loading of leaves into memory */
6618+}
6619+
6620+/* worst case item deletion memory requirements */
6621+static int carry_estimate_delete(carry_op * op, carry_level * level)
6622+{
6623+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6624+ 3; /* loading of leaves into memory */
6625+}
6626+
6627+/* worst case tree cut memory requirements */
6628+static int carry_estimate_cut(carry_op * op, carry_level * level)
6629+{
6630+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6631+ 3; /* loading of leaves into memory */
6632+}
6633+
6634+/* worst case memory requirements of pasting into item */
6635+static int carry_estimate_paste(carry_op * op, carry_level * level)
6636+{
6637+ return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6638+ capped_height() + /* new block on each level */
6639+ 1 + /* and possibly extra new block at the leaf level */
6640+ 3; /* loading of leaves into memory */
6641+}
6642+
6643+/* worst case memory requirements of extent insertion */
6644+static int carry_estimate_extent(carry_op * op, carry_level * level)
6645+{
6646+ return carry_estimate_insert(op, level) + /* insert extent */
6647+ carry_estimate_delete(op, level); /* kill leaf */
6648+}
6649+
6650+/* worst case memory requirements of key update */
6651+static int carry_estimate_update(carry_op * op, carry_level * level)
6652+{
6653+ return 0;
6654+}
6655+
6656+/* worst case memory requirements of flow insertion */
6657+static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6658+{
6659+ int newnodes;
6660+
6661+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6662+ CARRY_FLOW_NEW_NODES_LIMIT);
6663+ /*
6664+ * roughly estimate insert_flow as a sequence of insertions.
6665+ */
6666+ return newnodes * carry_estimate_insert(op, level);
6667+}
6668+
6669+/* This is dispatch table for carry operations. It can be trivially
6670+ abstracted into useful plugin: tunable balancing policy is a good
6671+ thing. */
6672+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6673+ [COP_INSERT] = {
6674+ .handler = carry_insert,
6675+ .estimate = carry_estimate_insert}
6676+ ,
6677+ [COP_DELETE] = {
6678+ .handler = carry_delete,
6679+ .estimate = carry_estimate_delete}
6680+ ,
6681+ [COP_CUT] = {
6682+ .handler = carry_cut,
6683+ .estimate = carry_estimate_cut}
6684+ ,
6685+ [COP_PASTE] = {
6686+ .handler = carry_paste,
6687+ .estimate = carry_estimate_paste}
6688+ ,
6689+ [COP_EXTENT] = {
6690+ .handler = carry_extent,
6691+ .estimate = carry_estimate_extent}
6692+ ,
6693+ [COP_UPDATE] = {
6694+ .handler = carry_update,
6695+ .estimate = carry_estimate_update}
6696+ ,
6697+ [COP_INSERT_FLOW] = {
6698+ .handler = carry_insert_flow,
6699+ .estimate = carry_estimate_insert_flow}
6700+};
6701+
6702+/* Make Linus happy.
6703+ Local variables:
6704+ c-indentation-style: "K&R"
6705+ mode-name: "LC"
6706+ c-basic-offset: 8
6707+ tab-width: 8
6708+ fill-column: 120
6709+ scroll-step: 1
6710+ End:
6711+*/
6712diff --git a/fs/reiser4/carry_ops.h b/fs/reiser4/carry_ops.h
6713new file mode 100644
6714index 0000000..688ca8f
6715--- /dev/null
6716+++ b/fs/reiser4/carry_ops.h
6717@@ -0,0 +1,42 @@
6718+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6719+
6720+/* implementation of carry operations. See carry_ops.c for details. */
6721+
6722+#if !defined( __CARRY_OPS_H__ )
6723+#define __CARRY_OPS_H__
6724+
6725+#include "forward.h"
6726+#include "znode.h"
6727+#include "carry.h"
6728+
6729+/* carry operation handlers */
6730+typedef struct carry_op_handler {
6731+ /* perform operation */
6732+ int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6733+ /* estimate memory requirements for @op */
6734+ int (*estimate) (carry_op * op, carry_level * level);
6735+} carry_op_handler;
6736+
6737+/* This is dispatch table for carry operations. It can be trivially
6738+ abstracted into useful plugin: tunable balancing policy is a good
6739+ thing. */
6740+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6741+
6742+unsigned int space_needed(const znode * node, const coord_t * coord,
6743+ const reiser4_item_data * data, int inserting);
6744+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6745+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6746+
6747+/* __CARRY_OPS_H__ */
6748+#endif
6749+
6750+/* Make Linus happy.
6751+ Local variables:
6752+ c-indentation-style: "K&R"
6753+ mode-name: "LC"
6754+ c-basic-offset: 8
6755+ tab-width: 8
6756+ fill-column: 120
6757+ scroll-step: 1
6758+ End:
6759+*/
6760diff --git a/fs/reiser4/context.c b/fs/reiser4/context.c
6761new file mode 100644
6762index 0000000..4b3137f
6763--- /dev/null
6764+++ b/fs/reiser4/context.c
6765@@ -0,0 +1,288 @@
6766+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6767+
6768+/* Manipulation of reiser4_context */
6769+
6770+/*
6771+ * global context used during system call. Variable of this type is allocated
6772+ * on the stack at the beginning of the reiser4 part of the system call and
6773+ * pointer to it is stored in the current->fs_context. This allows us to avoid
6774+ * passing pointer to current transaction and current lockstack (both in
6775+ * one-to-one mapping with threads) all over the call chain.
6776+ *
6777+ * It's kind of like those global variables the prof used to tell you not to
6778+ * use in CS1, except thread specific.;-) Nikita, this was a good idea.
6779+ *
6780+ * In some situations it is desirable to have ability to enter reiser4_context
6781+ * more than once for the same thread (nested contexts). For example, there
6782+ * are some functions that can be called either directly from VFS/VM or from
6783+ * already active reiser4 context (->writepage, for example).
6784+ *
6785+ * In such situations "child" context acts like dummy: all activity is
6786+ * actually performed in the top level context, and get_current_context()
6787+ * always returns top level context.
6788+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6789+ * nested any way.
6790+ *
6791+ * Note that there is an important difference between reiser4 uses
6792+ * ->fs_context and the way other file systems use it. Other file systems
6793+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6794+ * (this is why ->fs_context was initially called ->journal_info). This means,
6795+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6796+ * to the file system, they assume that some transaction is already underway,
6797+ * and usually bail out, because starting nested transaction would most likely
6798+ * lead to the deadlock. This gives false positives with reiser4, because we
6799+ * set ->fs_context before starting transaction.
6800+ */
6801+
6802+#include "debug.h"
6803+#include "super.h"
6804+#include "context.h"
6805+
6806+#include <linux/writeback.h> /* balance_dirty_pages() */
6807+#include <linux/hardirq.h>
6808+
6809+static void _reiser4_init_context(reiser4_context * context,
6810+ struct super_block *super)
6811+{
6812+ memset(context, 0, sizeof(*context));
6813+
6814+ context->super = super;
6815+ context->magic = context_magic;
6816+ context->outer = current->journal_info;
6817+ current->journal_info = (void *)context;
6818+ context->nr_children = 0;
6819+ context->gfp_mask = GFP_KERNEL;
6820+
6821+ init_lock_stack(&context->stack);
6822+
6823+ reiser4_txn_begin(context);
6824+
6825+ /* initialize head of tap list */
6826+ INIT_LIST_HEAD(&context->taps);
6827+#if REISER4_DEBUG
6828+ context->task = current;
6829+#endif
6830+ grab_space_enable();
6831+}
6832+
6833+/* initialize context and bind it to the current thread
6834+
6835+ This function should be called at the beginning of reiser4 part of
6836+ syscall.
6837+*/
6838+reiser4_context * reiser4_init_context(struct super_block * super)
6839+{
6840+ reiser4_context *context;
6841+
6842+ assert("nikita-2662", !in_interrupt() && !in_irq());
6843+ assert("nikita-3357", super != NULL);
6844+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6845+
6846+ context = get_current_context_check();
6847+ if (context && context->super == super) {
6848+ context = (reiser4_context *) current->journal_info;
6849+ context->nr_children++;
6850+ return context;
6851+ }
6852+
6853+ context = kmalloc(sizeof(*context), GFP_KERNEL);
6854+ if (context == NULL)
6855+ return ERR_PTR(RETERR(-ENOMEM));
6856+
6857+ _reiser4_init_context(context, super);
6858+ return context;
6859+}
6860+
6861+/* this is used in scan_mgr which is called with spinlock held and in
6862+ reiser4_fill_super magic */
6863+void init_stack_context(reiser4_context *context, struct super_block *super)
6864+{
6865+ assert("nikita-2662", !in_interrupt() && !in_irq());
6866+ assert("nikita-3357", super != NULL);
6867+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6868+ assert("vs-12", !is_in_reiser4_context());
6869+
6870+ _reiser4_init_context(context, super);
6871+ context->on_stack = 1;
6872+ return;
6873+}
6874+
6875+/* cast lock stack embedded into reiser4 context up to its container */
6876+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6877+{
6878+ return container_of(owner, reiser4_context, stack);
6879+}
6880+
6881+/* true if there is already _any_ reiser4 context for the current thread */
6882+int is_in_reiser4_context(void)
6883+{
6884+ reiser4_context *ctx;
6885+
6886+ ctx = current->journal_info;
6887+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6888+}
6889+
6890+/*
6891+ * call balance dirty pages for the current context.
6892+ *
6893+ * File system is expected to call balance_dirty_pages_ratelimited() whenever
6894+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6895+ * write---this covers vast majority of all dirty traffic), but we cannot do
6896+ * this immediately when formatted node is dirtied, because long term lock is
6897+ * usually held at that time. To work around this, dirtying of formatted node
6898+ * simply increases ->nr_marked_dirty counter in the current reiser4
6899+ * context. When we are about to leave this context,
6900+ * balance_dirty_pages_ratelimited() is called, if necessary.
6901+ *
6902+ * This introduces another problem: sometimes we do not want to run
6903+ * balance_dirty_pages_ratelimited() when leaving a context, for example
6904+ * because some important lock (like ->i_mutex on the parent directory) is
6905+ * held. To achieve this, ->nobalance flag can be set in the current context.
6906+ */
6907+static void balance_dirty_pages_at(reiser4_context *context)
6908+{
6909+ reiser4_super_info_data *sbinfo = get_super_private(context->super);
6910+
6911+ /*
6912+ * call balance_dirty_pages_ratelimited() to process formatted nodes
6913+ * dirtied during this system call. Do that only if we are not in mount
6914+ * and there were nodes dirtied in this context and we are not in
6915+ * writepage (to avoid deadlock) and not in pdflush
6916+ */
6917+ if (sbinfo != NULL && sbinfo->fake != NULL &&
6918+ context->nr_marked_dirty != 0 &&
6919+ !(current->flags & PF_MEMALLOC) &&
6920+ !current_is_pdflush())
6921+ balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6922+}
6923+
6924+/* release resources associated with context.
6925+
6926+ This function should be called at the end of "session" with reiser4,
6927+ typically just before leaving reiser4 driver back to VFS.
6928+
6929+ This is good place to put some degugging consistency checks, like that
6930+ thread released all locks and closed transcrash etc.
6931+
6932+*/
6933+static void reiser4_done_context(reiser4_context * context /* context being released */ )
6934+{
6935+ assert("nikita-860", context != NULL);
6936+ assert("nikita-859", context->magic == context_magic);
6937+ assert("vs-646", (reiser4_context *) current->journal_info == context);
6938+ assert("zam-686", !in_interrupt() && !in_irq());
6939+
6940+ /* only do anything when leaving top-level reiser4 context. All nested
6941+ * contexts are just dummies. */
6942+ if (context->nr_children == 0) {
6943+ assert("jmacd-673", context->trans == NULL);
6944+ assert("jmacd-1002", lock_stack_isclean(&context->stack));
6945+ assert("nikita-1936", reiser4_no_counters_are_held());
6946+ assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6947+ assert("zam-1004", ergo(get_super_private(context->super),
6948+ get_super_private(context->super)->delete_mutex_owner !=
6949+ current));
6950+
6951+ /* release all grabbed but as yet unused blocks */
6952+ if (context->grabbed_blocks != 0)
6953+ all_grabbed2free();
6954+
6955+ /*
6956+ * synchronize against longterm_unlock_znode():
6957+ * wake_up_requestor() wakes up requestors without holding
6958+ * zlock (otherwise they will immediately bump into that lock
6959+ * after wake up on another CPU). To work around (rare)
6960+ * situation where requestor has been woken up asynchronously
6961+ * and managed to run until completion (and destroy its
6962+ * context and lock stack) before wake_up_requestor() called
6963+ * wake_up() on it, wake_up_requestor() synchronize on lock
6964+ * stack spin lock. It has actually been observed that spin
6965+ * lock _was_ locked at this point, because
6966+ * wake_up_requestor() took interrupt.
6967+ */
6968+ spin_lock_stack(&context->stack);
6969+ spin_unlock_stack(&context->stack);
6970+
6971+ assert("zam-684", context->nr_children == 0);
6972+ /* restore original ->fs_context value */
6973+ current->journal_info = context->outer;
6974+ if (context->on_stack == 0)
6975+ kfree(context);
6976+ } else {
6977+ context->nr_children--;
6978+#if REISER4_DEBUG
6979+ assert("zam-685", context->nr_children >= 0);
6980+#endif
6981+ }
6982+}
6983+
6984+/*
6985+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6986+ * transaction. Call done_context() to do context related book-keeping.
6987+ */
6988+void reiser4_exit_context(reiser4_context * context)
6989+{
6990+ assert("nikita-3021", reiser4_schedulable());
6991+
6992+ if (context->nr_children == 0) {
6993+ if (!context->nobalance) {
6994+ reiser4_txn_restart(context);
6995+ balance_dirty_pages_at(context);
6996+ }
6997+
6998+ /* if filesystem is mounted with -o sync or -o dirsync - commit
6999+ transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
7000+ commiting on exit_context when inode semaphore is held and
7001+ to have ktxnmgrd to do commit instead to get better
7002+ concurrent filesystem accesses. But, when one mounts with -o
7003+ sync, he cares more about reliability than about
7004+ performance. So, for now we have this simple mount -o sync
7005+ support. */
7006+ if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
7007+ txn_atom *atom;
7008+
7009+ atom = get_current_atom_locked_nocheck();
7010+ if (atom) {
7011+ atom->flags |= ATOM_FORCE_COMMIT;
7012+ context->trans->flags &= ~TXNH_DONT_COMMIT;
7013+ spin_unlock_atom(atom);
7014+ }
7015+ }
7016+ reiser4_txn_end(context);
7017+ }
7018+ reiser4_done_context(context);
7019+}
7020+
7021+void reiser4_ctx_gfp_mask_set(void)
7022+{
7023+ reiser4_context *ctx;
7024+
7025+ ctx = get_current_context();
7026+ if (ctx->entd == 0 &&
7027+ list_empty(&ctx->stack.locks) &&
7028+ ctx->trans->atom == NULL)
7029+ ctx->gfp_mask = GFP_KERNEL;
7030+ else
7031+ ctx->gfp_mask = GFP_NOFS;
7032+}
7033+
7034+void reiser4_ctx_gfp_mask_force (gfp_t mask)
7035+{
7036+ reiser4_context *ctx;
7037+ ctx = get_current_context();
7038+
7039+ assert("edward-1454", ctx != NULL);
7040+
7041+ ctx->gfp_mask = mask;
7042+}
7043+
7044+/*
7045+ * Local variables:
7046+ * c-indentation-style: "K&R"
7047+ * mode-name: "LC"
7048+ * c-basic-offset: 8
7049+ * tab-width: 8
7050+ * fill-column: 120
7051+ * scroll-step: 1
7052+ * End:
7053+ */
7054diff --git a/fs/reiser4/context.h b/fs/reiser4/context.h
7055new file mode 100644
7056index 0000000..da240a9
7057--- /dev/null
7058+++ b/fs/reiser4/context.h
7059@@ -0,0 +1,228 @@
7060+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
7061+ * reiser4/README */
7062+
7063+/* Reiser4 context. See context.c for details. */
7064+
7065+#if !defined( __REISER4_CONTEXT_H__ )
7066+#define __REISER4_CONTEXT_H__
7067+
7068+#include "forward.h"
7069+#include "debug.h"
7070+#include "dformat.h"
7071+#include "tap.h"
7072+#include "lock.h"
7073+
7074+#include <linux/types.h> /* for __u?? */
7075+#include <linux/fs.h> /* for struct super_block */
7076+#include <linux/spinlock.h>
7077+#include <linux/sched.h> /* for struct task_struct */
7078+
7079+/* reiser4 per-thread context */
7080+struct reiser4_context {
7081+ /* magic constant. For identification of reiser4 contexts. */
7082+ __u32 magic;
7083+
7084+ /* current lock stack. See lock.[ch]. This is where list of all
7085+ locks taken by current thread is kept. This is also used in
7086+ deadlock detection. */
7087+ lock_stack stack;
7088+
7089+ /* current transcrash. */
7090+ txn_handle *trans;
7091+ /* transaction handle embedded into reiser4_context. ->trans points
7092+ * here by default. */
7093+ txn_handle trans_in_ctx;
7094+
7095+ /* super block we are working with. To get the current tree
7096+ use &get_super_private (reiser4_get_current_sb ())->tree. */
7097+ struct super_block *super;
7098+
7099+ /* parent fs activation */
7100+ struct fs_activation *outer;
7101+
7102+ /* per-thread grabbed (for further allocation) blocks counter */
7103+ reiser4_block_nr grabbed_blocks;
7104+
7105+ /* list of taps currently monitored. See tap.c */
7106+ struct list_head taps;
7107+
7108+ /* grabbing space is enabled */
7109+ unsigned int grab_enabled:1;
7110+ /* should be set when we are write dirty nodes to disk in jnode_flush or
7111+ * reiser4_write_logs() */
7112+ unsigned int writeout_mode:1;
7113+ /* true, if current thread is an ent thread */
7114+ unsigned int entd:1;
7115+ /* true, if balance_dirty_pages() should not be run when leaving this
7116+ * context. This is used to avoid lengthly balance_dirty_pages()
7117+ * operation when holding some important resource, like directory
7118+ * ->i_mutex */
7119+ unsigned int nobalance:1;
7120+
7121+ /* this bit is used on reiser4_done_context to decide whether context is
7122+ kmalloc-ed and has to be kfree-ed */
7123+ unsigned int on_stack:1;
7124+
7125+ /* count non-trivial jnode_set_dirty() calls */
7126+ unsigned long nr_marked_dirty;
7127+
7128+ /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
7129+ * reiser4_writepages for each of dirty inodes. Reiser4_writepages
7130+ * captures pages. When number of pages captured in one
7131+ * reiser4_sync_inodes reaches some threshold - some atoms get
7132+ * flushed */
7133+ int nr_captured;
7134+ int nr_children; /* number of child contexts */
7135+#if REISER4_DEBUG
7136+ /* debugging information about reiser4 locks held by the current
7137+ * thread */
7138+ reiser4_lock_counters_info locks;
7139+ struct task_struct *task; /* so we can easily find owner of the stack */
7140+
7141+ /*
7142+ * disk space grabbing debugging support
7143+ */
7144+ /* how many disk blocks were grabbed by the first call to
7145+ * reiser4_grab_space() in this context */
7146+ reiser4_block_nr grabbed_initially;
7147+
7148+ /* list of all threads doing flush currently */
7149+ struct list_head flushers_link;
7150+ /* information about last error encountered by reiser4 */
7151+ err_site err;
7152+#endif
7153+ void *vp;
7154+ gfp_t gfp_mask;
7155+};
7156+
7157+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
7158+
7159+/* Debugging helps. */
7160+#if REISER4_DEBUG
7161+extern void print_contexts(void);
7162+#endif
7163+
7164+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
7165+#define current_blocksize reiser4_get_current_sb()->s_blocksize
7166+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
7167+
7168+extern reiser4_context *reiser4_init_context(struct super_block *);
7169+extern void init_stack_context(reiser4_context *, struct super_block *);
7170+extern void reiser4_exit_context(reiser4_context *);
7171+
7172+/* magic constant we store in reiser4_context allocated at the stack. Used to
7173+ catch accesses to staled or uninitialized contexts. */
7174+#define context_magic ((__u32) 0x4b1b5d0b)
7175+
7176+extern int is_in_reiser4_context(void);
7177+
7178+/*
7179+ * return reiser4_context for the thread @tsk
7180+ */
7181+static inline reiser4_context *get_context(const struct task_struct *tsk)
7182+{
7183+ assert("vs-1682",
7184+ ((reiser4_context *) tsk->journal_info)->magic == context_magic);
7185+ return (reiser4_context *) tsk->journal_info;
7186+}
7187+
7188+/*
7189+ * return reiser4 context of the current thread, or NULL if there is none.
7190+ */
7191+static inline reiser4_context *get_current_context_check(void)
7192+{
7193+ if (is_in_reiser4_context())
7194+ return get_context(current);
7195+ else
7196+ return NULL;
7197+}
7198+
7199+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
7200+
7201+/* return context associated with current thread */
7202+static inline reiser4_context *get_current_context(void)
7203+{
7204+ return get_context(current);
7205+}
7206+
7207+static inline gfp_t reiser4_ctx_gfp_mask_get(void)
7208+{
7209+ reiser4_context *ctx;
7210+
7211+ ctx = get_current_context_check();
7212+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
7213+}
7214+
7215+void reiser4_ctx_gfp_mask_set(void);
7216+void reiser4_ctx_gfp_mask_force (gfp_t mask);
7217+
7218+/*
7219+ * true if current thread is in the write-out mode. Thread enters write-out
7220+ * mode during jnode_flush and reiser4_write_logs().
7221+ */
7222+static inline int is_writeout_mode(void)
7223+{
7224+ return get_current_context()->writeout_mode;
7225+}
7226+
7227+/*
7228+ * enter write-out mode
7229+ */
7230+static inline void writeout_mode_enable(void)
7231+{
7232+ assert("zam-941", !get_current_context()->writeout_mode);
7233+ get_current_context()->writeout_mode = 1;
7234+}
7235+
7236+/*
7237+ * leave write-out mode
7238+ */
7239+static inline void writeout_mode_disable(void)
7240+{
7241+ assert("zam-942", get_current_context()->writeout_mode);
7242+ get_current_context()->writeout_mode = 0;
7243+}
7244+
7245+static inline void grab_space_enable(void)
7246+{
7247+ get_current_context()->grab_enabled = 1;
7248+}
7249+
7250+static inline void grab_space_disable(void)
7251+{
7252+ get_current_context()->grab_enabled = 0;
7253+}
7254+
7255+static inline void grab_space_set_enabled(int enabled)
7256+{
7257+ get_current_context()->grab_enabled = enabled;
7258+}
7259+
7260+static inline int is_grab_enabled(reiser4_context * ctx)
7261+{
7262+ return ctx->grab_enabled;
7263+}
7264+
7265+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
7266+ * flush would be performed when it is closed. This is necessary when handle
7267+ * has to be closed under some coarse semaphore, like i_mutex of
7268+ * directory. Commit will be performed by ktxnmgrd. */
7269+static inline void context_set_commit_async(reiser4_context * context)
7270+{
7271+ context->nobalance = 1;
7272+ context->trans->flags |= TXNH_DONT_COMMIT;
7273+}
7274+
7275+/* __REISER4_CONTEXT_H__ */
7276+#endif
7277+
7278+/* Make Linus happy.
7279+ Local variables:
7280+ c-indentation-style: "K&R"
7281+ mode-name: "LC"
7282+ c-basic-offset: 8
7283+ tab-width: 8
7284+ fill-column: 120
7285+ scroll-step: 1
7286+ End:
7287+*/
7288diff --git a/fs/reiser4/coord.c b/fs/reiser4/coord.c
7289new file mode 100644
7290index 0000000..d171786
7291--- /dev/null
7292+++ b/fs/reiser4/coord.c
7293@@ -0,0 +1,935 @@
7294+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7295+
7296+#include "forward.h"
7297+#include "debug.h"
7298+#include "dformat.h"
7299+#include "tree.h"
7300+#include "plugin/item/item.h"
7301+#include "znode.h"
7302+#include "coord.h"
7303+
7304+/* Internal constructor. */
7305+static inline void
7306+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
7307+ pos_in_node_t unit_pos, between_enum between)
7308+{
7309+ coord->node = (znode *) node;
7310+ coord_set_item_pos(coord, item_pos);
7311+ coord->unit_pos = unit_pos;
7312+ coord->between = between;
7313+ ON_DEBUG(coord->plug_v = 0);
7314+ ON_DEBUG(coord->body_v = 0);
7315+
7316+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
7317+}
7318+
7319+/* after shifting of node content, coord previously set properly may become
7320+ invalid, try to "normalize" it. */
7321+void coord_normalize(coord_t * coord)
7322+{
7323+ znode *node;
7324+
7325+ node = coord->node;
7326+ assert("vs-683", node);
7327+
7328+ coord_clear_iplug(coord);
7329+
7330+ if (node_is_empty(node)) {
7331+ coord_init_first_unit(coord, node);
7332+ } else if ((coord->between == AFTER_ITEM)
7333+ || (coord->between == AFTER_UNIT)) {
7334+ return;
7335+ } else if (coord->item_pos == coord_num_items(coord)
7336+ && coord->between == BEFORE_ITEM) {
7337+ coord_dec_item_pos(coord);
7338+ coord->between = AFTER_ITEM;
7339+ } else if (coord->unit_pos == coord_num_units(coord)
7340+ && coord->between == BEFORE_UNIT) {
7341+ coord->unit_pos--;
7342+ coord->between = AFTER_UNIT;
7343+ } else if (coord->item_pos == coord_num_items(coord)
7344+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
7345+ coord_dec_item_pos(coord);
7346+ coord->unit_pos = 0;
7347+ coord->between = AFTER_ITEM;
7348+ }
7349+}
7350+
7351+/* Copy a coordinate. */
7352+void coord_dup(coord_t * coord, const coord_t * old_coord)
7353+{
7354+ assert("jmacd-9800", coord_check(old_coord));
7355+ coord_dup_nocheck(coord, old_coord);
7356+}
7357+
7358+/* Copy a coordinate without check. Useful when old_coord->node is not
7359+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
7360+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
7361+{
7362+ coord->node = old_coord->node;
7363+ coord_set_item_pos(coord, old_coord->item_pos);
7364+ coord->unit_pos = old_coord->unit_pos;
7365+ coord->between = old_coord->between;
7366+ coord->iplugid = old_coord->iplugid;
7367+ ON_DEBUG(coord->plug_v = old_coord->plug_v);
7368+ ON_DEBUG(coord->body_v = old_coord->body_v);
7369+}
7370+
7371+/* Initialize an invalid coordinate. */
7372+void coord_init_invalid(coord_t * coord, const znode * node)
7373+{
7374+ coord_init_values(coord, node, 0, 0, INVALID_COORD);
7375+}
7376+
7377+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
7378+{
7379+ coord_init_values(coord, node, 0, 0, AT_UNIT);
7380+}
7381+
7382+/* Initialize a coordinate to point at the first unit of the first item. If the node is
7383+ empty, it is positioned at the EMPTY_NODE. */
7384+void coord_init_first_unit(coord_t * coord, const znode * node)
7385+{
7386+ int is_empty = node_is_empty(node);
7387+
7388+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
7389+
7390+ assert("jmacd-9801", coord_check(coord));
7391+}
7392+
7393+/* Initialize a coordinate to point at the last unit of the last item. If the node is
7394+ empty, it is positioned at the EMPTY_NODE. */
7395+void coord_init_last_unit(coord_t * coord, const znode * node)
7396+{
7397+ int is_empty = node_is_empty(node);
7398+
7399+ coord_init_values(coord, node,
7400+ (is_empty ? 0 : node_num_items(node) - 1), 0,
7401+ (is_empty ? EMPTY_NODE : AT_UNIT));
7402+ if (!is_empty)
7403+ coord->unit_pos = coord_last_unit_pos(coord);
7404+ assert("jmacd-9802", coord_check(coord));
7405+}
7406+
7407+/* Initialize a coordinate to before the first item. If the node is empty, it is
7408+ positioned at the EMPTY_NODE. */
7409+void coord_init_before_first_item(coord_t * coord, const znode * node)
7410+{
7411+ int is_empty = node_is_empty(node);
7412+
7413+ coord_init_values(coord, node, 0, 0,
7414+ (is_empty ? EMPTY_NODE : BEFORE_UNIT));
7415+
7416+ assert("jmacd-9803", coord_check(coord));
7417+}
7418+
7419+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7420+ at the EMPTY_NODE. */
7421+void coord_init_after_last_item(coord_t * coord, const znode * node)
7422+{
7423+ int is_empty = node_is_empty(node);
7424+
7425+ coord_init_values(coord, node,
7426+ (is_empty ? 0 : node_num_items(node) - 1), 0,
7427+ (is_empty ? EMPTY_NODE : AFTER_ITEM));
7428+
7429+ assert("jmacd-9804", coord_check(coord));
7430+}
7431+
7432+/* Initialize a coordinate to after last unit in the item. Coord must be set
7433+ already to existing item */
7434+void coord_init_after_item_end(coord_t * coord)
7435+{
7436+ coord->between = AFTER_UNIT;
7437+ coord->unit_pos = coord_last_unit_pos(coord);
7438+}
7439+
7440+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7441+void coord_init_before_item(coord_t * coord)
7442+{
7443+ coord->unit_pos = 0;
7444+ coord->between = BEFORE_ITEM;
7445+}
7446+
7447+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7448+void coord_init_after_item(coord_t * coord)
7449+{
7450+ coord->unit_pos = 0;
7451+ coord->between = AFTER_ITEM;
7452+}
7453+
7454+/* Initialize a coordinate by 0s. Used in places where init_coord was used and
7455+ it was not clear how actually */
7456+void coord_init_zero(coord_t * coord)
7457+{
7458+ memset(coord, 0, sizeof(*coord));
7459+}
7460+
7461+/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
7462+unsigned coord_num_units(const coord_t * coord)
7463+{
7464+ assert("jmacd-9806", coord_is_existing_item(coord));
7465+
7466+ return item_plugin_by_coord(coord)->b.nr_units(coord);
7467+}
7468+
7469+/* Returns true if the coord was initializewd by coord_init_invalid (). */
7470+/* Audited by: green(2002.06.15) */
7471+int coord_is_invalid(const coord_t * coord)
7472+{
7473+ return coord->between == INVALID_COORD;
7474+}
7475+
7476+/* Returns true if the coordinate is positioned at an existing item, not before or after
7477+ an item. It may be placed at, before, or after any unit within the item, whether
7478+ existing or not. */
7479+int coord_is_existing_item(const coord_t * coord)
7480+{
7481+ switch (coord->between) {
7482+ case EMPTY_NODE:
7483+ case BEFORE_ITEM:
7484+ case AFTER_ITEM:
7485+ case INVALID_COORD:
7486+ return 0;
7487+
7488+ case BEFORE_UNIT:
7489+ case AT_UNIT:
7490+ case AFTER_UNIT:
7491+ return coord->item_pos < coord_num_items(coord);
7492+ }
7493+
7494+ impossible("jmacd-9900", "unreachable coord: %p", coord);
7495+ return 0;
7496+}
7497+
7498+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7499+ unit. */
7500+/* Audited by: green(2002.06.15) */
7501+int coord_is_existing_unit(const coord_t * coord)
7502+{
7503+ switch (coord->between) {
7504+ case EMPTY_NODE:
7505+ case BEFORE_UNIT:
7506+ case AFTER_UNIT:
7507+ case BEFORE_ITEM:
7508+ case AFTER_ITEM:
7509+ case INVALID_COORD:
7510+ return 0;
7511+
7512+ case AT_UNIT:
7513+ return (coord->item_pos < coord_num_items(coord)
7514+ && coord->unit_pos < coord_num_units(coord));
7515+ }
7516+
7517+ impossible("jmacd-9902", "unreachable");
7518+ return 0;
7519+}
7520+
7521+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7522+ true for empty nodes nor coordinates positioned before the first item. */
7523+/* Audited by: green(2002.06.15) */
7524+int coord_is_leftmost_unit(const coord_t * coord)
7525+{
7526+ return (coord->between == AT_UNIT && coord->item_pos == 0
7527+ && coord->unit_pos == 0);
7528+}
7529+
7530+#if REISER4_DEBUG
7531+/* For assertions only, checks for a valid coordinate. */
7532+int coord_check(const coord_t * coord)
7533+{
7534+ if (coord->node == NULL) {
7535+ return 0;
7536+ }
7537+ if (znode_above_root(coord->node))
7538+ return 1;
7539+
7540+ switch (coord->between) {
7541+ default:
7542+ case INVALID_COORD:
7543+ return 0;
7544+ case EMPTY_NODE:
7545+ if (!node_is_empty(coord->node)) {
7546+ return 0;
7547+ }
7548+ return coord->item_pos == 0 && coord->unit_pos == 0;
7549+
7550+ case BEFORE_UNIT:
7551+ case AFTER_UNIT:
7552+ if (node_is_empty(coord->node) && (coord->item_pos == 0)
7553+ && (coord->unit_pos == 0))
7554+ return 1;
7555+ case AT_UNIT:
7556+ break;
7557+ case AFTER_ITEM:
7558+ case BEFORE_ITEM:
7559+ /* before/after item should not set unit_pos. */
7560+ if (coord->unit_pos != 0) {
7561+ return 0;
7562+ }
7563+ break;
7564+ }
7565+
7566+ if (coord->item_pos >= node_num_items(coord->node)) {
7567+ return 0;
7568+ }
7569+
7570+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7571+ between is set either AFTER_ITEM or BEFORE_ITEM */
7572+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7573+ return 1;
7574+
7575+ if (coord_is_iplug_set(coord) &&
7576+ coord->unit_pos >
7577+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7578+ return 0;
7579+ }
7580+ return 1;
7581+}
7582+#endif
7583+
7584+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7585+ Returns 1 if the new position is does not exist. */
7586+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7587+{
7588+ /* If the node is invalid, leave it. */
7589+ if (coord->between == INVALID_COORD) {
7590+ return 1;
7591+ }
7592+
7593+ /* If the node is empty, set it appropriately. */
7594+ if (items == 0) {
7595+ coord->between = EMPTY_NODE;
7596+ coord_set_item_pos(coord, 0);
7597+ coord->unit_pos = 0;
7598+ return 1;
7599+ }
7600+
7601+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7602+ if (coord->between == EMPTY_NODE) {
7603+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7604+ coord_set_item_pos(coord, 0);
7605+ coord->unit_pos = 0;
7606+ return 0;
7607+ }
7608+
7609+ /* If the item_pos is out-of-range, set it appropriatly. */
7610+ if (coord->item_pos >= items) {
7611+ coord->between = AFTER_ITEM;
7612+ coord_set_item_pos(coord, items - 1);
7613+ coord->unit_pos = 0;
7614+ /* If is_next, return 1 (can't go any further). */
7615+ return is_next;
7616+ }
7617+
7618+ return 0;
7619+}
7620+
7621+/* Advances the coordinate by one unit to the right. If empty, no change. If
7622+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7623+ existing unit. */
7624+int coord_next_unit(coord_t * coord)
7625+{
7626+ unsigned items = coord_num_items(coord);
7627+
7628+ if (coord_adjust_items(coord, items, 1) == 1) {
7629+ return 1;
7630+ }
7631+
7632+ switch (coord->between) {
7633+ case BEFORE_UNIT:
7634+ /* Now it is positioned at the same unit. */
7635+ coord->between = AT_UNIT;
7636+ return 0;
7637+
7638+ case AFTER_UNIT:
7639+ case AT_UNIT:
7640+ /* If it was at or after a unit and there are more units in this item,
7641+ advance to the next one. */
7642+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
7643+ coord->unit_pos += 1;
7644+ coord->between = AT_UNIT;
7645+ return 0;
7646+ }
7647+
7648+ /* Otherwise, it is crossing an item boundary and treated as if it was
7649+ after the current item. */
7650+ coord->between = AFTER_ITEM;
7651+ coord->unit_pos = 0;
7652+ /* FALLTHROUGH */
7653+
7654+ case AFTER_ITEM:
7655+ /* Check for end-of-node. */
7656+ if (coord->item_pos == items - 1) {
7657+ return 1;
7658+ }
7659+
7660+ coord_inc_item_pos(coord);
7661+ coord->unit_pos = 0;
7662+ coord->between = AT_UNIT;
7663+ return 0;
7664+
7665+ case BEFORE_ITEM:
7666+ /* The adjust_items checks ensure that we are valid here. */
7667+ coord->unit_pos = 0;
7668+ coord->between = AT_UNIT;
7669+ return 0;
7670+
7671+ case INVALID_COORD:
7672+ case EMPTY_NODE:
7673+ /* Handled in coord_adjust_items(). */
7674+ break;
7675+ }
7676+
7677+ impossible("jmacd-9902", "unreachable");
7678+ return 0;
7679+}
7680+
7681+/* Advances the coordinate by one item to the right. If empty, no change. If
7682+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7683+ an existing item. */
7684+int coord_next_item(coord_t * coord)
7685+{
7686+ unsigned items = coord_num_items(coord);
7687+
7688+ if (coord_adjust_items(coord, items, 1) == 1) {
7689+ return 1;
7690+ }
7691+
7692+ switch (coord->between) {
7693+ case AFTER_UNIT:
7694+ case AT_UNIT:
7695+ case BEFORE_UNIT:
7696+ case AFTER_ITEM:
7697+ /* Check for end-of-node. */
7698+ if (coord->item_pos == items - 1) {
7699+ coord->between = AFTER_ITEM;
7700+ coord->unit_pos = 0;
7701+ coord_clear_iplug(coord);
7702+ return 1;
7703+ }
7704+
7705+ /* Anywhere in an item, go to the next one. */
7706+ coord->between = AT_UNIT;
7707+ coord_inc_item_pos(coord);
7708+ coord->unit_pos = 0;
7709+ return 0;
7710+
7711+ case BEFORE_ITEM:
7712+ /* The out-of-range check ensures that we are valid here. */
7713+ coord->unit_pos = 0;
7714+ coord->between = AT_UNIT;
7715+ return 0;
7716+ case INVALID_COORD:
7717+ case EMPTY_NODE:
7718+ /* Handled in coord_adjust_items(). */
7719+ break;
7720+ }
7721+
7722+ impossible("jmacd-9903", "unreachable");
7723+ return 0;
7724+}
7725+
7726+/* Advances the coordinate by one unit to the left. If empty, no change. If
7727+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7728+ is an existing unit. */
7729+int coord_prev_unit(coord_t * coord)
7730+{
7731+ unsigned items = coord_num_items(coord);
7732+
7733+ if (coord_adjust_items(coord, items, 0) == 1) {
7734+ return 1;
7735+ }
7736+
7737+ switch (coord->between) {
7738+ case AT_UNIT:
7739+ case BEFORE_UNIT:
7740+ if (coord->unit_pos > 0) {
7741+ coord->unit_pos -= 1;
7742+ coord->between = AT_UNIT;
7743+ return 0;
7744+ }
7745+
7746+ if (coord->item_pos == 0) {
7747+ coord->between = BEFORE_ITEM;
7748+ return 1;
7749+ }
7750+
7751+ coord_dec_item_pos(coord);
7752+ coord->unit_pos = coord_last_unit_pos(coord);
7753+ coord->between = AT_UNIT;
7754+ return 0;
7755+
7756+ case AFTER_UNIT:
7757+ /* What if unit_pos is out-of-range? */
7758+ assert("jmacd-5442",
7759+ coord->unit_pos <= coord_last_unit_pos(coord));
7760+ coord->between = AT_UNIT;
7761+ return 0;
7762+
7763+ case BEFORE_ITEM:
7764+ if (coord->item_pos == 0) {
7765+ return 1;
7766+ }
7767+
7768+ coord_dec_item_pos(coord);
7769+ /* FALLTHROUGH */
7770+
7771+ case AFTER_ITEM:
7772+ coord->between = AT_UNIT;
7773+ coord->unit_pos = coord_last_unit_pos(coord);
7774+ return 0;
7775+
7776+ case INVALID_COORD:
7777+ case EMPTY_NODE:
7778+ break;
7779+ }
7780+
7781+ impossible("jmacd-9904", "unreachable");
7782+ return 0;
7783+}
7784+
7785+/* Advances the coordinate by one item to the left. If empty, no change. If
7786+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7787+ is an existing item. */
7788+int coord_prev_item(coord_t * coord)
7789+{
7790+ unsigned items = coord_num_items(coord);
7791+
7792+ if (coord_adjust_items(coord, items, 0) == 1) {
7793+ return 1;
7794+ }
7795+
7796+ switch (coord->between) {
7797+ case AT_UNIT:
7798+ case AFTER_UNIT:
7799+ case BEFORE_UNIT:
7800+ case BEFORE_ITEM:
7801+
7802+ if (coord->item_pos == 0) {
7803+ coord->between = BEFORE_ITEM;
7804+ coord->unit_pos = 0;
7805+ return 1;
7806+ }
7807+
7808+ coord_dec_item_pos(coord);
7809+ coord->unit_pos = 0;
7810+ coord->between = AT_UNIT;
7811+ return 0;
7812+
7813+ case AFTER_ITEM:
7814+ coord->between = AT_UNIT;
7815+ coord->unit_pos = 0;
7816+ return 0;
7817+
7818+ case INVALID_COORD:
7819+ case EMPTY_NODE:
7820+ break;
7821+ }
7822+
7823+ impossible("jmacd-9905", "unreachable");
7824+ return 0;
7825+}
7826+
7827+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7828+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7829+{
7830+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7831+ if (dir == LEFT_SIDE) {
7832+ coord_init_first_unit(coord, node);
7833+ } else {
7834+ coord_init_last_unit(coord, node);
7835+ }
7836+}
7837+
7838+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7839+ argument. */
7840+/* Audited by: green(2002.06.15) */
7841+int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7842+{
7843+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7844+ if (dir == LEFT_SIDE) {
7845+ return coord_is_before_leftmost(coord);
7846+ } else {
7847+ return coord_is_after_rightmost(coord);
7848+ }
7849+}
7850+
7851+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7852+/* Audited by: green(2002.06.15) */
7853+int coord_sideof_unit(coord_t * coord, sideof dir)
7854+{
7855+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7856+ if (dir == LEFT_SIDE) {
7857+ return coord_prev_unit(coord);
7858+ } else {
7859+ return coord_next_unit(coord);
7860+ }
7861+}
7862+
7863+#if REISER4_DEBUG
7864+int coords_equal(const coord_t * c1, const coord_t * c2)
7865+{
7866+ assert("nikita-2840", c1 != NULL);
7867+ assert("nikita-2841", c2 != NULL);
7868+
7869+ return
7870+ c1->node == c2->node &&
7871+ c1->item_pos == c2->item_pos &&
7872+ c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7873+}
7874+#endif /* REISER4_DEBUG */
7875+
7876+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
7877+ return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
7878+/* Audited by: green(2002.06.15) */
7879+coord_wrt_node coord_wrt(const coord_t * coord)
7880+{
7881+ if (coord_is_before_leftmost(coord)) {
7882+ return COORD_ON_THE_LEFT;
7883+ }
7884+
7885+ if (coord_is_after_rightmost(coord)) {
7886+ return COORD_ON_THE_RIGHT;
7887+ }
7888+
7889+ return COORD_INSIDE;
7890+}
7891+
7892+/* Returns true if the coordinate is positioned after the last item or after the last unit
7893+ of the last item or it is an empty node. */
7894+/* Audited by: green(2002.06.15) */
7895+int coord_is_after_rightmost(const coord_t * coord)
7896+{
7897+ assert("jmacd-7313", coord_check(coord));
7898+
7899+ switch (coord->between) {
7900+ case INVALID_COORD:
7901+ case AT_UNIT:
7902+ case BEFORE_UNIT:
7903+ case BEFORE_ITEM:
7904+ return 0;
7905+
7906+ case EMPTY_NODE:
7907+ return 1;
7908+
7909+ case AFTER_ITEM:
7910+ return (coord->item_pos == node_num_items(coord->node) - 1);
7911+
7912+ case AFTER_UNIT:
7913+ return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7914+ coord->unit_pos == coord_last_unit_pos(coord));
7915+ }
7916+
7917+ impossible("jmacd-9908", "unreachable");
7918+ return 0;
7919+}
7920+
7921+/* Returns true if the coordinate is positioned before the first item or it is an empty
7922+ node. */
7923+int coord_is_before_leftmost(const coord_t * coord)
7924+{
7925+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7926+ necessary to check if coord is set before leftmost
7927+ assert ("jmacd-7313", coord_check (coord)); */
7928+ switch (coord->between) {
7929+ case INVALID_COORD:
7930+ case AT_UNIT:
7931+ case AFTER_ITEM:
7932+ case AFTER_UNIT:
7933+ return 0;
7934+
7935+ case EMPTY_NODE:
7936+ return 1;
7937+
7938+ case BEFORE_ITEM:
7939+ case BEFORE_UNIT:
7940+ return (coord->item_pos == 0) && (coord->unit_pos == 0);
7941+ }
7942+
7943+ impossible("jmacd-9908", "unreachable");
7944+ return 0;
7945+}
7946+
7947+/* Returns true if the coordinate is positioned after a item, before a item, after the
7948+ last unit of an item, before the first unit of an item, or at an empty node. */
7949+/* Audited by: green(2002.06.15) */
7950+int coord_is_between_items(const coord_t * coord)
7951+{
7952+ assert("jmacd-7313", coord_check(coord));
7953+
7954+ switch (coord->between) {
7955+ case INVALID_COORD:
7956+ case AT_UNIT:
7957+ return 0;
7958+
7959+ case AFTER_ITEM:
7960+ case BEFORE_ITEM:
7961+ case EMPTY_NODE:
7962+ return 1;
7963+
7964+ case BEFORE_UNIT:
7965+ return coord->unit_pos == 0;
7966+
7967+ case AFTER_UNIT:
7968+ return coord->unit_pos == coord_last_unit_pos(coord);
7969+ }
7970+
7971+ impossible("jmacd-9908", "unreachable");
7972+ return 0;
7973+}
7974+
7975+#if REISER4_DEBUG
7976+/* Returns true if the coordinates are positioned at adjacent units, regardless of
7977+ before-after or item boundaries. */
7978+int coord_are_neighbors(coord_t * c1, coord_t * c2)
7979+{
7980+ coord_t *left;
7981+ coord_t *right;
7982+
7983+ assert("nikita-1241", c1 != NULL);
7984+ assert("nikita-1242", c2 != NULL);
7985+ assert("nikita-1243", c1->node == c2->node);
7986+ assert("nikita-1244", coord_is_existing_unit(c1));
7987+ assert("nikita-1245", coord_is_existing_unit(c2));
7988+
7989+ left = right = NULL;
7990+ switch (coord_compare(c1, c2)) {
7991+ case COORD_CMP_ON_LEFT:
7992+ left = c1;
7993+ right = c2;
7994+ break;
7995+ case COORD_CMP_ON_RIGHT:
7996+ left = c2;
7997+ right = c1;
7998+ break;
7999+ case COORD_CMP_SAME:
8000+ return 0;
8001+ default:
8002+ wrong_return_value("nikita-1246", "compare_coords()");
8003+ }
8004+ assert("vs-731", left && right);
8005+ if (left->item_pos == right->item_pos) {
8006+ return left->unit_pos + 1 == right->unit_pos;
8007+ } else if (left->item_pos + 1 == right->item_pos) {
8008+ return (left->unit_pos == coord_last_unit_pos(left))
8009+ && (right->unit_pos == 0);
8010+ } else {
8011+ return 0;
8012+ }
8013+}
8014+#endif /* REISER4_DEBUG */
8015+
8016+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8017+ COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
8018+/* Audited by: green(2002.06.15) */
8019+coord_cmp coord_compare(coord_t * c1, coord_t * c2)
8020+{
8021+ assert("vs-209", c1->node == c2->node);
8022+ assert("vs-194", coord_is_existing_unit(c1)
8023+ && coord_is_existing_unit(c2));
8024+
8025+ if (c1->item_pos > c2->item_pos)
8026+ return COORD_CMP_ON_RIGHT;
8027+ if (c1->item_pos < c2->item_pos)
8028+ return COORD_CMP_ON_LEFT;
8029+ if (c1->unit_pos > c2->unit_pos)
8030+ return COORD_CMP_ON_RIGHT;
8031+ if (c1->unit_pos < c2->unit_pos)
8032+ return COORD_CMP_ON_LEFT;
8033+ return COORD_CMP_SAME;
8034+}
8035+
8036+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8037+ non-zero if there is no position to the right. */
8038+int coord_set_to_right(coord_t * coord)
8039+{
8040+ unsigned items = coord_num_items(coord);
8041+
8042+ if (coord_adjust_items(coord, items, 1) == 1) {
8043+ return 1;
8044+ }
8045+
8046+ switch (coord->between) {
8047+ case AT_UNIT:
8048+ return 0;
8049+
8050+ case BEFORE_ITEM:
8051+ case BEFORE_UNIT:
8052+ coord->between = AT_UNIT;
8053+ return 0;
8054+
8055+ case AFTER_UNIT:
8056+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
8057+ coord->unit_pos += 1;
8058+ coord->between = AT_UNIT;
8059+ return 0;
8060+ } else {
8061+
8062+ coord->unit_pos = 0;
8063+
8064+ if (coord->item_pos == items - 1) {
8065+ coord->between = AFTER_ITEM;
8066+ return 1;
8067+ }
8068+
8069+ coord_inc_item_pos(coord);
8070+ coord->between = AT_UNIT;
8071+ return 0;
8072+ }
8073+
8074+ case AFTER_ITEM:
8075+ if (coord->item_pos == items - 1) {
8076+ return 1;
8077+ }
8078+
8079+ coord_inc_item_pos(coord);
8080+ coord->unit_pos = 0;
8081+ coord->between = AT_UNIT;
8082+ return 0;
8083+
8084+ case EMPTY_NODE:
8085+ return 1;
8086+
8087+ case INVALID_COORD:
8088+ break;
8089+ }
8090+
8091+ impossible("jmacd-9920", "unreachable");
8092+ return 0;
8093+}
8094+
8095+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8096+ non-zero if there is no position to the left. */
8097+int coord_set_to_left(coord_t * coord)
8098+{
8099+ unsigned items = coord_num_items(coord);
8100+
8101+ if (coord_adjust_items(coord, items, 0) == 1) {
8102+ return 1;
8103+ }
8104+
8105+ switch (coord->between) {
8106+ case AT_UNIT:
8107+ return 0;
8108+
8109+ case AFTER_UNIT:
8110+ coord->between = AT_UNIT;
8111+ return 0;
8112+
8113+ case AFTER_ITEM:
8114+ coord->between = AT_UNIT;
8115+ coord->unit_pos = coord_last_unit_pos(coord);
8116+ return 0;
8117+
8118+ case BEFORE_UNIT:
8119+ if (coord->unit_pos > 0) {
8120+ coord->unit_pos -= 1;
8121+ coord->between = AT_UNIT;
8122+ return 0;
8123+ } else {
8124+
8125+ if (coord->item_pos == 0) {
8126+ coord->between = BEFORE_ITEM;
8127+ return 1;
8128+ }
8129+
8130+ coord->unit_pos = coord_last_unit_pos(coord);
8131+ coord_dec_item_pos(coord);
8132+ coord->between = AT_UNIT;
8133+ return 0;
8134+ }
8135+
8136+ case BEFORE_ITEM:
8137+ if (coord->item_pos == 0) {
8138+ return 1;
8139+ }
8140+
8141+ coord_dec_item_pos(coord);
8142+ coord->unit_pos = coord_last_unit_pos(coord);
8143+ coord->between = AT_UNIT;
8144+ return 0;
8145+
8146+ case EMPTY_NODE:
8147+ return 1;
8148+
8149+ case INVALID_COORD:
8150+ break;
8151+ }
8152+
8153+ impossible("jmacd-9920", "unreachable");
8154+ return 0;
8155+}
8156+
8157+static const char *coord_tween_tostring(between_enum n)
8158+{
8159+ switch (n) {
8160+ case BEFORE_UNIT:
8161+ return "before unit";
8162+ case BEFORE_ITEM:
8163+ return "before item";
8164+ case AT_UNIT:
8165+ return "at unit";
8166+ case AFTER_UNIT:
8167+ return "after unit";
8168+ case AFTER_ITEM:
8169+ return "after item";
8170+ case EMPTY_NODE:
8171+ return "empty node";
8172+ case INVALID_COORD:
8173+ return "invalid";
8174+ default:
8175+ {
8176+ static char buf[30];
8177+
8178+ sprintf(buf, "unknown: %i", n);
8179+ return buf;
8180+ }
8181+ }
8182+}
8183+
8184+void print_coord(const char *mes, const coord_t * coord, int node)
8185+{
8186+ if (coord == NULL) {
8187+ printk("%s: null\n", mes);
8188+ return;
8189+ }
8190+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
8191+ mes, coord->item_pos, coord->unit_pos,
8192+ coord_tween_tostring(coord->between), coord->iplugid);
8193+}
8194+
8195+int
8196+item_utmost_child_real_block(const coord_t * coord, sideof side,
8197+ reiser4_block_nr * blk)
8198+{
8199+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
8200+ side,
8201+ blk);
8202+}
8203+
8204+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
8205+{
8206+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
8207+}
8208+
8209+/* @count bytes of flow @f got written, update correspondingly f->length,
8210+ f->data and f->key */
8211+void move_flow_forward(flow_t * f, unsigned count)
8212+{
8213+ if (f->data)
8214+ f->data += count;
8215+ f->length -= count;
8216+ set_key_offset(&f->key, get_key_offset(&f->key) + count);
8217+}
8218+
8219+/*
8220+ Local variables:
8221+ c-indentation-style: "K&R"
8222+ mode-name: "LC"
8223+ c-basic-offset: 8
8224+ tab-width: 8
8225+ fill-column: 120
8226+ scroll-step: 1
8227+ End:
8228+*/
8229diff --git a/fs/reiser4/coord.h b/fs/reiser4/coord.h
8230new file mode 100644
8231index 0000000..313e615
8232--- /dev/null
8233+++ b/fs/reiser4/coord.h
8234@@ -0,0 +1,389 @@
8235+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8236+
8237+/* Coords */
8238+
8239+#if !defined( __REISER4_COORD_H__ )
8240+#define __REISER4_COORD_H__
8241+
8242+#include "forward.h"
8243+#include "debug.h"
8244+#include "dformat.h"
8245+#include "key.h"
8246+
8247+/* insertions happen between coords in the tree, so we need some means
8248+ of specifying the sense of betweenness. */
8249+typedef enum {
8250+ BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */
8251+ AT_UNIT,
8252+ AFTER_UNIT,
8253+ BEFORE_ITEM,
8254+ AFTER_ITEM,
8255+ INVALID_COORD,
8256+ EMPTY_NODE,
8257+} between_enum;
8258+
8259+/* location of coord w.r.t. its node */
8260+typedef enum {
8261+ COORD_ON_THE_LEFT = -1,
8262+ COORD_ON_THE_RIGHT = +1,
8263+ COORD_INSIDE = 0
8264+} coord_wrt_node;
8265+
8266+typedef enum {
8267+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
8268+} coord_cmp;
8269+
8270+struct coord {
8271+ /* node in a tree */
8272+ /* 0 */ znode *node;
8273+
8274+ /* position of item within node */
8275+ /* 4 */ pos_in_node_t item_pos;
8276+ /* position of unit within item */
8277+ /* 6 */ pos_in_node_t unit_pos;
8278+ /* optimization: plugin of item is stored in coord_t. Until this was
8279+ implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
8280+ is invalidated (set to 0xff) on each modification of ->item_pos,
8281+ and all such modifications are funneled through coord_*_item_pos()
8282+ functions below.
8283+ */
8284+ /* 8 */ char iplugid;
8285+ /* position of coord w.r.t. to neighboring items and/or units.
8286+ Values are taken from &between_enum above.
8287+ */
8288+ /* 9 */ char between;
8289+ /* padding. It will be added by the compiler anyway to conform to the
8290+ * C language alignment requirements. We keep it here to be on the
8291+ * safe side and to have a clear picture of the memory layout of this
8292+ * structure. */
8293+ /* 10 */ __u16 pad;
8294+ /* 12 */ int offset;
8295+#if REISER4_DEBUG
8296+ unsigned long plug_v;
8297+ unsigned long body_v;
8298+#endif
8299+};
8300+
8301+#define INVALID_PLUGID ((char)((1 << 8) - 1))
8302+#define INVALID_OFFSET -1
8303+
8304+static inline void coord_clear_iplug(coord_t * coord)
8305+{
8306+ assert("nikita-2835", coord != NULL);
8307+ coord->iplugid = INVALID_PLUGID;
8308+ coord->offset = INVALID_OFFSET;
8309+}
8310+
8311+static inline int coord_is_iplug_set(const coord_t * coord)
8312+{
8313+ assert("nikita-2836", coord != NULL);
8314+ return coord->iplugid != INVALID_PLUGID;
8315+}
8316+
8317+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
8318+{
8319+ assert("nikita-2478", coord != NULL);
8320+ coord->item_pos = pos;
8321+ coord_clear_iplug(coord);
8322+}
8323+
8324+static inline void coord_dec_item_pos(coord_t * coord)
8325+{
8326+ assert("nikita-2480", coord != NULL);
8327+ --coord->item_pos;
8328+ coord_clear_iplug(coord);
8329+}
8330+
8331+static inline void coord_inc_item_pos(coord_t * coord)
8332+{
8333+ assert("nikita-2481", coord != NULL);
8334+ ++coord->item_pos;
8335+ coord_clear_iplug(coord);
8336+}
8337+
8338+static inline void coord_add_item_pos(coord_t * coord, int delta)
8339+{
8340+ assert("nikita-2482", coord != NULL);
8341+ coord->item_pos += delta;
8342+ coord_clear_iplug(coord);
8343+}
8344+
8345+static inline void coord_invalid_item_pos(coord_t * coord)
8346+{
8347+ assert("nikita-2832", coord != NULL);
8348+ coord->item_pos = (unsigned short)~0;
8349+ coord_clear_iplug(coord);
8350+}
8351+
8352+/* Reverse a direction. */
8353+static inline sideof sideof_reverse(sideof side)
8354+{
8355+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
8356+}
8357+
8358+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
8359+
8360+ "first" and "last"
8361+ "next" and "prev"
8362+ "before" and "after"
8363+ "leftmost" and "rightmost"
8364+
8365+ But I think the chosen names are decent the way they are.
8366+*/
8367+
8368+/* COORD INITIALIZERS */
8369+
8370+/* Initialize an invalid coordinate. */
8371+extern void coord_init_invalid(coord_t * coord, const znode * node);
8372+
8373+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
8374+
8375+/* Initialize a coordinate to point at the first unit of the first item. If the node is
8376+ empty, it is positioned at the EMPTY_NODE. */
8377+extern void coord_init_first_unit(coord_t * coord, const znode * node);
8378+
8379+/* Initialize a coordinate to point at the last unit of the last item. If the node is
8380+ empty, it is positioned at the EMPTY_NODE. */
8381+extern void coord_init_last_unit(coord_t * coord, const znode * node);
8382+
8383+/* Initialize a coordinate to before the first item. If the node is empty, it is
8384+ positioned at the EMPTY_NODE. */
8385+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
8386+
8387+/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
8388+ at the EMPTY_NODE. */
8389+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
8390+
8391+/* Initialize a coordinate to after last unit in the item. Coord must be set
8392+ already to existing item */
8393+void coord_init_after_item_end(coord_t * coord);
8394+
8395+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
8396+void coord_init_before_item(coord_t *);
8397+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
8398+void coord_init_after_item(coord_t *);
8399+
8400+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
8401+extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
8402+ sideof dir);
8403+
8404+/* Initialize a coordinate by 0s. Used in places where init_coord was used and
8405+ it was not clear how actually
8406+ FIXME-VS: added by vs (2002, june, 8) */
8407+extern void coord_init_zero(coord_t * coord);
8408+
8409+/* COORD METHODS */
8410+
8411+/* after shifting of node content, coord previously set properly may become
8412+ invalid, try to "normalize" it. */
8413+void coord_normalize(coord_t * coord);
8414+
8415+/* Copy a coordinate. */
8416+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
8417+
8418+/* Copy a coordinate without check. */
8419+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
8420+
8421+unsigned coord_num_units(const coord_t * coord);
8422+
8423+/* Return the last valid unit number at the present item (i.e.,
8424+ coord_num_units() - 1). */
8425+static inline unsigned coord_last_unit_pos(const coord_t * coord)
8426+{
8427+ return coord_num_units(coord) - 1;
8428+}
8429+
8430+#if REISER4_DEBUG
8431+/* For assertions only, checks for a valid coordinate. */
8432+extern int coord_check(const coord_t * coord);
8433+
8434+extern unsigned long znode_times_locked(const znode * z);
8435+
8436+static inline void coord_update_v(coord_t * coord)
8437+{
8438+ coord->plug_v = coord->body_v = znode_times_locked(coord->node);
8439+}
8440+#endif
8441+
8442+extern int coords_equal(const coord_t * c1, const coord_t * c2);
8443+
8444+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8445+
8446+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
8447+ return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
8448+extern coord_wrt_node coord_wrt(const coord_t * coord);
8449+
8450+/* Returns true if the coordinates are positioned at adjacent units, regardless of
8451+ before-after or item boundaries. */
8452+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8453+
8454+/* Assuming two coordinates are positioned in the same node, return NCOORD_CMP_ON_RIGHT,
8455+ NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's position relative to c2. */
8456+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8457+
8458+/* COORD PREDICATES */
8459+
8460+/* Returns true if the coord was initialized by coord_init_invalid (). */
8461+extern int coord_is_invalid(const coord_t * coord);
8462+
8463+/* Returns true if the coordinate is positioned at an existing item, not before or after
8464+ an item. It may be placed at, before, or after any unit within the item, whether
8465+ existing or not. If this is true you can call methods of the item plugin. */
8466+extern int coord_is_existing_item(const coord_t * coord);
8467+
8468+/* Returns true if the coordinate is positioned after an item, before an item, after the
8469+ last unit of an item, before the first unit of an item, or at an empty node. */
8470+extern int coord_is_between_items(const coord_t * coord);
8471+
8472+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8473+ unit. */
8474+extern int coord_is_existing_unit(const coord_t * coord);
8475+
8476+/* Returns true if the coordinate is positioned at an empty node. */
8477+extern int coord_is_empty(const coord_t * coord);
8478+
8479+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
8480+ true for empty nodes nor coordinates positioned before the first item. */
8481+extern int coord_is_leftmost_unit(const coord_t * coord);
8482+
8483+/* Returns true if the coordinate is positioned after the last item or after the last unit
8484+ of the last item or it is an empty node. */
8485+extern int coord_is_after_rightmost(const coord_t * coord);
8486+
8487+/* Returns true if the coordinate is positioned before the first item or it is an empty
8488+ node. */
8489+extern int coord_is_before_leftmost(const coord_t * coord);
8490+
8491+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8492+ argument. */
8493+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8494+
8495+/* COORD MODIFIERS */
8496+
8497+/* Advances the coordinate by one unit to the right. If empty, no change. If
8498+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8499+ an existing unit. */
8500+extern int coord_next_unit(coord_t * coord);
8501+
8502+/* Advances the coordinate by one item to the right. If empty, no change. If
8503+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8504+ an existing item. */
8505+extern int coord_next_item(coord_t * coord);
8506+
8507+/* Advances the coordinate by one unit to the left. If empty, no change. If
8508+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8509+ is an existing unit. */
8510+extern int coord_prev_unit(coord_t * coord);
8511+
8512+/* Advances the coordinate by one item to the left. If empty, no change. If
8513+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8514+ is an existing item. */
8515+extern int coord_prev_item(coord_t * coord);
8516+
8517+/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8518+ non-zero if there is no position to the right. */
8519+extern int coord_set_to_right(coord_t * coord);
8520+
8521+/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8522+ non-zero if there is no position to the left. */
8523+extern int coord_set_to_left(coord_t * coord);
8524+
8525+/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8526+ and non-zero if the unit did not exist. */
8527+extern int coord_set_after_unit(coord_t * coord);
8528+
8529+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8530+extern int coord_sideof_unit(coord_t * coord, sideof dir);
8531+
8532+/* iterate over all units in @node */
8533+#define for_all_units( coord, node ) \
8534+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8535+ coord_next_unit( coord ) == 0 ; )
8536+
8537+/* iterate over all items in @node */
8538+#define for_all_items( coord, node ) \
8539+ for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8540+ coord_next_item( coord ) == 0 ; )
8541+
8542+/* COORD/ITEM METHODS */
8543+
8544+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8545+ reiser4_block_nr * blk);
8546+extern int item_utmost_child(const coord_t * coord, sideof side,
8547+ jnode ** child);
8548+
8549+/* a flow is a sequence of bytes being written to or read from the tree. The
8550+ tree will slice the flow into items while storing it into nodes, but all of
8551+ that is hidden from anything outside the tree. */
8552+
8553+struct flow {
8554+ reiser4_key key; /* key of start of flow's sequence of bytes */
8555+ loff_t length; /* length of flow's sequence of bytes */
8556+ char *data; /* start of flow's sequence of bytes */
8557+ int user; /* if 1 data is user space, 0 - kernel space */
8558+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8559+};
8560+
8561+void move_flow_forward(flow_t * f, unsigned count);
8562+
8563+/* &reiser4_item_data - description of data to be inserted or pasted
8564+
8565+ Q: articulate the reasons for the difference between this and flow.
8566+
8567+ A: Besides flow we insert into tree other things: stat data, directory
8568+ entry, etc. To insert them into tree one has to provide this structure. If
8569+ one is going to insert flow - he can use insert_flow, where this structure
8570+ does not have to be created
8571+*/
8572+struct reiser4_item_data {
8573+ /* actual data to be inserted. If NULL, ->create_item() will not
8574+ do xmemcpy itself, leaving this up to the caller. This can
8575+ save some amount of unnecessary memory copying, for example,
8576+ during insertion of stat data.
8577+
8578+ */
8579+ char *data;
8580+ /* 1 if 'char * data' contains pointer to user space and 0 if it is
8581+ kernel space */
8582+ int user;
8583+ /* amount of data we are going to insert or paste */
8584+ int length;
8585+ /* "Arg" is opaque data that is passed down to the
8586+ ->create_item() method of node layout, which in turn
8587+ hands it to the ->create_hook() of item being created. This
8588+ arg is currently used by:
8589+
8590+ . ->create_hook() of internal item
8591+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8592+ . ->paste() method of directory item.
8593+ . ->create_hook() of extent item
8594+
8595+ For internal item, this is left "brother" of new node being
8596+ inserted and it is used to add new node into sibling list
8597+ after parent to it was just inserted into parent.
8598+
8599+ While ->arg does look somewhat of an unnecessary complication,
8600+ it actually saves a lot of headache in many places, because
8601+ all data necessary to insert or paste new data into tree are
8602+ collected in one place, and this eliminates a lot of extra
8603+ argument passing and storing everywhere.
8604+
8605+ */
8606+ void *arg;
8607+ /* plugin of item we are inserting */
8608+ item_plugin *iplug;
8609+};
8610+
8611+/* __REISER4_COORD_H__ */
8612+#endif
8613+
8614+/* Make Linus happy.
8615+ Local variables:
8616+ c-indentation-style: "K&R"
8617+ mode-name: "LC"
8618+ c-basic-offset: 8
8619+ tab-width: 8
8620+ fill-column: 120
8621+ scroll-step: 1
8622+ End:
8623+*/
8624diff --git a/fs/reiser4/debug.c b/fs/reiser4/debug.c
8625new file mode 100644
8626index 0000000..3c55fe8
8627--- /dev/null
8628+++ b/fs/reiser4/debug.c
8629@@ -0,0 +1,308 @@
8630+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8631+ * reiser4/README */
8632+
8633+/* Debugging facilities. */
8634+
8635+/*
8636+ * This file contains generic debugging functions used by reiser4. Roughly
8637+ * following:
8638+ *
8639+ * panicking: reiser4_do_panic(), reiser4_print_prefix().
8640+ *
8641+ * locking:
8642+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8643+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8644+ *
8645+ * error code monitoring (see comment before RETERR macro):
8646+ * reiser4_return_err(), reiser4_report_err().
8647+ *
8648+ * stack back-tracing: fill_backtrace()
8649+ *
8650+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8651+ * reiser4_debugtrap().
8652+ *
8653+ */
8654+
8655+#include "reiser4.h"
8656+#include "context.h"
8657+#include "super.h"
8658+#include "txnmgr.h"
8659+#include "znode.h"
8660+
8661+#include <linux/sysfs.h>
8662+#include <linux/slab.h>
8663+#include <linux/types.h>
8664+#include <linux/fs.h>
8665+#include <linux/spinlock.h>
8666+#include <linux/kallsyms.h>
8667+#include <linux/vmalloc.h>
8668+#include <linux/ctype.h>
8669+#include <linux/sysctl.h>
8670+#include <linux/hardirq.h>
8671+
8672+#if 0
8673+#if REISER4_DEBUG
8674+static void reiser4_report_err(void);
8675+#else
8676+#define reiser4_report_err() noop
8677+#endif
8678+#endif /* 0 */
8679+
8680+/*
8681+ * global buffer where message given to reiser4_panic is formatted.
8682+ */
8683+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8684+
8685+/*
8686+ * lock protecting consistency of panic_buf under concurrent panics
8687+ */
8688+static DEFINE_SPINLOCK(panic_guard);
8689+
8690+/* Your best friend. Call it on each occasion. This is called by
8691+ fs/reiser4/debug.h:reiser4_panic(). */
8692+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8693+{
8694+ static int in_panic = 0;
8695+ va_list args;
8696+
8697+ /*
8698+ * check for recursive panic.
8699+ */
8700+ if (in_panic == 0) {
8701+ in_panic = 1;
8702+
8703+ spin_lock(&panic_guard);
8704+ va_start(args, format);
8705+ vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8706+ va_end(args);
8707+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8708+ spin_unlock(&panic_guard);
8709+
8710+ /*
8711+ * if kernel debugger is configured---drop in. Early dropping
8712+ * into kgdb is not always convenient, because panic message
8713+ * is not yet printed most of the times. But:
8714+ *
8715+ * (1) message can be extracted from printk_buf[]
8716+ * (declared static inside of printk()), and
8717+ *
8718+ * (2) sometimes serial/kgdb combo dies while printing
8719+ * long panic message, so it's more prudent to break into
8720+ * debugger earlier.
8721+ *
8722+ */
8723+ DEBUGON(1);
8724+ }
8725+ /* to make gcc happy about noreturn attribute */
8726+ panic("%s", panic_buf);
8727+}
8728+
8729+#if 0
8730+void
8731+reiser4_print_prefix(const char *level, int reperr, const char *mid,
8732+ const char *function, const char *file, int lineno)
8733+{
8734+ const char *comm;
8735+ int pid;
8736+
8737+ if (unlikely(in_interrupt() || in_irq())) {
8738+ comm = "interrupt";
8739+ pid = 0;
8740+ } else {
8741+ comm = current->comm;
8742+ pid = current->pid;
8743+ }
8744+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8745+ level, comm, pid, function, file, lineno, mid);
8746+ if (reperr)
8747+ reiser4_report_err();
8748+}
8749+#endif /* 0 */
8750+
8751+/* Preemption point: this should be called periodically during long running
8752+ operations (carry, allocate, and squeeze are best examples) */
8753+int reiser4_preempt_point(void)
8754+{
8755+ assert("nikita-3008", reiser4_schedulable());
8756+ cond_resched();
8757+ return signal_pending(current);
8758+}
8759+
8760+#if REISER4_DEBUG
8761+/* Debugging aid: return struct where information about locks taken by current
8762+ thread is accumulated. This can be used to formulate lock ordering
8763+ constraints and various assertions.
8764+
8765+*/
8766+reiser4_lock_counters_info *reiser4_lock_counters(void)
8767+{
8768+ reiser4_context *ctx = get_current_context();
8769+ assert("jmacd-1123", ctx != NULL);
8770+ return &ctx->locks;
8771+}
8772+
8773+/*
8774+ * print human readable information about locks held by the reiser4 context.
8775+ */
8776+static void print_lock_counters(const char *prefix,
8777+ const reiser4_lock_counters_info * info)
8778+{
8779+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8780+ "jload: %i, "
8781+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8782+ "ktxnmgrd: %i, fq: %i\n"
8783+ "inode: %i, "
8784+ "cbk_cache: %i (r:%i,w%i), "
8785+ "eflush: %i, "
8786+ "zlock: %i,\n"
8787+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8788+ "d: %i, x: %i, t: %i\n", prefix,
8789+ info->spin_locked_jnode,
8790+ info->rw_locked_tree, info->read_locked_tree,
8791+ info->write_locked_tree,
8792+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8793+ info->spin_locked_jload,
8794+ info->spin_locked_txnh,
8795+ info->spin_locked_atom, info->spin_locked_stack,
8796+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8797+ info->spin_locked_fq,
8798+ info->spin_locked_inode,
8799+ info->rw_locked_cbk_cache,
8800+ info->read_locked_cbk_cache,
8801+ info->write_locked_cbk_cache,
8802+ info->spin_locked_super_eflush,
8803+ info->spin_locked_zlock,
8804+ info->spin_locked,
8805+ info->long_term_locked_znode,
8806+ info->inode_sem_r, info->inode_sem_w,
8807+ info->d_refs, info->x_refs, info->t_refs);
8808+}
8809+
8810+/* check that no spinlocks are held */
8811+int reiser4_schedulable(void)
8812+{
8813+ if (get_current_context_check() != NULL) {
8814+ if (!LOCK_CNT_NIL(spin_locked)) {
8815+ print_lock_counters("in atomic", reiser4_lock_counters());
8816+ return 0;
8817+ }
8818+ }
8819+ might_sleep();
8820+ return 1;
8821+}
8822+/*
8823+ * return true, iff no locks are held.
8824+ */
8825+int reiser4_no_counters_are_held(void)
8826+{
8827+ reiser4_lock_counters_info *counters;
8828+
8829+ counters = reiser4_lock_counters();
8830+ return
8831+ (counters->spin_locked_zlock == 0) &&
8832+ (counters->spin_locked_jnode == 0) &&
8833+ (counters->rw_locked_tree == 0) &&
8834+ (counters->read_locked_tree == 0) &&
8835+ (counters->write_locked_tree == 0) &&
8836+ (counters->rw_locked_dk == 0) &&
8837+ (counters->read_locked_dk == 0) &&
8838+ (counters->write_locked_dk == 0) &&
8839+ (counters->spin_locked_txnh == 0) &&
8840+ (counters->spin_locked_atom == 0) &&
8841+ (counters->spin_locked_stack == 0) &&
8842+ (counters->spin_locked_txnmgr == 0) &&
8843+ (counters->spin_locked_inode == 0) &&
8844+ (counters->spin_locked == 0) &&
8845+ (counters->long_term_locked_znode == 0) &&
8846+ (counters->inode_sem_r == 0) &&
8847+ (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8848+}
8849+
8850+/*
8851+ * return true, iff transaction commit can be done under locks held by the
8852+ * current thread.
8853+ */
8854+int reiser4_commit_check_locks(void)
8855+{
8856+ reiser4_lock_counters_info *counters;
8857+ int inode_sem_r;
8858+ int inode_sem_w;
8859+ int result;
8860+
8861+ /*
8862+ * inode's read/write semaphore is the only reiser4 lock that can be
8863+ * held during commit.
8864+ */
8865+
8866+ counters = reiser4_lock_counters();
8867+ inode_sem_r = counters->inode_sem_r;
8868+ inode_sem_w = counters->inode_sem_w;
8869+
8870+ counters->inode_sem_r = counters->inode_sem_w = 0;
8871+ result = reiser4_no_counters_are_held();
8872+ counters->inode_sem_r = inode_sem_r;
8873+ counters->inode_sem_w = inode_sem_w;
8874+ return result;
8875+}
8876+
8877+/*
8878+ * fill "error site" in the current reiser4 context. See comment before RETERR
8879+ * macro for more details.
8880+ */
8881+void reiser4_return_err(int code, const char *file, int line)
8882+{
8883+ if (code < 0 && is_in_reiser4_context()) {
8884+ reiser4_context *ctx = get_current_context();
8885+
8886+ if (ctx != NULL) {
8887+ ctx->err.code = code;
8888+ ctx->err.file = file;
8889+ ctx->err.line = line;
8890+ }
8891+ }
8892+}
8893+
8894+#if 0
8895+/*
8896+ * report error information recorded by reiser4_return_err().
8897+ */
8898+static void reiser4_report_err(void)
8899+{
8900+ reiser4_context *ctx = get_current_context_check();
8901+
8902+ if (ctx != NULL) {
8903+ if (ctx->err.code != 0) {
8904+ printk("code: %i at %s:%i\n",
8905+ ctx->err.code, ctx->err.file, ctx->err.line);
8906+ }
8907+ }
8908+}
8909+#endif /* 0 */
8910+
8911+#endif /* REISER4_DEBUG */
8912+
8913+#if KERNEL_DEBUGGER
8914+
8915+/*
8916+ * this functions just drops into kernel debugger. It is a convenient place to
8917+ * put breakpoint in.
8918+ */
8919+void reiser4_debugtrap(void)
8920+{
8921+ /* do nothing. Put break point here. */
8922+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8923+ extern void breakpoint(void);
8924+ breakpoint();
8925+#endif
8926+}
8927+#endif
8928+
8929+/* Make Linus happy.
8930+ Local variables:
8931+ c-indentation-style: "K&R"
8932+ mode-name: "LC"
8933+ c-basic-offset: 8
8934+ tab-width: 8
8935+ fill-column: 120
8936+ End:
8937+*/
8938diff --git a/fs/reiser4/debug.h b/fs/reiser4/debug.h
8939new file mode 100644
8940index 0000000..68e7f31
8941--- /dev/null
8942+++ b/fs/reiser4/debug.h
8943@@ -0,0 +1,350 @@
8944+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8945+
8946+/* Declarations of debug macros. */
8947+
8948+#if !defined( __FS_REISER4_DEBUG_H__ )
8949+#define __FS_REISER4_DEBUG_H__
8950+
8951+#include "forward.h"
8952+#include "reiser4.h"
8953+
8954+/* generic function to produce formatted output, decorating it with
8955+ whatever standard prefixes/postfixes we want. "Fun" is a function
8956+ that will be actually called, can be printk, panic etc.
8957+ This is for use by other debugging macros, not by users. */
8958+#define DCALL(lev, fun, reperr, label, format, ...) \
8959+({ \
8960+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8961+ current->comm, current->pid, __FUNCTION__, \
8962+ __FILE__, __LINE__, label, ## __VA_ARGS__); \
8963+})
8964+
8965+/*
8966+ * cause kernel to crash
8967+ */
8968+#define reiser4_panic(mid, format, ...) \
8969+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8970+
8971+/* print message with indication of current process, file, line and
8972+ function */
8973+#define reiser4_log(label, format, ...) \
8974+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8975+
8976+/* Assertion checked during compilation.
8977+ If "cond" is false (0) we get duplicate case label in switch.
8978+ Use this to check something like famous
8979+ cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8980+ in 3.x journal.c. If cassertion fails you get compiler error,
8981+ so no "maintainer-id".
8982+*/
8983+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
8984+
8985+#define noop do {;} while(0)
8986+
8987+#if REISER4_DEBUG
8988+/* version of info that only actually prints anything when _d_ebugging
8989+ is on */
8990+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8991+/* macro to catch logical errors. Put it into `default' clause of
8992+ switch() statement. */
8993+#define impossible(label, format, ...) \
8994+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8995+/* assert assures that @cond is true. If it is not, reiser4_panic() is
8996+ called. Use this for checking logical consistency and _never_ call
8997+ this to check correctness of external data: disk blocks and user-input . */
8998+#define assert(label, cond) \
8999+({ \
9000+ /* call_on_each_assert(); */ \
9001+ if (cond) { \
9002+ /* put negated check to avoid using !(cond) that would lose \
9003+ * warnings for things like assert(a = b); */ \
9004+ ; \
9005+ } else { \
9006+ DEBUGON(1); \
9007+ reiser4_panic(label, "assertion failed: %s", #cond); \
9008+ } \
9009+})
9010+
9011+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
9012+#define check_me( label, expr ) assert( label, ( expr ) )
9013+
9014+#define ON_DEBUG( exp ) exp
9015+
9016+extern int reiser4_schedulable(void);
9017+extern void call_on_each_assert(void);
9018+
9019+#else
9020+
9021+#define dinfo( format, args... ) noop
9022+#define impossible( label, format, args... ) noop
9023+#define assert( label, cond ) noop
9024+#define check_me( label, expr ) ( ( void ) ( expr ) )
9025+#define ON_DEBUG( exp )
9026+#define reiser4_schedulable() might_sleep()
9027+
9028+/* REISER4_DEBUG */
9029+#endif
9030+
9031+#if REISER4_DEBUG
9032+/* per-thread information about lock acquired by this thread. Used by lock
9033+ * ordering checking in spin_macros.h */
9034+typedef struct reiser4_lock_counters_info {
9035+ int rw_locked_tree;
9036+ int read_locked_tree;
9037+ int write_locked_tree;
9038+
9039+ int rw_locked_dk;
9040+ int read_locked_dk;
9041+ int write_locked_dk;
9042+
9043+ int rw_locked_cbk_cache;
9044+ int read_locked_cbk_cache;
9045+ int write_locked_cbk_cache;
9046+
9047+ int spin_locked_zlock;
9048+ int spin_locked_jnode;
9049+ int spin_locked_jload;
9050+ int spin_locked_txnh;
9051+ int spin_locked_atom;
9052+ int spin_locked_stack;
9053+ int spin_locked_txnmgr;
9054+ int spin_locked_ktxnmgrd;
9055+ int spin_locked_fq;
9056+ int spin_locked_inode;
9057+ int spin_locked_super_eflush;
9058+ int spin_locked;
9059+ int long_term_locked_znode;
9060+
9061+ int inode_sem_r;
9062+ int inode_sem_w;
9063+
9064+ int d_refs;
9065+ int x_refs;
9066+ int t_refs;
9067+} reiser4_lock_counters_info;
9068+
9069+extern reiser4_lock_counters_info *reiser4_lock_counters(void);
9070+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
9071+
9072+/* increment lock-counter @counter, if present */
9073+#define LOCK_CNT_INC(counter) \
9074+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
9075+
9076+/* decrement lock-counter @counter, if present */
9077+#define LOCK_CNT_DEC(counter) \
9078+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
9079+
9080+/* check that lock-counter is zero. This is for use in assertions */
9081+#define LOCK_CNT_NIL(counter) \
9082+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
9083+
9084+/* check that lock-counter is greater than zero. This is for use in
9085+ * assertions */
9086+#define LOCK_CNT_GTZ(counter) \
9087+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
9088+#define LOCK_CNT_LT(counter,n) \
9089+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
9090+
9091+#else /* REISER4_DEBUG */
9092+
9093+/* no-op versions on the above */
9094+
9095+typedef struct reiser4_lock_counters_info {
9096+} reiser4_lock_counters_info;
9097+
9098+#define reiser4_lock_counters() ((reiser4_lock_counters_info *)NULL)
9099+#define LOCK_CNT_INC(counter) noop
9100+#define LOCK_CNT_DEC(counter) noop
9101+#define LOCK_CNT_NIL(counter) (1)
9102+#define LOCK_CNT_GTZ(counter) (1)
9103+#define LOCK_CNT_LT(counter,n) (1)
9104+
9105+#endif /* REISER4_DEBUG */
9106+
9107+#define assert_spin_not_locked(lock) BUG_ON(0)
9108+#define assert_rw_write_locked(lock) BUG_ON(0)
9109+#define assert_rw_read_locked(lock) BUG_ON(0)
9110+#define assert_rw_locked(lock) BUG_ON(0)
9111+#define assert_rw_not_write_locked(lock) BUG_ON(0)
9112+#define assert_rw_not_read_locked(lock) BUG_ON(0)
9113+#define assert_rw_not_locked(lock) BUG_ON(0)
9114+
9115+/* flags controlling debugging behavior. Are set through debug_flags=N mount
9116+ option. */
9117+typedef enum {
9118+ /* print a lot of information during panic. When this is on all jnodes
9119+ * are listed. This can be *very* large output. Usually you don't want
9120+ * this. Especially over serial line. */
9121+ REISER4_VERBOSE_PANIC = 0x00000001,
9122+ /* print a lot of information during umount */
9123+ REISER4_VERBOSE_UMOUNT = 0x00000002,
9124+ /* print gathered statistics on umount */
9125+ REISER4_STATS_ON_UMOUNT = 0x00000004,
9126+ /* check node consistency */
9127+ REISER4_CHECK_NODE = 0x00000008
9128+} reiser4_debug_flags;
9129+
9130+extern int is_in_reiser4_context(void);
9131+
9132+/*
9133+ * evaluate expression @e only if with reiser4 context
9134+ */
9135+#define ON_CONTEXT(e) do { \
9136+ if(is_in_reiser4_context()) { \
9137+ e; \
9138+ } } while(0)
9139+
9140+/*
9141+ * evaluate expression @e only when within reiser4_context and debugging is
9142+ * on.
9143+ */
9144+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
9145+
9146+/*
9147+ * complain about unexpected function result and crash. Used in "default"
9148+ * branches of switch statements and alike to assert that invalid results are
9149+ * not silently ignored.
9150+ */
9151+#define wrong_return_value( label, function ) \
9152+ impossible( label, "wrong return value from " function )
9153+
9154+/* Issue different types of reiser4 messages to the console */
9155+#define warning( label, format, ... ) \
9156+ DCALL( KERN_WARNING, \
9157+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
9158+#define notice( label, format, ... ) \
9159+ DCALL( KERN_NOTICE, \
9160+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
9161+
9162+/* mark not yet implemented functionality */
9163+#define not_yet( label, format, ... ) \
9164+ reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
9165+
9166+extern void reiser4_do_panic(const char *format, ...)
9167+ __attribute__ ((noreturn, format(printf, 1, 2)));
9168+
9169+extern int reiser4_preempt_point(void);
9170+extern void reiser4_print_stats(void);
9171+
9172+#if REISER4_DEBUG
9173+extern int reiser4_no_counters_are_held(void);
9174+extern int reiser4_commit_check_locks(void);
9175+#else
9176+#define reiser4_no_counters_are_held() (1)
9177+#define reiser4_commit_check_locks() (1)
9178+#endif
9179+
9180+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
9181+#define IS_POW(i) \
9182+({ \
9183+ typeof(i) __i; \
9184+ \
9185+ __i = (i); \
9186+ !(__i & (__i - 1)); \
9187+})
9188+
9189+#define KERNEL_DEBUGGER (1)
9190+
9191+#if KERNEL_DEBUGGER
9192+
9193+extern void reiser4_debugtrap(void);
9194+
9195+/*
9196+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
9197+ * kgdb is not compiled in, do nothing.
9198+ */
9199+#define DEBUGON(cond) \
9200+({ \
9201+ if (unlikely(cond)) \
9202+ reiser4_debugtrap(); \
9203+})
9204+#else
9205+#define DEBUGON(cond) noop
9206+#endif
9207+
9208+/*
9209+ * Error code tracing facility. (Idea is borrowed from XFS code.)
9210+ *
9211+ * Suppose some strange and/or unexpected code is returned from some function
9212+ * (for example, write(2) returns -EEXIST). It is possible to place a
9213+ * breakpoint in the reiser4_write(), but it is too late here. How to find out
9214+ * in what particular place -EEXIST was generated first?
9215+ *
9216+ * In reiser4 all places where actual error codes are produced (that is,
9217+ * statements of the form
9218+ *
9219+ * return -EFOO; // (1), or
9220+ *
9221+ * result = -EFOO; // (2)
9222+ *
9223+ * are replaced with
9224+ *
9225+ * return RETERR(-EFOO); // (1a), and
9226+ *
9227+ * result = RETERR(-EFOO); // (2a) respectively
9228+ *
9229+ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
9230+ * printed in error and warning messages. Moreover, it's possible to put a
9231+ * conditional breakpoint in reiser4_return_err (low-level function called
9232+ * by RETERR() to do the actual work) to break into debugger immediately
9233+ * when particular error happens.
9234+ *
9235+ */
9236+
9237+#if REISER4_DEBUG
9238+
9239+/*
9240+ * data-type to store information about where error happened ("error site").
9241+ */
9242+typedef struct err_site {
9243+ int code; /* error code */
9244+ const char *file; /* source file, filled by __FILE__ */
9245+ int line; /* source file line, filled by __LINE__ */
9246+} err_site;
9247+
9248+extern void reiser4_return_err(int code, const char *file, int line);
9249+
9250+/*
9251+ * fill &get_current_context()->err_site with error information.
9252+ */
9253+#define RETERR(code) \
9254+({ \
9255+ typeof(code) __code; \
9256+ \
9257+ __code = (code); \
9258+ reiser4_return_err(__code, __FILE__, __LINE__); \
9259+ __code; \
9260+})
9261+
9262+#else
9263+
9264+/*
9265+ * no-op versions of the above
9266+ */
9267+
9268+typedef struct err_site {
9269+} err_site;
9270+#define RETERR(code) code
9271+#endif
9272+
9273+#if REISER4_LARGE_KEY
9274+/*
9275+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
9276+ */
9277+#define ON_LARGE_KEY(...) __VA_ARGS__
9278+#else
9279+#define ON_LARGE_KEY(...)
9280+#endif
9281+
9282+/* __FS_REISER4_DEBUG_H__ */
9283+#endif
9284+
9285+/* Make Linus happy.
9286+ Local variables:
9287+ c-indentation-style: "K&R"
9288+ mode-name: "LC"
9289+ c-basic-offset: 8
9290+ tab-width: 8
9291+ fill-column: 120
9292+ End:
9293+*/
9294diff --git a/fs/reiser4/dformat.h b/fs/reiser4/dformat.h
9295new file mode 100644
9296index 0000000..8bca29e
9297--- /dev/null
9298+++ b/fs/reiser4/dformat.h
9299@@ -0,0 +1,70 @@
9300+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9301+
9302+/* Formats of on-disk data and conversion functions. */
9303+
9304+/* put all item formats in the files describing the particular items,
9305+ our model is, everything you need to do to add an item to reiser4,
9306+ (excepting the changes to the plugin that uses the item which go
9307+ into the file defining that plugin), you put into one file. */
9308+/* Data on disk are stored in little-endian format.
9309+ To declare fields of on-disk structures, use d8, d16, d32 and d64.
9310+ d??tocpu() and cputod??() to convert. */
9311+
9312+#if !defined( __FS_REISER4_DFORMAT_H__ )
9313+#define __FS_REISER4_DFORMAT_H__
9314+
9315+#include <asm/byteorder.h>
9316+#include <asm/unaligned.h>
9317+#include <linux/types.h>
9318+
9319+typedef __u8 d8;
9320+typedef __le16 d16;
9321+typedef __le32 d32;
9322+typedef __le64 d64;
9323+
9324+#define PACKED __attribute__((packed))
9325+
9326+/* data-type for block number */
9327+typedef __u64 reiser4_block_nr;
9328+
9329+/* data-type for block number on disk, disk format */
9330+typedef __le64 reiser4_dblock_nr;
9331+
9332+/**
9333+ * disk_addr_eq - compare disk addresses
9334+ * @b1: pointer to block number to compare
9335+ * @b2: pointer to block number to compare
9336+ *
9337+ * Returns true if disk addresses are the same
9338+ */
9339+static inline int disk_addr_eq(const reiser4_block_nr *b1,
9340+ const reiser4_block_nr * b2)
9341+{
9342+ assert("nikita-1033", b1 != NULL);
9343+ assert("nikita-1266", b2 != NULL);
9344+
9345+ return !memcmp(b1, b2, sizeof *b1);
9346+}
9347+
9348+/* structure of master reiser4 super block */
9349+typedef struct reiser4_master_sb {
9350+ char magic[16]; /* "ReIsEr4" */
9351+ __le16 disk_plugin_id; /* id of disk layout plugin */
9352+ __le16 blocksize;
9353+ char uuid[16]; /* unique id */
9354+ char label[16]; /* filesystem label */
9355+ __le64 diskmap; /* location of the diskmap. 0 if not present */
9356+} reiser4_master_sb;
9357+
9358+/* __FS_REISER4_DFORMAT_H__ */
9359+#endif
9360+
9361+/*
9362+ * Local variables:
9363+ * c-indentation-style: "K&R"
9364+ * mode-name: "LC"
9365+ * c-basic-offset: 8
9366+ * tab-width: 8
9367+ * fill-column: 79
9368+ * End:
9369+ */
9370diff --git a/fs/reiser4/dscale.c b/fs/reiser4/dscale.c
9371new file mode 100644
9372index 0000000..a9bc224
9373--- /dev/null
9374+++ b/fs/reiser4/dscale.c
9375@@ -0,0 +1,174 @@
9376+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9377+ * reiser4/README */
9378+
9379+/* Scalable on-disk integers */
9380+
9381+/*
9382+ * Various on-disk structures contain integer-like structures. Stat-data
9383+ * contain [yes, "data" is plural, check the dictionary] file size, link
9384+ * count; extent unit contains extent width etc. To accommodate for general
9385+ * case enough space is reserved to keep largest possible value. 64 bits in
9386+ * all cases above. But in overwhelming majority of cases numbers actually
9387+ * stored in these fields will be comparatively small and reserving 8 bytes is
9388+ * a waste of precious disk bandwidth.
9389+ *
9390+ * Scalable integers are one way to solve this problem. dscale_write()
9391+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
9392+ * depending on the magnitude of the value supplied. dscale_read() reads value
9393+ * previously stored by dscale_write().
9394+ *
9395+ * dscale_write() produces format not completely unlike of UTF: two highest
9396+ * bits of the first byte are used to store "tag". One of 4 possible tag
9397+ * values is chosen depending on the number being encoded:
9398+ *
9399+ * 0 ... 0x3f => 0 [table 1]
9400+ * 0x40 ... 0x3fff => 1
9401+ * 0x4000 ... 0x3fffffff => 2
9402+ * 0x40000000 ... 0xffffffffffffffff => 3
9403+ *
9404+ * (see dscale_range() function)
9405+ *
9406+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
9407+ * to be stored, so in this case there is no place in the first byte to store
9408+ * tag. For such values tag is stored in an extra 9th byte.
9409+ *
9410+ * As _highest_ bits are used for the test (which is natural) scaled integers
9411+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
9412+ * uses LITTLE-ENDIAN.
9413+ *
9414+ */
9415+
9416+#include "debug.h"
9417+#include "dscale.h"
9418+
9419+/* return tag of scaled integer stored at @address */
9420+static int gettag(const unsigned char *address)
9421+{
9422+ /* tag is stored in two highest bits */
9423+ return (*address) >> 6;
9424+}
9425+
9426+/* clear tag from value. Clear tag embedded into @value. */
9427+static void cleartag(__u64 * value, int tag)
9428+{
9429+ /*
9430+ * W-w-what ?!
9431+ *
9432+ * Actually, this is rather simple: @value passed here was read by
9433+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
9434+ * zeroes. Tag is still stored in the highest (arithmetically)
9435+ * non-zero bits of @value, but relative position of tag within __u64
9436+ * depends on @tag.
9437+ *
9438+ * For example if @tag is 0, it's stored in the 2 highest bits of lowest
9439+ * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
9440+ *
9441+ * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
9442+ * and its offset is (2 * 8) - 2 == 14 bits.
9443+ *
9444+ * See table 1 above for details.
9445+ *
9446+ * All these cases are captured by the formula:
9447+ */
9448+ *value &= ~(3 << (((1 << tag) << 3) - 2));
9449+ /*
9450+ * That is, clear two (3 == 0t11) bits at the offset
9451+ *
9452+ * 8 * (2 ^ tag) - 2,
9453+ *
9454+ * that is, two highest bits of (2 ^ tag)-th byte of @value.
9455+ */
9456+}
9457+
9458+/* return tag for @value. See table 1 above for details. */
9459+static int dscale_range(__u64 value)
9460+{
9461+ if (value > 0x3fffffff)
9462+ return 3;
9463+ if (value > 0x3fff)
9464+ return 2;
9465+ if (value > 0x3f)
9466+ return 1;
9467+ return 0;
9468+}
9469+
9470+/* restore value stored at @address by dscale_write() and return number of
9471+ * bytes consumed */
9472+int dscale_read(unsigned char *address, __u64 * value)
9473+{
9474+ int tag;
9475+
9476+ /* read tag */
9477+ tag = gettag(address);
9478+ switch (tag) {
9479+ case 3:
9480+ /* In this case tag is stored in an extra byte, skip this byte
9481+ * and decode value stored in the next 8 bytes.*/
9482+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9483+ /* worst case: 8 bytes for value itself plus one byte for
9484+ * tag. */
9485+ return 9;
9486+ case 0:
9487+ *value = get_unaligned(address);
9488+ break;
9489+ case 1:
9490+ *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9491+ break;
9492+ case 2:
9493+ *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9494+ break;
9495+ default:
9496+ return RETERR(-EIO);
9497+ }
9498+ /* clear tag embedded into @value */
9499+ cleartag(value, tag);
9500+ /* number of bytes consumed is (2 ^ tag)---see table 1. */
9501+ return 1 << tag;
9502+}
9503+
9504+/* store @value at @address and return number of bytes consumed */
9505+int dscale_write(unsigned char *address, __u64 value)
9506+{
9507+ int tag;
9508+ int shift;
9509+ __be64 v;
9510+ unsigned char *valarr;
9511+
9512+ tag = dscale_range(value);
9513+ v = __cpu_to_be64(value);
9514+ valarr = (unsigned char *)&v;
9515+ shift = (tag == 3) ? 1 : 0;
9516+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9517+ *address |= (tag << 6);
9518+ return shift + (1 << tag);
9519+}
9520+
9521+/* number of bytes required to store @value */
9522+int dscale_bytes(__u64 value)
9523+{
9524+ int bytes;
9525+
9526+ bytes = 1 << dscale_range(value);
9527+ if (bytes == 8)
9528+ ++bytes;
9529+ return bytes;
9530+}
9531+
9532+/* returns true if @value and @other require the same number of bytes to be
9533+ * stored. Used to detect when data structure (like stat-data) has to be
9534+ * expanded or contracted. */
9535+int dscale_fit(__u64 value, __u64 other)
9536+{
9537+ return dscale_range(value) == dscale_range(other);
9538+}
9539+
9540+/* Make Linus happy.
9541+ Local variables:
9542+ c-indentation-style: "K&R"
9543+ mode-name: "LC"
9544+ c-basic-offset: 8
9545+ tab-width: 8
9546+ fill-column: 120
9547+ scroll-step: 1
9548+ End:
9549+*/
9550diff --git a/fs/reiser4/dscale.h b/fs/reiser4/dscale.h
9551new file mode 100644
9552index 0000000..545e111
9553--- /dev/null
9554+++ b/fs/reiser4/dscale.h
9555@@ -0,0 +1,27 @@
9556+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9557+ * reiser4/README */
9558+
9559+/* Scalable on-disk integers. See dscale.c for details. */
9560+
9561+#if !defined( __FS_REISER4_DSCALE_H__ )
9562+#define __FS_REISER4_DSCALE_H__
9563+
9564+#include "dformat.h"
9565+
9566+extern int dscale_read(unsigned char *address, __u64 * value);
9567+extern int dscale_write(unsigned char *address, __u64 value);
9568+extern int dscale_bytes(__u64 value);
9569+extern int dscale_fit(__u64 value, __u64 other);
9570+
9571+/* __FS_REISER4_DSCALE_H__ */
9572+#endif
9573+
9574+/* Make Linus happy.
9575+ Local variables:
9576+ c-indentation-style: "K&R"
9577+ mode-name: "LC"
9578+ c-basic-offset: 8
9579+ tab-width: 8
9580+ fill-column: 120
9581+ End:
9582+*/
9583diff --git a/fs/reiser4/entd.c b/fs/reiser4/entd.c
9584new file mode 100644
9585index 0000000..1be9fff
9586--- /dev/null
9587+++ b/fs/reiser4/entd.c
9588@@ -0,0 +1,335 @@
9589+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9590+ * reiser4/README */
9591+
9592+/* Ent daemon. */
9593+
9594+#include "debug.h"
9595+#include "txnmgr.h"
9596+#include "tree.h"
9597+#include "entd.h"
9598+#include "super.h"
9599+#include "context.h"
9600+#include "reiser4.h"
9601+#include "vfs_ops.h"
9602+#include "page_cache.h"
9603+#include "inode.h"
9604+
9605+#include <linux/sched.h> /* struct task_struct */
9606+#include <linux/suspend.h>
9607+#include <linux/kernel.h>
9608+#include <linux/writeback.h>
9609+#include <linux/time.h> /* INITIAL_JIFFIES */
9610+#include <linux/backing-dev.h> /* bdi_write_congested */
9611+#include <linux/wait.h>
9612+#include <linux/kthread.h>
9613+#include <linux/freezer.h>
9614+
9615+#define DEF_PRIORITY 12
9616+#define MAX_ENTD_ITERS 10
9617+
9618+static void entd_flush(struct super_block *, struct wbq *);
9619+static int entd(void *arg);
9620+
9621+/*
9622+ * set ->comm field of ent thread to make its state visible to the user level
9623+ */
9624+#define entd_set_comm(state) \
9625+ snprintf(current->comm, sizeof(current->comm), \
9626+ "ent:%s%s", super->s_id, (state))
9627+
9628+/**
9629+ * reiser4_init_entd - initialize entd context and start kernel daemon
9630+ * @super: super block to start ent thread for
9631+ *
9632+ * Creates entd contexts, starts kernel thread and waits until it
9633+ * initializes.
9634+ */
9635+int reiser4_init_entd(struct super_block *super)
9636+{
9637+ entd_context *ctx;
9638+
9639+ assert("nikita-3104", super != NULL);
9640+
9641+ ctx = get_entd_context(super);
9642+
9643+ memset(ctx, 0, sizeof *ctx);
9644+ spin_lock_init(&ctx->guard);
9645+ init_waitqueue_head(&ctx->wait);
9646+#if REISER4_DEBUG
9647+ INIT_LIST_HEAD(&ctx->flushers_list);
9648+#endif
9649+ /* lists of writepage requests */
9650+ INIT_LIST_HEAD(&ctx->todo_list);
9651+ INIT_LIST_HEAD(&ctx->done_list);
9652+ /* start entd */
9653+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9654+ if (IS_ERR(ctx->tsk))
9655+ return PTR_ERR(ctx->tsk);
9656+ return 0;
9657+}
9658+
9659+static void put_wbq(struct wbq *rq)
9660+{
9661+ iput(rq->mapping->host);
9662+ complete(&rq->completion);
9663+}
9664+
9665+/* ent should be locked */
9666+static struct wbq *__get_wbq(entd_context * ent)
9667+{
9668+ struct wbq *wbq;
9669+
9670+ if (list_empty(&ent->todo_list))
9671+ return NULL;
9672+
9673+ ent->nr_todo_reqs --;
9674+ wbq = list_entry(ent->todo_list.next, struct wbq, link);
9675+ list_del_init(&wbq->link);
9676+ return wbq;
9677+}
9678+
9679+/* ent thread function */
9680+static int entd(void *arg)
9681+{
9682+ struct super_block *super;
9683+ entd_context *ent;
9684+ int done = 0;
9685+
9686+ super = arg;
9687+ /* do_fork() just copies task_struct into the new
9688+ thread. ->fs_context shouldn't be copied of course. This shouldn't
9689+ be a problem for the rest of the code though.
9690+ */
9691+ current->journal_info = NULL;
9692+
9693+ ent = get_entd_context(super);
9694+
9695+ while (!done) {
9696+ try_to_freeze();
9697+
9698+ spin_lock(&ent->guard);
9699+ while (ent->nr_todo_reqs != 0) {
9700+ struct wbq *rq;
9701+
9702+ assert("", list_empty(&ent->done_list));
9703+
9704+ /* take request from the queue head */
9705+ rq = __get_wbq(ent);
9706+ assert("", rq != NULL);
9707+ ent->cur_request = rq;
9708+ spin_unlock(&ent->guard);
9709+
9710+ entd_set_comm("!");
9711+ entd_flush(super, rq);
9712+
9713+ put_wbq(rq);
9714+
9715+ /*
9716+ * wakeup all requestors and iput their inodes
9717+ */
9718+ spin_lock(&ent->guard);
9719+ while (!list_empty(&ent->done_list)) {
9720+ rq = list_entry(ent->done_list.next, struct wbq, link);
9721+ list_del_init(&rq->link);
9722+ ent->nr_done_reqs --;
9723+ spin_unlock(&ent->guard);
9724+ assert("", rq->written == 1);
9725+ put_wbq(rq);
9726+ spin_lock(&ent->guard);
9727+ }
9728+ }
9729+ spin_unlock(&ent->guard);
9730+
9731+ entd_set_comm(".");
9732+
9733+ {
9734+ DEFINE_WAIT(__wait);
9735+
9736+ do {
9737+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9738+ if (kthread_should_stop()) {
9739+ done = 1;
9740+ break;
9741+ }
9742+ if (ent->nr_todo_reqs != 0)
9743+ break;
9744+ schedule();
9745+ } while (0);
9746+ finish_wait(&ent->wait, &__wait);
9747+ }
9748+ }
9749+ BUG_ON(ent->nr_todo_reqs != 0);
9750+ return 0;
9751+}
9752+
9753+/**
9754+ * reiser4_done_entd - stop entd kernel thread
9755+ * @super: super block to stop ent thread for
9756+ *
9757+ * It is called on umount. Sends stop signal to entd and wait until it handles
9758+ * it.
9759+ */
9760+void reiser4_done_entd(struct super_block *super)
9761+{
9762+ entd_context *ent;
9763+
9764+ assert("nikita-3103", super != NULL);
9765+
9766+ ent = get_entd_context(super);
9767+ assert("zam-1055", ent->tsk != NULL);
9768+ kthread_stop(ent->tsk);
9769+}
9770+
9771+/* called at the beginning of jnode_flush to register flusher thread with ent
9772+ * daemon */
9773+void reiser4_enter_flush(struct super_block *super)
9774+{
9775+ entd_context *ent;
9776+
9777+ assert("zam-1029", super != NULL);
9778+ ent = get_entd_context(super);
9779+
9780+ assert("zam-1030", ent != NULL);
9781+
9782+ spin_lock(&ent->guard);
9783+ ent->flushers++;
9784+#if REISER4_DEBUG
9785+ list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9786+#endif
9787+ spin_unlock(&ent->guard);
9788+}
9789+
9790+/* called at the end of jnode_flush */
9791+void reiser4_leave_flush(struct super_block *super)
9792+{
9793+ entd_context *ent;
9794+ int wake_up_ent;
9795+
9796+ assert("zam-1027", super != NULL);
9797+ ent = get_entd_context(super);
9798+
9799+ assert("zam-1028", ent != NULL);
9800+
9801+ spin_lock(&ent->guard);
9802+ ent->flushers--;
9803+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9804+#if REISER4_DEBUG
9805+ list_del_init(&get_current_context()->flushers_link);
9806+#endif
9807+ spin_unlock(&ent->guard);
9808+ if (wake_up_ent)
9809+ wake_up(&ent->wait);
9810+}
9811+
9812+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9813+
9814+static void entd_flush(struct super_block *super, struct wbq *rq)
9815+{
9816+ reiser4_context ctx;
9817+ int tmp;
9818+
9819+ init_stack_context(&ctx, super);
9820+ ctx.entd = 1;
9821+ ctx.gfp_mask = GFP_NOFS;
9822+
9823+ rq->wbc->range_start = page_offset(rq->page);
9824+ rq->wbc->range_end = rq->wbc->range_start +
9825+ (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9826+ tmp = rq->wbc->nr_to_write;
9827+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9828+
9829+ if (rq->wbc->nr_to_write > 0) {
9830+ rq->wbc->range_start = 0;
9831+ rq->wbc->range_end = 0;
9832+ generic_sync_sb_inodes(super, rq->wbc);
9833+ }
9834+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9835+ reiser4_writeout(super, rq->wbc);
9836+
9837+ context_set_commit_async(&ctx);
9838+ reiser4_exit_context(&ctx);
9839+}
9840+
9841+/**
9842+ * write_page_by_ent - ask entd thread to flush this page as part of slum
9843+ * @page: page to be written
9844+ * @wbc: writeback control passed to reiser4_writepage
9845+ *
9846+ * Creates a request, puts it on entd list of requests, wakeups entd if
9847+ * necessary, waits until entd completes with the request.
9848+ */
9849+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9850+{
9851+ struct super_block *sb;
9852+ struct inode *inode;
9853+ entd_context *ent;
9854+ struct wbq rq;
9855+
9856+ assert("", PageLocked(page));
9857+ assert("", page->mapping != NULL);
9858+
9859+ sb = page->mapping->host->i_sb;
9860+ ent = get_entd_context(sb);
9861+ assert("", ent && ent->done == 0);
9862+
9863+ /*
9864+ * we are going to unlock page and ask ent thread to write the
9865+ * page. Re-dirty page before unlocking so that if ent thread fails to
9866+ * write it - it will remain dirty
9867+ */
9868+ reiser4_set_page_dirty_internal(page);
9869+
9870+ /*
9871+ * pin inode in memory, unlock page, entd_flush will iput. We can not
9872+ * iput here because we can not allow delete_inode to be called here
9873+ */
9874+ inode = igrab(page->mapping->host);
9875+ unlock_page(page);
9876+ if (inode == NULL)
9877+ /* inode is getting freed */
9878+ return 0;
9879+
9880+ /* init wbq */
9881+ INIT_LIST_HEAD(&rq.link);
9882+ rq.magic = WBQ_MAGIC;
9883+ rq.wbc = wbc;
9884+ rq.page = page;
9885+ rq.mapping = inode->i_mapping;
9886+ rq.node = NULL;
9887+ rq.written = 0;
9888+ init_completion(&rq.completion);
9889+
9890+ /* add request to entd's list of writepage requests */
9891+ spin_lock(&ent->guard);
9892+ ent->nr_todo_reqs++;
9893+ list_add_tail(&rq.link, &ent->todo_list);
9894+ if (ent->nr_todo_reqs == 1)
9895+ wake_up(&ent->wait);
9896+
9897+ spin_unlock(&ent->guard);
9898+
9899+ /* wait until entd finishes */
9900+ wait_for_completion(&rq.completion);
9901+
9902+ if (rq.written)
9903+ /* Eventually ENTD has written the page to disk. */
9904+ return 0;
9905+ return 0;
9906+}
9907+
9908+int wbq_available(void)
9909+{
9910+ struct super_block *sb = reiser4_get_current_sb();
9911+ entd_context *ent = get_entd_context(sb);
9912+ return ent->nr_todo_reqs;
9913+}
9914+
9915+/*
9916+ * Local variables:
9917+ * c-indentation-style: "K&R"
9918+ * mode-name: "LC"
9919+ * c-basic-offset: 8
9920+ * tab-width: 8
9921+ * fill-column: 79
9922+ * End:
9923+ */
9924diff --git a/fs/reiser4/entd.h b/fs/reiser4/entd.h
9925new file mode 100644
9926index 0000000..4f79a57
9927--- /dev/null
9928+++ b/fs/reiser4/entd.h
9929@@ -0,0 +1,90 @@
9930+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9931+
9932+/* Ent daemon. */
9933+
9934+#ifndef __ENTD_H__
9935+#define __ENTD_H__
9936+
9937+#include "context.h"
9938+
9939+#include <linux/fs.h>
9940+#include <linux/completion.h>
9941+#include <linux/wait.h>
9942+#include <linux/spinlock.h>
9943+#include <linux/sched.h> /* for struct task_struct */
9944+
9945+#define WBQ_MAGIC 0x7876dc76
9946+
9947+/* write-back request. */
9948+struct wbq {
9949+ int magic;
9950+ struct list_head link; /* list head of this list is in entd context */
9951+ struct writeback_control *wbc;
9952+ struct page *page;
9953+ struct address_space *mapping;
9954+ struct completion completion;
9955+ jnode *node; /* set if ent thread captured requested page */
9956+ int written; /* set if ent thread wrote requested page */
9957+};
9958+
9959+/* ent-thread context. This is used to synchronize starting/stopping ent
9960+ * threads. */
9961+typedef struct entd_context {
9962+ /* wait queue that ent thread waits on for more work. It's
9963+ * signaled by write_page_by_ent(). */
9964+ wait_queue_head_t wait;
9965+ /* spinlock protecting other fields */
9966+ spinlock_t guard;
9967+ /* ent thread */
9968+ struct task_struct *tsk;
9969+ /* set to indicate that ent thread should leave. */
9970+ int done;
9971+ /* counter of active flushers */
9972+ int flushers;
9973+ /*
9974+ * when reiser4_writepage asks entd to write a page - it adds struct
9975+ * wbq to this list
9976+ */
9977+ struct list_head todo_list;
9978+ /* number of elements on the above list */
9979+ int nr_todo_reqs;
9980+
9981+ struct wbq *cur_request;
9982+ /*
9983+ * when entd writes a page it moves write-back request from todo_list
9984+ * to done_list. This list is used at the end of entd iteration to
9985+ * wakeup requestors and iput inodes.
9986+ */
9987+ struct list_head done_list;
9988+ /* number of elements on the above list */
9989+ int nr_done_reqs;
9990+
9991+#if REISER4_DEBUG
9992+ /* list of all active flushers */
9993+ struct list_head flushers_list;
9994+#endif
9995+} entd_context;
9996+
9997+extern int reiser4_init_entd(struct super_block *);
9998+extern void reiser4_done_entd(struct super_block *);
9999+
10000+extern void reiser4_enter_flush(struct super_block *);
10001+extern void reiser4_leave_flush(struct super_block *);
10002+
10003+extern int write_page_by_ent(struct page *, struct writeback_control *);
10004+extern int wbq_available(void);
10005+extern void ent_writes_page(struct super_block *, struct page *);
10006+
10007+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
10008+/* __ENTD_H__ */
10009+#endif
10010+
10011+/* Make Linus happy.
10012+ Local variables:
10013+ c-indentation-style: "K&R"
10014+ mode-name: "LC"
10015+ c-basic-offset: 8
10016+ tab-width: 8
10017+ fill-column: 120
10018+ End:
10019+*/
10020diff --git a/fs/reiser4/eottl.c b/fs/reiser4/eottl.c
10021new file mode 100644
10022index 0000000..f921b19
10023--- /dev/null
10024+++ b/fs/reiser4/eottl.c
10025@@ -0,0 +1,509 @@
10026+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10027+
10028+#include "forward.h"
10029+#include "debug.h"
10030+#include "key.h"
10031+#include "coord.h"
10032+#include "plugin/item/item.h"
10033+#include "plugin/node/node.h"
10034+#include "znode.h"
10035+#include "block_alloc.h"
10036+#include "tree_walk.h"
10037+#include "tree_mod.h"
10038+#include "carry.h"
10039+#include "tree.h"
10040+#include "super.h"
10041+
10042+#include <linux/types.h> /* for __u?? */
10043+
10044+/*
10045+ * Extents on the twig level (EOTTL) handling.
10046+ *
10047+ * EOTTL poses some problems to the tree traversal, that are better explained
10048+ * by example.
10049+ *
10050+ * Suppose we have block B1 on the twig level with the following items:
10051+ *
10052+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
10053+ * offset)
10054+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
10055+ * 2. internal item I2 with key (10:0:0:0)
10056+ *
10057+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
10058+ * then intra-node lookup is done. This lookup finished on the E1, because the
10059+ * key we are looking for is larger than the key of E1 and is smaller than key
10060+ * the of I2.
10061+ *
10062+ * Here search is stuck.
10063+ *
10064+ * After some thought it is clear what is wrong here: extents on the twig level
10065+ * break some basic property of the *search* tree (on the pretext, that they
10066+ * restore property of balanced tree).
10067+ *
10068+ * Said property is the following: if in the internal node of the search tree
10069+ * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
10070+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
10071+ * through the Pointer.
10072+ *
10073+ * This is not true, when Pointer is Extent-Pointer, simply because extent
10074+ * cannot expand indefinitely to the right to include any item with
10075+ *
10076+ * Key1 <= Key <= Key2.
10077+ *
10078+ * For example, our E1 extent is only responsible for the data with keys
10079+ *
10080+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
10081+ *
10082+ * so, key range
10083+ *
10084+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
10085+ *
10086+ * is orphaned: there is no way to get there from the tree root.
10087+ *
10088+ * In other words, extent pointers are different than normal child pointers as
10089+ * far as search tree is concerned, and this creates such problems.
10090+ *
10091+ * Possible solution for this problem is to insert our item into node pointed
10092+ * to by I2. There are some problems through:
10093+ *
10094+ * (1) I2 can be in a different node.
10095+ * (2) E1 can be immediately followed by another extent E2.
10096+ *
10097+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
10098+ * for locks/coords as necessary.
10099+ *
10100+ * (2) is more complex. Solution here is to insert new empty leaf node and
10101+ * insert internal item between E1 and E2 pointing to said leaf node. This is
10102+ * further complicated by possibility that E2 is in a different node, etc.
10103+ *
10104+ * Problems:
10105+ *
10106+ * (1) if there was internal item I2 immediately on the right of an extent E1
10107+ * and we decided to insert new item S1 into node N2 pointed to by I2, then
10108+ * key of S1 will be less than smallest key in the N2. Normally, search key
10109+ * checks that key we are looking for is in the range of keys covered by the
10110+ * node key is being looked in. To work around of this situation, while
10111+ * preserving useful consistency check new flag CBK_TRUST_DK was added to the
10112+ * cbk flags bitmask. This flag is automatically set on entrance to the
10113+ * coord_by_key() and is only cleared when we are about to enter situation
10114+ * described above.
10115+ *
10116+ * (2) If extent E1 is immediately followed by another extent E2 and we are
10117+ * searching for the key that is between E1 and E2 we only have to insert new
10118+ * empty leaf node when coord_by_key was called for insertion, rather than just
10119+ * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
10120+ * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
10121+ * performed by insert_by_key() and friends.
10122+ *
10123+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
10124+ * case it requires modification of node content which is only possible under
10125+ * write lock. It may well happen that we only have read lock on the node where
10126+ * new internal pointer is to be inserted (common case: lookup of non-existent
10127+ * stat-data that falls between two extents). If only read lock is held, tree
10128+ * traversal is restarted with lock_level modified so that next time we hit
10129+ * this problem, write lock will be held. Once we have write lock, balancing
10130+ * will be performed.
10131+ */
10132+
10133+/**
10134+ * is_next_item_internal - check whether next item is internal
10135+ * @coord: coordinate of extent item in twig node
10136+ * @key: search key
10137+ * @lh: twig node lock handle
10138+ *
10139+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
10140+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
10141+ * to that node, @coord is set to its first unit. If next item is not internal
10142+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
10143+ * is returned if search restart has to be done.
10144+ */
10145+static int
10146+is_next_item_internal(coord_t *coord, const reiser4_key *key,
10147+ lock_handle *lh)
10148+{
10149+ coord_t next;
10150+ lock_handle rn;
10151+ int result;
10152+
10153+ coord_dup(&next, coord);
10154+ if (coord_next_unit(&next) == 0) {
10155+ /* next unit is in this node */
10156+ if (item_is_internal(&next)) {
10157+ coord_dup(coord, &next);
10158+ return 1;
10159+ }
10160+ assert("vs-3", item_is_extent(&next));
10161+ return 0;
10162+ }
10163+
10164+ /*
10165+ * next unit either does not exist or is in right neighbor. If it is in
10166+ * right neighbor we have to check right delimiting key because
10167+ * concurrent thread could get there first and insert item with a key
10168+ * smaller than @key
10169+ */
10170+ read_lock_dk(current_tree);
10171+ result = keycmp(key, znode_get_rd_key(coord->node));
10172+ read_unlock_dk(current_tree);
10173+ assert("vs-6", result != EQUAL_TO);
10174+ if (result == GREATER_THAN)
10175+ return 2;
10176+
10177+ /* lock right neighbor */
10178+ init_lh(&rn);
10179+ result = reiser4_get_right_neighbor(&rn, coord->node,
10180+ znode_is_wlocked(coord->node) ?
10181+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
10182+ GN_CAN_USE_UPPER_LEVELS);
10183+ if (result == -E_NO_NEIGHBOR) {
10184+ /* we are on the rightmost edge of the tree */
10185+ done_lh(&rn);
10186+ return 0;
10187+ }
10188+
10189+ if (result) {
10190+ assert("vs-4", result < 0);
10191+ done_lh(&rn);
10192+ return result;
10193+ }
10194+
10195+ /*
10196+ * check whether concurrent thread managed to insert item with a key
10197+ * smaller than @key
10198+ */
10199+ read_lock_dk(current_tree);
10200+ result = keycmp(key, znode_get_ld_key(rn.node));
10201+ read_unlock_dk(current_tree);
10202+ assert("vs-6", result != EQUAL_TO);
10203+ if (result == GREATER_THAN) {
10204+ done_lh(&rn);
10205+ return 2;
10206+ }
10207+
10208+ result = zload(rn.node);
10209+ if (result) {
10210+ assert("vs-5", result < 0);
10211+ done_lh(&rn);
10212+ return result;
10213+ }
10214+
10215+ coord_init_first_unit(&next, rn.node);
10216+ if (item_is_internal(&next)) {
10217+ /*
10218+ * next unit is in right neighbor and it is an unit of internal
10219+ * item. Unlock coord->node. Move @lh to right neighbor. @coord
10220+ * is set to the first unit of right neighbor.
10221+ */
10222+ coord_dup(coord, &next);
10223+ zrelse(rn.node);
10224+ done_lh(lh);
10225+ move_lh(lh, &rn);
10226+ return 1;
10227+ }
10228+
10229+ /*
10230+ * next unit is unit of extent item. Return without changing @lh and
10231+ * @coord.
10232+ */
10233+ assert("vs-6", item_is_extent(&next));
10234+ zrelse(rn.node);
10235+ done_lh(&rn);
10236+ return 0;
10237+}
10238+
10239+/**
10240+ * rd_key - calculate key of an item next to the given one
10241+ * @coord: position in a node
10242+ * @key: storage for result key
10243+ *
10244+ * @coord is set between items or after the last item in a node. Calculate key
10245+ * of item to the right of @coord.
10246+ */
10247+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
10248+{
10249+ coord_t dup;
10250+
10251+ assert("nikita-2281", coord_is_between_items(coord));
10252+ coord_dup(&dup, coord);
10253+
10254+ if (coord_set_to_right(&dup) == 0)
10255+ /* next item is in this node. Return its key. */
10256+ unit_key_by_coord(&dup, key);
10257+ else {
10258+ /*
10259+ * next item either does not exist or is in right
10260+ * neighbor. Return znode's right delimiting key.
10261+ */
10262+ read_lock_dk(current_tree);
10263+ *key = *znode_get_rd_key(coord->node);
10264+ read_unlock_dk(current_tree);
10265+ }
10266+ return key;
10267+}
10268+
10269+/**
10270+ * add_empty_leaf - insert empty leaf between two extents
10271+ * @insert_coord: position in twig node between two extents
10272+ * @lh: twig node lock handle
10273+ * @key: left delimiting key of new node
10274+ * @rdkey: right delimiting key of new node
10275+ *
10276+ * Inserts empty leaf node between two extent items. It is necessary when we
10277+ * have to insert an item on leaf level between two extents (items on the twig
10278+ * level).
10279+ */
10280+static int
10281+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
10282+ const reiser4_key *key, const reiser4_key *rdkey)
10283+{
10284+ int result;
10285+ carry_pool *pool;
10286+ carry_level *todo;
10287+ reiser4_item_data *item;
10288+ carry_insert_data *cdata;
10289+ carry_op *op;
10290+ znode *node;
10291+ reiser4_tree *tree;
10292+
10293+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
10294+ tree = znode_get_tree(insert_coord->node);
10295+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
10296+ if (IS_ERR(node))
10297+ return PTR_ERR(node);
10298+
10299+ /* setup delimiting keys for node being inserted */
10300+ write_lock_dk(tree);
10301+ znode_set_ld_key(node, key);
10302+ znode_set_rd_key(node, rdkey);
10303+ ON_DEBUG(node->creator = current);
10304+ ON_DEBUG(node->first_key = *key);
10305+ write_unlock_dk(tree);
10306+
10307+ ZF_SET(node, JNODE_ORPHAN);
10308+
10309+ /*
10310+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
10311+ * carry_insert_data
10312+ */
10313+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
10314+ sizeof(*item) + sizeof(*cdata));
10315+ if (IS_ERR(pool))
10316+ return PTR_ERR(pool);
10317+ todo = (carry_level *) (pool + 1);
10318+ init_carry_level(todo, pool);
10319+
10320+ item = (reiser4_item_data *) (todo + 3);
10321+ cdata = (carry_insert_data *) (item + 1);
10322+
10323+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
10324+ if (!IS_ERR(op)) {
10325+ cdata->coord = insert_coord;
10326+ cdata->key = key;
10327+ cdata->data = item;
10328+ op->u.insert.d = cdata;
10329+ op->u.insert.type = COPT_ITEM_DATA;
10330+ build_child_ptr_data(node, item);
10331+ item->arg = NULL;
10332+ /* have @insert_coord to be set at inserted item after
10333+ insertion is done */
10334+ todo->track_type = CARRY_TRACK_CHANGE;
10335+ todo->tracked = lh;
10336+
10337+ result = reiser4_carry(todo, NULL);
10338+ if (result == 0) {
10339+ /*
10340+ * pin node in memory. This is necessary for
10341+ * znode_make_dirty() below.
10342+ */
10343+ result = zload(node);
10344+ if (result == 0) {
10345+ lock_handle local_lh;
10346+
10347+ /*
10348+ * if we inserted new child into tree we have
10349+ * to mark it dirty so that flush will be able
10350+ * to process it.
10351+ */
10352+ init_lh(&local_lh);
10353+ result = longterm_lock_znode(&local_lh, node,
10354+ ZNODE_WRITE_LOCK,
10355+ ZNODE_LOCK_LOPRI);
10356+ if (result == 0) {
10357+ znode_make_dirty(node);
10358+
10359+ /*
10360+ * when internal item pointing to @node
10361+ * was inserted into twig node
10362+ * create_hook_internal did not connect
10363+ * it properly because its right
10364+ * neighbor was not known. Do it
10365+ * here
10366+ */
10367+ write_lock_tree(tree);
10368+ assert("nikita-3312",
10369+ znode_is_right_connected(node));
10370+ assert("nikita-2984",
10371+ node->right == NULL);
10372+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
10373+ write_unlock_tree(tree);
10374+ result =
10375+ connect_znode(insert_coord, node);
10376+ ON_DEBUG(if (result == 0) check_dkeys(node););
10377+
10378+ done_lh(lh);
10379+ move_lh(lh, &local_lh);
10380+ assert("vs-1676", node_is_empty(node));
10381+ coord_init_first_unit(insert_coord,
10382+ node);
10383+ } else {
10384+ warning("nikita-3136",
10385+ "Cannot lock child");
10386+ }
10387+ done_lh(&local_lh);
10388+ zrelse(node);
10389+ }
10390+ }
10391+ } else
10392+ result = PTR_ERR(op);
10393+ zput(node);
10394+ done_carry_pool(pool);
10395+ return result;
10396+}
10397+
10398+/**
10399+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
10400+ * @h: search handle
10401+ * @outcome: flag saying whether search has to restart or is done
10402+ *
10403+ * Handles search on twig level. If this function completes search itself then
10404+ * it returns 1. If search has to go one level down then 0 is returned. If
10405+ * error happens then LOOKUP_DONE is returned via @outcome and error code is saved
10406+ * in @h->result.
10407+ */
10408+int handle_eottl(cbk_handle *h, int *outcome)
10409+{
10410+ int result;
10411+ reiser4_key key;
10412+ coord_t *coord;
10413+
10414+ coord = h->coord;
10415+
10416+ if (h->level != TWIG_LEVEL ||
10417+ (coord_is_existing_item(coord) && item_is_internal(coord))) {
10418+ /* Continue to traverse tree downward. */
10419+ return 0;
10420+ }
10421+
10422+ /*
10423+ * make sure that @h->coord is set to twig node and that it is either
10424+ * set to extent item or after extent item
10425+ */
10426+ assert("vs-356", h->level == TWIG_LEVEL);
10427+ assert("vs-357", ( {
10428+ coord_t lcoord;
10429+ coord_dup(&lcoord, coord);
10430+ check_me("vs-733", coord_set_to_left(&lcoord) == 0);
10431+ item_is_extent(&lcoord);
10432+ }
10433+ ));
10434+
10435+ if (*outcome == NS_FOUND) {
10436+ /* we have found desired key on twig level in extent item */
10437+ h->result = CBK_COORD_FOUND;
10438+ *outcome = LOOKUP_DONE;
10439+ return 1;
10440+ }
10441+
10442+ if (!(h->flags & CBK_FOR_INSERT)) {
10443+ /* tree traversal is not for insertion. Just return
10444+ CBK_COORD_NOTFOUND. */
10445+ h->result = CBK_COORD_NOTFOUND;
10446+ *outcome = LOOKUP_DONE;
10447+ return 1;
10448+ }
10449+
10450+ /* take a look at the item to the right of h -> coord */
10451+ result = is_next_item_internal(coord, h->key, h->active_lh);
10452+ if (unlikely(result < 0)) {
10453+ h->error = "get_right_neighbor failed";
10454+ h->result = result;
10455+ *outcome = LOOKUP_DONE;
10456+ return 1;
10457+ }
10458+ if (result == 0) {
10459+ /*
10460+ * item to the right is also an extent one. Allocate a new node
10461+ * and insert pointer to it after item h -> coord.
10462+ *
10463+ * This is a result of extents being located at the twig
10464+ * level. For explanation, see comment just above
10465+ * is_next_item_internal().
10466+ */
10467+ znode *loaded;
10468+
10469+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10470+ /*
10471+ * we got node read locked, restart coord_by_key to
10472+ * have write lock on twig level
10473+ */
10474+ h->lock_level = TWIG_LEVEL;
10475+ h->lock_mode = ZNODE_WRITE_LOCK;
10476+ *outcome = LOOKUP_REST;
10477+ return 1;
10478+ }
10479+
10480+ loaded = coord->node;
10481+ result =
10482+ add_empty_leaf(coord, h->active_lh, h->key,
10483+ rd_key(coord, &key));
10484+ if (result) {
10485+ h->error = "could not add empty leaf";
10486+ h->result = result;
10487+ *outcome = LOOKUP_DONE;
10488+ return 1;
10489+ }
10490+ /* added empty leaf is locked (h->active_lh), its parent node
10491+ is unlocked, h->coord is set as EMPTY */
10492+ assert("vs-13", coord->between == EMPTY_NODE);
10493+ assert("vs-14", znode_is_write_locked(coord->node));
10494+ assert("vs-15",
10495+ WITH_DATA(coord->node, node_is_empty(coord->node)));
10496+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10497+ assert("vs-17", coord->node == h->active_lh->node);
10498+ *outcome = LOOKUP_DONE;
10499+ h->result = CBK_COORD_NOTFOUND;
10500+ return 1;
10501+ } else if (result == 1) {
10502+ /*
10503+ * this is special case mentioned in the comment on
10504+ * tree.h:cbk_flags. We have found internal item immediately on
10505+ * the right of extent, and we are going to insert new item
10506+ * there. Key of item we are going to insert is smaller than
10507+ * leftmost key in the node pointed to by said internal item
10508+ * (otherwise search wouldn't come to the extent in the first
10509+ * place).
10510+ *
10511+ * This is a result of extents being located at the twig
10512+ * level. For explanation, see comment just above
10513+ * is_next_item_internal().
10514+ */
10515+ h->flags &= ~CBK_TRUST_DK;
10516+ } else {
10517+ assert("vs-8", result == 2);
10518+ *outcome = LOOKUP_REST;
10519+ return 1;
10520+ }
10521+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10522+ return 0;
10523+}
10524+
10525+/*
10526+ * Local variables:
10527+ * c-indentation-style: "K&R"
10528+ * mode-name: "LC"
10529+ * c-basic-offset: 8
10530+ * tab-width: 8
10531+ * fill-column: 120
10532+ * scroll-step: 1
10533+ * End:
10534+ */
10535diff --git a/fs/reiser4/estimate.c b/fs/reiser4/estimate.c
10536new file mode 100644
10537index 0000000..656c20b
10538--- /dev/null
10539+++ b/fs/reiser4/estimate.c
10540@@ -0,0 +1,120 @@
10541+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10542+
10543+#include "debug.h"
10544+#include "dformat.h"
10545+#include "tree.h"
10546+#include "carry.h"
10547+#include "inode.h"
10548+#include "plugin/cluster.h"
10549+#include "plugin/item/ctail.h"
10550+
10551+/* this returns how many nodes might get dirty and added nodes if @children nodes are dirtied
10552+
10553+ Amount of internals which will get dirty or get allocated we estimate as 5% of the childs + 1 balancing. 1 balancing
10554+ is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1
10555+ neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for
10556+ leaf level, 3 for twig level, 2 on upper + 1 for root.
10557+
10558+ Do not calculate the current node of the lowest level here - this is overhead only.
10559+
10560+ children is almost always 1 here. Exception is flow insertion
10561+*/
10562+static reiser4_block_nr
10563+max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
10564+{
10565+ reiser4_block_nr ten_percent;
10566+
10567+ ten_percent = ((103 * childen) >> 10);
10568+
10569+ /* If we have too many balancings at the time, tree height can raise on more
10570+ then 1. Assume that if tree_height is 5, it can raise on 1 only. */
10571+ return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10572+}
10573+
10574+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10575+ perform insertion of one item into the tree */
10576+/* it is only called when tree height changes, or gets initialized */
10577+reiser4_block_nr calc_estimate_one_insert(tree_level height)
10578+{
10579+ return 1 + max_balance_overhead(1, height);
10580+}
10581+
10582+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10583+{
10584+ return tree->estimate_one_insert;
10585+}
10586+
10587+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10588+ perform insertion of one unit into an item in the tree */
10589+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10590+{
10591+ /* estimate insert into item just like item insertion */
10592+ return tree->estimate_one_insert;
10593+}
10594+
10595+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10596+{
10597+ /* on item removal reiser4 does not try to pack nodes more complact, so, only one node may be dirtied on leaf
10598+ level */
10599+ return tree->estimate_one_insert;
10600+}
10601+
10602+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10603+ both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10604+ levels */
10605+reiser4_block_nr estimate_insert_flow(tree_level height)
10606+{
10607+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10608+ CARRY_FLOW_NEW_NODES_LIMIT,
10609+ height);
10610+}
10611+
10612+/* returnes max number of nodes can be occupied by disk cluster */
10613+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10614+{
10615+ int per_cluster;
10616+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10617+ return 3 + per_cluster +
10618+ max_balance_overhead(3 + per_cluster,
10619+ REISER4_MAX_ZTREE_HEIGHT);
10620+}
10621+
10622+/* how many nodes might get dirty and added
10623+ during insertion of a disk cluster */
10624+reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10625+{
10626+ return estimate_cluster(inode, 1); /* 24 */
10627+}
10628+
10629+/* how many nodes might get dirty and added
10630+ during update of a (prepped or unprepped) disk cluster */
10631+reiser4_block_nr estimate_update_cluster(struct inode * inode)
10632+{
10633+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10634+}
10635+
10636+/* How many nodes occupied by a disk cluster might get dirty.
10637+ Note that this estimation is not precise (i.e. disk cluster
10638+ can occupy more nodes).
10639+ Q: Why we don't use precise estimation?
10640+ A: 1.Because precise estimation is fairly bad: 65536 nodes
10641+ for 64K logical cluster, it means 256M of dead space on
10642+ a partition
10643+ 2.It is a very rare case when disk cluster occupies more
10644+ nodes then this estimation returns.
10645+*/
10646+reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10647+{
10648+ return cluster_nrpages(inode) + 4;
10649+}
10650+
10651+/* Make Linus happy.
10652+ Local variables:
10653+ c-indentation-style: "K&R"
10654+ mode-name: "LC"
10655+ c-basic-offset: 8
10656+ tab-width: 8
10657+ fill-column: 120
10658+ scroll-step: 1
10659+ End:
10660+*/
10661diff --git a/fs/reiser4/export_ops.c b/fs/reiser4/export_ops.c
10662new file mode 100644
10663index 0000000..b75afe7
10664--- /dev/null
10665+++ b/fs/reiser4/export_ops.c
10666@@ -0,0 +1,295 @@
10667+/* Copyright 2005 by Hans Reiser, licensing governed by
10668+ * reiser4/README */
10669+
10670+#include "inode.h"
10671+#include "plugin/plugin.h"
10672+
10673+/*
10674+ * Supported file-handle types
10675+ */
10676+typedef enum {
10677+ FH_WITH_PARENT = 0x10, /* file handle with parent */
10678+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10679+} reiser4_fhtype;
10680+
10681+#define NFSERROR (255)
10682+
10683+/* initialize place-holder for object */
10684+static void object_on_wire_init(reiser4_object_on_wire *o)
10685+{
10686+ o->plugin = NULL;
10687+}
10688+
10689+/* finish with @o */
10690+static void object_on_wire_done(reiser4_object_on_wire *o)
10691+{
10692+ if (o->plugin != NULL)
10693+ o->plugin->wire.done(o);
10694+}
10695+
10696+/*
10697+ * read serialized object identity from @addr and store information about
10698+ * object in @obj. This is dual to encode_inode().
10699+ */
10700+static char *decode_inode(struct super_block *s, char *addr,
10701+ reiser4_object_on_wire * obj)
10702+{
10703+ file_plugin *fplug;
10704+
10705+ /* identifier of object plugin is stored in the first two bytes,
10706+ * followed by... */
10707+ fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10708+ if (fplug != NULL) {
10709+ addr += sizeof(d16);
10710+ obj->plugin = fplug;
10711+ assert("nikita-3520", fplug->wire.read != NULL);
10712+ /* plugin specific encoding of object identity. */
10713+ addr = fplug->wire.read(addr, obj);
10714+ } else
10715+ addr = ERR_PTR(RETERR(-EINVAL));
10716+ return addr;
10717+}
10718+
10719+/**
10720+ * reiser4_decode_fh - decode_fh of export operations
10721+ * @super: super block
10722+ * @fh: nfsd file handle
10723+ * @len: length of file handle
10724+ * @fhtype: type of file handle
10725+ * @acceptable: acceptability testing function
10726+ * @context: argument for @acceptable
10727+ *
10728+ * Returns dentry referring to the same file as @fh.
10729+ */
10730+static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10731+ int len, int fhtype,
10732+ int (*acceptable) (void *context,
10733+ struct dentry *de),
10734+ void *context)
10735+{
10736+ reiser4_context *ctx;
10737+ reiser4_object_on_wire object;
10738+ reiser4_object_on_wire parent;
10739+ char *addr;
10740+ int with_parent;
10741+
10742+ ctx = reiser4_init_context(super);
10743+ if (IS_ERR(ctx))
10744+ return (struct dentry *)ctx;
10745+
10746+ assert("vs-1482",
10747+ fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10748+
10749+ with_parent = (fhtype == FH_WITH_PARENT);
10750+
10751+ addr = (char *)fh;
10752+
10753+ object_on_wire_init(&object);
10754+ object_on_wire_init(&parent);
10755+
10756+ addr = decode_inode(super, addr, &object);
10757+ if (!IS_ERR(addr)) {
10758+ if (with_parent)
10759+ addr = decode_inode(super, addr, &parent);
10760+ if (!IS_ERR(addr)) {
10761+ struct dentry *d;
10762+ typeof(super->s_export_op->find_exported_dentry) fn;
10763+
10764+ fn = super->s_export_op->find_exported_dentry;
10765+ assert("nikita-3521", fn != NULL);
10766+ d = fn(super, &object, with_parent ? &parent : NULL,
10767+ acceptable, context);
10768+ if (d != NULL && !IS_ERR(d))
10769+ /* FIXME check for -ENOMEM */
10770+ reiser4_get_dentry_fsdata(d)->stateless = 1;
10771+ addr = (char *)d;
10772+ }
10773+ }
10774+
10775+ object_on_wire_done(&object);
10776+ object_on_wire_done(&parent);
10777+
10778+ reiser4_exit_context(ctx);
10779+ return (void *)addr;
10780+}
10781+
10782+/*
10783+ * Object serialization support.
10784+ *
10785+ * To support knfsd file system provides export_operations that are used to
10786+ * construct and interpret NFS file handles. As a generalization of this,
10787+ * reiser4 object plugins have serialization support: it provides methods to
10788+ * create on-wire representation of identity of reiser4 object, and
10789+ * re-create/locate object given its on-wire identity.
10790+ *
10791+ */
10792+
10793+/*
10794+ * return number of bytes that on-wire representation of @inode's identity
10795+ * consumes.
10796+ */
10797+static int encode_inode_size(struct inode *inode)
10798+{
10799+ assert("nikita-3514", inode != NULL);
10800+ assert("nikita-3515", inode_file_plugin(inode) != NULL);
10801+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10802+
10803+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10804+}
10805+
10806+/*
10807+ * store on-wire representation of @inode's identity at the area beginning at
10808+ * @start.
10809+ */
10810+static char *encode_inode(struct inode *inode, char *start)
10811+{
10812+ assert("nikita-3517", inode != NULL);
10813+ assert("nikita-3518", inode_file_plugin(inode) != NULL);
10814+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10815+
10816+ /*
10817+ * first, store two-byte identifier of object plugin, then
10818+ */
10819+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10820+ (d16 *) start);
10821+ start += sizeof(d16);
10822+ /*
10823+ * call plugin to serialize object's identity
10824+ */
10825+ return inode_file_plugin(inode)->wire.write(inode, start);
10826+}
10827+
10828+/* this returns number of 32 bit long numbers encoded in @lenp. 255 is
10829+ * returned if file handle can not be stored */
10830+/**
10831+ * reiser4_encode_fh - encode_fh of export operations
10832+ * @dentry:
10833+ * @fh:
10834+ * @lenp:
10835+ * @need_parent:
10836+ *
10837+ */
10838+static int
10839+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10840+ int need_parent)
10841+{
10842+ struct inode *inode;
10843+ struct inode *parent;
10844+ char *addr;
10845+ int need;
10846+ int delta;
10847+ int result;
10848+ reiser4_context *ctx;
10849+
10850+ /*
10851+ * knfsd asks as to serialize object in @dentry, and, optionally its
10852+ * parent (if need_parent != 0).
10853+ *
10854+ * encode_inode() and encode_inode_size() is used to build
10855+ * representation of object and its parent. All hard work is done by
10856+ * object plugins.
10857+ */
10858+ inode = dentry->d_inode;
10859+ parent = dentry->d_parent->d_inode;
10860+
10861+ addr = (char *)fh;
10862+
10863+ need = encode_inode_size(inode);
10864+ if (need < 0)
10865+ return NFSERROR;
10866+ if (need_parent) {
10867+ delta = encode_inode_size(parent);
10868+ if (delta < 0)
10869+ return NFSERROR;
10870+ need += delta;
10871+ }
10872+
10873+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
10874+ if (IS_ERR(ctx))
10875+ return PTR_ERR(ctx);
10876+
10877+ if (need <= sizeof(__u32) * (*lenp)) {
10878+ addr = encode_inode(inode, addr);
10879+ if (need_parent)
10880+ addr = encode_inode(parent, addr);
10881+
10882+ /* store in lenp number of 32bit words required for file
10883+ * handle. */
10884+ *lenp = (need + sizeof(__u32) - 1) >> 2;
10885+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10886+ } else
10887+ /* no enough space in file handle */
10888+ result = NFSERROR;
10889+ reiser4_exit_context(ctx);
10890+ return result;
10891+}
10892+
10893+/**
10894+ * reiser4_get_dentry_parent - get_parent of export operations
10895+ * @child:
10896+ *
10897+ */
10898+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10899+{
10900+ struct inode *dir;
10901+ dir_plugin *dplug;
10902+
10903+ assert("nikita-3527", child != NULL);
10904+ /* see comment in reiser4_get_dentry() about following assertion */
10905+ assert("nikita-3528", is_in_reiser4_context());
10906+
10907+ dir = child->d_inode;
10908+ assert("nikita-3529", dir != NULL);
10909+ dplug = inode_dir_plugin(dir);
10910+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10911+ if (dplug != NULL)
10912+ return dplug->get_parent(dir);
10913+ else
10914+ return ERR_PTR(RETERR(-ENOTDIR));
10915+}
10916+
10917+/**
10918+ * reiser4_get_dentry - get_dentry of export operations
10919+ * @super:
10920+ * @data:
10921+ *
10922+ *
10923+ */
10924+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10925+{
10926+ reiser4_object_on_wire *o;
10927+
10928+ assert("nikita-3522", super != NULL);
10929+ assert("nikita-3523", data != NULL);
10930+ /*
10931+ * this is only supposed to be called by
10932+ *
10933+ * reiser4_decode_fh->find_exported_dentry
10934+ *
10935+ * so, reiser4_context should be here already.
10936+ */
10937+ assert("nikita-3526", is_in_reiser4_context());
10938+
10939+ o = (reiser4_object_on_wire *)data;
10940+ assert("nikita-3524", o->plugin != NULL);
10941+ assert("nikita-3525", o->plugin->wire.get != NULL);
10942+
10943+ return o->plugin->wire.get(super, o);
10944+}
10945+
10946+struct export_operations reiser4_export_operations = {
10947+ .encode_fh = reiser4_encode_fh,
10948+ .decode_fh = reiser4_decode_fh,
10949+ .get_parent = reiser4_get_dentry_parent,
10950+ .get_dentry = reiser4_get_dentry
10951+};
10952+
10953+/*
10954+ * Local variables:
10955+ * c-indentation-style: "K&R"
10956+ * mode-name: "LC"
10957+ * c-basic-offset: 8
10958+ * tab-width: 8
10959+ * fill-column: 79
10960+ * End:
10961+ */
10962diff --git a/fs/reiser4/flush.c b/fs/reiser4/flush.c
10963new file mode 100644
10964index 0000000..49b6ca5
10965--- /dev/null
10966+++ b/fs/reiser4/flush.c
10967@@ -0,0 +1,3622 @@
10968+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10969+
10970+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10971+
10972+#include "forward.h"
10973+#include "debug.h"
10974+#include "dformat.h"
10975+#include "key.h"
10976+#include "coord.h"
10977+#include "plugin/item/item.h"
10978+#include "plugin/plugin.h"
10979+#include "plugin/object.h"
10980+#include "txnmgr.h"
10981+#include "jnode.h"
10982+#include "znode.h"
10983+#include "block_alloc.h"
10984+#include "tree_walk.h"
10985+#include "carry.h"
10986+#include "tree.h"
10987+#include "vfs_ops.h"
10988+#include "inode.h"
10989+#include "page_cache.h"
10990+#include "wander.h"
10991+#include "super.h"
10992+#include "entd.h"
10993+#include "reiser4.h"
10994+#include "flush.h"
10995+#include "writeout.h"
10996+
10997+#include <asm/atomic.h>
10998+#include <linux/fs.h> /* for struct super_block */
10999+#include <linux/mm.h> /* for struct page */
11000+#include <linux/bio.h> /* for struct bio */
11001+#include <linux/pagemap.h>
11002+#include <linux/blkdev.h>
11003+
11004+/* IMPLEMENTATION NOTES */
11005+
11006+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
11007+ order to the nodes of the tree in which the parent is placed before its children, which
11008+ are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
11009+ describes the node that "came before in forward parent-first order". When we speak of a
11010+ "parent-first follower", it describes the node that "comes next in parent-first
11011+ order" (alternatively the node that "came before in reverse parent-first order").
11012+
11013+ The following pseudo-code prints the nodes of a tree in forward parent-first order:
11014+
11015+ void parent_first (node)
11016+ {
11017+ print_node (node);
11018+ if (node->level > leaf) {
11019+ for (i = 0; i < num_children; i += 1) {
11020+ parent_first (node->child[i]);
11021+ }
11022+ }
11023+ }
11024+*/
11025+
11026+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
11027+ that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
11028+ can be accomplished with sequential reads, which results in reading nodes in their
11029+ parent-first order. This is a read-optimization aspect of the flush algorithm, and
11030+ there is also a write-optimization aspect, which is that we wish to make large
11031+ sequential writes to the disk by allocating or reallocating blocks so that they can be
11032+ written in sequence. Sometimes the read-optimization and write-optimization goals
11033+ conflict with each other, as we discuss in more detail below.
11034+*/
11035+
11036+/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
11037+ the relevant jnode->state bits and their relevence to flush:
11038+
11039+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
11040+ must be allocated first. In order to be considered allocated, the jnode must have
11041+ exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
11042+ all dirtied jnodes eventually have one of these bits set during each transaction.
11043+
11044+ JNODE_CREATED: The node was freshly created in its transaction and has no previous
11045+ block address, so it is unconditionally assigned to be relocated, although this is
11046+ mainly for code-convenience. It is not being 'relocated' from anything, but in
11047+ almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
11048+ remains set even after JNODE_RELOC is set, so the actual relocate can be
11049+ distinguished from the created-and-allocated set easily: relocate-set members
11050+ (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
11051+ have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
11052+
11053+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
11054+ decision to maintain the pre-existing location for this node and it will be written
11055+ to the wandered-log.
11056+
11057+ JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
11058+ not created, see note above). A block with JNODE_RELOC set is eligible for
11059+ early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
11060+ bit is set on a znode, the parent node's internal item is modified and the znode is
11061+ rehashed.
11062+
11063+ JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
11064+ and calls plugin->f.squeeze() method for its items. By this technology we update disk
11065+ clusters of cryptcompress objects. Also if leftmost point that was found by flush scan
11066+ has this flag (races with write(), rare case) the flush algorythm makes the decision
11067+ to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
11068+ repeated allocation.
11069+
11070+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
11071+ flush queue. This means the jnode is not on any clean or dirty list, instead it is
11072+ moved to one of the flush queue (see flush_queue.h) object private list. This
11073+ prevents multiple concurrent flushes from attempting to start flushing from the
11074+ same node.
11075+
11076+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
11077+ squeeze-and-allocate on a node while its children are actively being squeezed and
11078+ allocated. This flag was created to avoid submitting a write request for a node
11079+ while its children are still being allocated and squeezed. Then flush queue was
11080+ re-implemented to allow unlimited number of nodes be queued. This flag support was
11081+ commented out in source code because we decided that there was no reason to submit
11082+ queued nodes before jnode_flush() finishes. However, current code calls fq_write()
11083+ during a slum traversal and may submit "busy nodes" to disk. Probably we can
11084+ re-enable the JNODE_FLUSH_BUSY bit support in future.
11085+
11086+ With these state bits, we describe a test used frequently in the code below,
11087+ jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
11088+ test for "flushprepped" returns true if any of the following are true:
11089+
11090+ - The node is not dirty
11091+ - The node has JNODE_RELOC set
11092+ - The node has JNODE_OVRWR set
11093+
11094+ If either the node is not dirty or it has already been processed by flush (and assigned
11095+ JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
11096+ true then flush has work to do on that node.
11097+*/
11098+
11099+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
11100+ flushprepped twice (unless an explicit call to flush_unprep is made as described in
11101+ detail below). For example a node is dirtied, allocated, and then early-flushed to
11102+ disk and set clean. Before the transaction commits, the page is dirtied again and, due
11103+ to memory pressure, the node is flushed again. The flush algorithm will not relocate
11104+ the node to a new disk location, it will simply write it to the same, previously
11105+ relocated position again.
11106+*/
11107+
11108+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
11109+ start at a leaf node and allocate in parent-first order by iterating to the right. At
11110+ each step of the iteration, we check for the right neighbor. Before advancing to the
11111+ right neighbor, we check if the current position and the right neighbor share the same
11112+ parent. If they do not share the same parent, the parent is allocated before the right
11113+ neighbor.
11114+
11115+ This process goes recursively up the tree and squeeze nodes level by level as long as
11116+ the right neighbor and the current position have different parents, then it allocates
11117+ the right-neighbors-with-different-parents on the way back down. This process is
11118+ described in more detail in flush_squalloc_changed_ancestor and the recursive function
11119+ squalloc_one_changed_ancestor. But the purpose here is not to discuss the
11120+ specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
11121+ approaches.
11122+
11123+ The top-down algorithm was implemented earlier (April-May 2002). In the top-down
11124+ approach, we find a starting point by scanning left along each level past dirty nodes,
11125+ then going up and repeating the process until the left node and the parent node are
11126+ clean. We then perform a parent-first traversal from the starting point, which makes
11127+ allocating in parent-first order trivial. After one subtree has been allocated in this
11128+ manner, we move to the right, try moving upward, then repeat the parent-first
11129+ traversal.
11130+
11131+ Both approaches have problems that need to be addressed. Both are approximately the
11132+ same amount of code, but the bottom-up approach has advantages in the order it acquires
11133+ locks which, at the very least, make it the better approach. At first glance each one
11134+ makes the other one look simpler, so it is important to remember a few of the problems
11135+ with each one.
11136+
11137+ Main problem with the top-down approach: When you encounter a clean child during the
11138+ parent-first traversal, what do you do? You would like to avoid searching through a
11139+ large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
11140+ obvious solution. One of the advantages of the top-down approach is that during the
11141+ parent-first traversal you check every child of a parent to see if it is dirty. In
11142+ this way, the top-down approach easily handles the main problem of the bottom-up
11143+ approach: unallocated children.
11144+
11145+ The unallocated children problem is that before writing a node to disk we must make
11146+ sure that all of its children are allocated. Otherwise, the writing the node means
11147+ extra I/O because the node will have to be written again when the child is finally
11148+ allocated.
11149+
11150+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
11151+ should not cause any file system corruption, it only degrades I/O performance because a
11152+ node may be written when it is sure to be written at least one more time in the same
11153+ transaction when the remaining children are allocated. What follows is a description
11154+ of how we will solve the problem.
11155+*/
11156+
11157+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
11158+ proceeding in parent first order, allocate some of its left-children, then encounter a
11159+ clean child in the middle of the parent. We do not allocate the clean child, but there
11160+ may remain unallocated (dirty) children to the right of the clean child. If we were to
11161+ stop flushing at this moment and write everything to disk, the parent might still
11162+ contain unallocated children.
11163+
11164+ We could try to allocate all the descendents of every node that we allocate, but this
11165+ is not necessary. Doing so could result in allocating the entire tree: if the root
11166+ node is allocated then every unallocated node would have to be allocated before
11167+ flushing. Actually, we do not have to write a node just because we allocate it. It is
11168+ possible to allocate but not write a node during flush, when it still has unallocated
11169+ children. However, this approach is probably not optimal for the following reason.
11170+
11171+ The flush algorithm is designed to allocate nodes in parent-first order in an attempt
11172+ to optimize reads that occur in the same order. Thus we are read-optimizing for a
11173+ left-to-right scan through all the leaves in the system, and we are hoping to
11174+ write-optimize at the same time because those nodes will be written together in batch.
11175+ What happens, however, if we assign a block number to a node in its read-optimized
11176+ order but then avoid writing it because it has unallocated children? In that
11177+ situation, we lose out on the write-optimization aspect because a node will have to be
11178+   written again to its location on the device, later, which likely means seeking back
11179+ to that location.
11180+
11181+ So there are tradeoffs. We can choose either:
11182+
11183+ A. Allocate all unallocated children to preserve both write-optimization and
11184+ read-optimization, but this is not always desirable because it may mean having to
11185+ allocate and flush very many nodes at once.
11186+
11187+ B. Defer writing nodes with unallocated children, keep their read-optimized locations,
11188+ but sacrifice write-optimization because those nodes will be written again.
11189+
11190+ C. Defer writing nodes with unallocated children, but do not keep their read-optimized
11191+ locations. Instead, choose to write-optimize them later, when they are written. To
11192+ facilitate this, we "undo" the read-optimized allocation that was given to the node so
11193+ that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
11194+ case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
11195+ call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
11196+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
11197+ location, and set the JNODE_CREATED bit, effectively setting the node back to an
11198+ unallocated state.
11199+
11200+ We will take the following approach in v4.0: for twig nodes we will always finish
11201+ allocating unallocated children (A). For nodes with (level > TWIG) we will defer
11202+ writing and choose write-optimization (C).
11203+
11204+ To summarize, there are several parts to a solution that avoids the problem with
11205+ unallocated children:
11206+
11207+   FIXME-ZAM: None of these approaches has been implemented yet to eliminate the
11208+   "UNALLOCATED CHILDREN" problem, because an experiment showed that we have only 1-2
11209+   nodes with unallocated children for thousands of written nodes. The experiment was
11210+   simple, like copying / deleting the Linux kernel sources. However, the problem can
11211+   arise in more complex tests. I think we can use jnode_io_hook to insert a check for
11212+   unallocated children and see what kind of problem we have.
11213+
11214+ 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
11215+ squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
11216+ implement: should be simple -- amounts to adding a while loop to jnode_flush, see
11217+ comments in that function.
11218+
11219+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
11220+ have unallocated children. If the twig level has unallocated children it is an
11221+ assertion failure. If a higher-level node has unallocated children, then it should be
11222+ explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
11223+ should be simple.
11224+
11225+ 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
11226+ CPU cycles than we would like, and it is possible (but medium complexity) to optimize
11227+ this somewhat in the case where large sub-trees are flushed. The following observation
11228+ helps: if both the left- and right-neighbor of a node are processed by the flush
11229+ algorithm then the node itself is guaranteed to have all of its children allocated.
11230+ However, the cost of this check may not be so expensive after all: it is not needed for
11231+ leaves and flush can guarantee this property for twigs. That leaves only (level >
11232+ TWIG) nodes that have to be checked, so this optimization only helps if at least three
11233+ (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
11234+ there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
11235+ then the number of blocks being written will be very large, so the savings may be
11236+ insignificant. That said, the idea is to maintain both the left and right edges of
11237+ nodes that are processed in flush. When flush_empty_queue() is called, a relatively
11238+ simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
11239+ edge, the slow check is necessary, but if it is in the interior then it can be assumed
11240+ to have all of its children allocated. FIXME: medium complexity to implement, but
11241+ simple to verify given that we must have a slow check anyway.
11242+
11243+ 4. (Optional) This part is optional, not for v4.0--flush should work independently of
11244+ whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
11245+ left-scan operation to take unallocated children into account. Normally, the left-scan
11246+ operation goes left as long as adjacent nodes are dirty up until some large maximum
11247+ value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
11248+ may stop at a position where there are unallocated children to the left with the same
11249+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
11250+   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
11251+ with a rapid scan. The rapid scan skips all the interior children of a node--if the
11252+ leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
11253+ twig to the left). If the left neighbor of the leftmost child is also dirty, then
11254+ continue the scan at the left twig and repeat. This option will cause flush to
11255+ allocate more twigs in a single pass, but it also has the potential to write many more
11256+ nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
11257+ was partially implemented, code removed August 12, 2002 by JMACD.
11258+*/
11259+
11260+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
11261+ starting point for flush is a leaf node, but actually the flush code cares very little
11262+ about whether or not this is true. It is possible that all the leaf nodes are flushed
11263+ and dirty parent nodes still remain, in which case jnode_flush() is called on a
11264+ non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
11265+ leaf, even when it is not. This is a simple approach, and there may be a more optimal
11266+ policy but until a problem with this approach is discovered, simplest is probably best.
11267+
11268+ NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
11269+ the leaves. This is done as a matter of simplicity and there is only one (shaky)
11270+ justification. When an atom commits, it flushes all leaf level nodes first, followed
11271+ by twigs, and so on. With flushing done in this order, if flush is eventually called
11272+ on a non-leaf node it means that (somehow) we reached a point where all leaves are
11273+   clean and only internal nodes need to be flushed. If that is the case, then it means
11274+ there were no leaves that were the parent-first preceder/follower of the parent. This
11275+ is expected to be a rare case, which is why we do nothing special about it. However,
11276+ memory pressure may pass an internal node to flush when there are still dirty leaf
11277+ nodes that need to be flushed, which could prove our original assumptions
11278+ "inoperative". If this needs to be fixed, then scan_left/right should have
11279+ special checks for the non-leaf levels. For example, instead of passing from a node to
11280+ the left neighbor, it should pass from the node to the left neighbor's rightmost
11281+   descendant (if dirty).
11282+
11283+*/
11284+
11285+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
11286+ it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
11287+ logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
11288+ device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
11289+ device becomes sorted such that tree order and block number order fully correlate.
11290+
11291+ Resizing is done by shifting everything either all the way to the left or all the way
11292+ to the right, and then reporting the last block.
11293+*/
11294+
11295+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
11296+   describes the policy from the highest level:
11297+
11298+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
11299+ leaf level during flush-scan (right, left), then we unconditionally decide to relocate
11300+ leaf nodes.
11301+
11302+ Otherwise, there are two contexts in which we make a decision to relocate:
11303+
11304+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
11305+ During the initial stages of flush, after scan-right completes, we want to ask the
11306+ question: should we relocate this leaf node and thus dirty the parent node. Then if
11307+ the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
11308+ the question at the next level up, and so on. In these cases we are moving in the
11309+ reverse-parent first direction.
11310+
11311+ There is another case which is considered the reverse direction, which comes at the end
11312+ of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
11313+ reach a point where there is a clean twig to the right with a dirty leftmost child. In
11314+ this case, we may wish to relocate the child by testing if it should be relocated
11315+ relative to its parent.
11316+
11317+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
11318+ allocate_znode. What distinguishes the forward parent-first case from the
11319+ reverse-parent first case is that the preceder has already been allocated in the
11320+ forward case, whereas in the reverse case we don't know what the preceder is until we
11321+ finish "going in reverse". That simplifies the forward case considerably, and there we
11322+ actually use the block allocator to determine whether, e.g., a block closer to the
11323+ preceder is available.
11324+*/
11325+
11326+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
11327+ finish scan-left and find a starting point, if the parent's left neighbor is dirty then
11328+ squeeze the parent's left neighbor and the parent. This may change the
11329+ flush-starting-node's parent. Repeat until the child's parent is stable. If the child
11330+ is a leftmost child, repeat this left-edge squeezing operation at the next level up.
11331+ Note that we cannot allocate extents during this or they will be out of parent-first
11332+   order. There are also some difficult coordinate maintenance issues. We can't do a tree
11333+ search to find coordinates again (because we hold locks), we have to determine them
11334+ from the two nodes being squeezed. Looks difficult, but has potential to increase
11335+ space utilization. */
11336+
11337+/* Flush-scan helper functions. */
11338+static void scan_init(flush_scan * scan);
11339+static void scan_done(flush_scan * scan);
11340+
11341+/* Flush-scan algorithm. */
11342+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
11343+ unsigned limit);
11344+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
11345+static int scan_common(flush_scan * scan, flush_scan * other);
11346+static int scan_formatted(flush_scan * scan);
11347+static int scan_unformatted(flush_scan * scan, flush_scan * other);
11348+static int scan_by_coord(flush_scan * scan);
11349+
11350+/* Initial flush-point ancestor allocation. */
11351+static int alloc_pos_and_ancestors(flush_pos_t * pos);
11352+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
11353+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
11354+
11355+/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
11356+static int squalloc(flush_pos_t * pos);
11357+
11358+/* Flush squeeze implementation. */
11359+static int squeeze_right_non_twig(znode * left, znode * right);
11360+static int shift_one_internal_unit(znode * left, znode * right);
11361+
11362+/* Flush reverse parent-first relocation routines. */
11363+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11364+ const reiser4_block_nr * nblk);
11365+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11366+ flush_pos_t * pos);
11367+static int reverse_relocate_check_dirty_parent(jnode * node,
11368+ const coord_t * parent_coord,
11369+ flush_pos_t * pos);
11370+
11371+/* Flush allocate write-queueing functions: */
11372+static int allocate_znode(znode * node, const coord_t * parent_coord,
11373+ flush_pos_t * pos);
11374+static int allocate_znode_update(znode * node, const coord_t * parent_coord,
11375+ flush_pos_t * pos);
11376+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
11377+
11378+/* Flush helper functions: */
11379+static int jnode_lock_parent_coord(jnode * node,
11380+ coord_t * coord,
11381+ lock_handle * parent_lh,
11382+ load_count * parent_zh,
11383+ znode_lock_mode mode, int try);
11384+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
11385+ znode_lock_mode mode, int check_dirty);
11386+static int znode_same_parents(znode * a, znode * b);
11387+
11388+static int znode_check_flushprepped(znode * node)
11389+{
11390+ return jnode_check_flushprepped(ZJNODE(node));
11391+}
11392+
11393+/* Flush position functions */
11394+static void pos_init(flush_pos_t * pos);
11395+static int pos_valid(flush_pos_t * pos);
11396+static void pos_done(flush_pos_t * pos);
11397+static int pos_stop(flush_pos_t * pos);
11398+
11399+/* check that @org is the first jnode of its extent unit, if the extent is unallocated,
11400+ * because all jnodes of unallocated extent are dirty and of the same atom. */
11401+#define checkchild(scan) \
11402+assert("nikita-3435", \
11403+ ergo(scan->direction == LEFT_SIDE && \
11404+ (scan->parent_coord.node->level == TWIG_LEVEL) && \
11405+ jnode_is_unformatted(scan->node) && \
11406+ extent_is_unallocated(&scan->parent_coord), \
11407+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
11408+
11409+/* This flush_cnt variable is used to track the number of concurrent flush operations,
11410+ useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
11411+ no static initializer function...) */
11412+ON_DEBUG(atomic_t flush_cnt;
11413+ )
11414+
11415+/* check fs backing device for write congestion */
11416+static int check_write_congestion(void)
11417+{
11418+ struct super_block *sb;
11419+ struct backing_dev_info *bdi;
11420+
11421+ sb = reiser4_get_current_sb();
11422+ bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
11423+ return bdi_write_congested(bdi);
11424+}
11425+
11426+/* conditionally write flush queue */
11427+static int write_prepped_nodes(flush_pos_t * pos)
11428+{
11429+ int ret;
11430+
11431+ assert("zam-831", pos);
11432+ assert("zam-832", pos->fq);
11433+
11434+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
11435+ return 0;
11436+
11437+ if (check_write_congestion())
11438+ return 0;
11439+
11440+ ret = reiser4_write_fq(pos->fq, pos->nr_written,
11441+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11442+ return ret;
11443+}
11444+
11445+/* Properly release all flush pos. resources, then move the flush position to a new
11446+   locked node */
11447+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
11448+ load_count * new_load, const coord_t * new_coord)
11449+{
11450+ assert("zam-857", new_lock->node == new_load->node);
11451+
11452+ if (new_coord) {
11453+ assert("zam-858", new_coord->node == new_lock->node);
11454+ coord_dup(&pos->coord, new_coord);
11455+ } else {
11456+ coord_init_first_unit(&pos->coord, new_lock->node);
11457+ }
11458+
11459+ if (pos->child) {
11460+ jput(pos->child);
11461+ pos->child = NULL;
11462+ }
11463+
11464+ move_load_count(&pos->load, new_load);
11465+ done_lh(&pos->lock);
11466+ move_lh(&pos->lock, new_lock);
11467+}
11468+
11469+/* delete empty node which link from the parent still exists. */
11470+static int delete_empty_node(znode * node)
11471+{
11472+ reiser4_key smallest_removed;
11473+
11474+ assert("zam-1019", node != NULL);
11475+ assert("zam-1020", node_is_empty(node));
11476+ assert("zam-1023", znode_is_wlocked(node));
11477+
11478+ return reiser4_delete_node(node, &smallest_removed, NULL, 1);
11479+}
11480+
11481+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11482+static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11483+{
11484+ int ret;
11485+ load_count load;
11486+ lock_handle lock;
11487+
11488+ init_lh(&lock);
11489+ init_load_count(&load);
11490+
11491+ if (jnode_is_znode(org)) {
11492+ ret = longterm_lock_znode(&lock, JZNODE(org),
11493+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11494+ if (ret)
11495+ return ret;
11496+
11497+ ret = incr_load_count_znode(&load, JZNODE(org));
11498+ if (ret)
11499+ return ret;
11500+
11501+ pos->state =
11502+ (jnode_get_level(org) ==
11503+ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11504+ move_flush_pos(pos, &lock, &load, NULL);
11505+ } else {
11506+ coord_t parent_coord;
11507+ ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11508+ &load, ZNODE_WRITE_LOCK, 0);
11509+ if (ret)
11510+ goto done;
11511+ if (!item_is_extent(&parent_coord)) {
11512+ /* file was converted to tail, org became HB, we found internal
11513+ item */
11514+ ret = -EAGAIN;
11515+ goto done;
11516+ }
11517+
11518+ pos->state = POS_ON_EPOINT;
11519+ move_flush_pos(pos, &lock, &load, &parent_coord);
11520+ pos->child = jref(org);
11521+ if (extent_is_unallocated(&parent_coord)
11522+ && extent_unit_index(&parent_coord) != index_jnode(org)) {
11523+ /* @org is not first child of its parent unit. This may happen
11524+ because longerm lock of its parent node was released between
11525+ scan_left and scan_right. For now work around this having flush to repeat */
11526+ ret = -EAGAIN;
11527+ }
11528+ }
11529+
11530+ done:
11531+ done_load_count(&load);
11532+ done_lh(&lock);
11533+ return ret;
11534+}
11535+
11536+/* TODO LIST (no particular order): */
11537+/* I have labelled most of the legitimate FIXME comments in this file with letters to
11538+ indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11539+ specific names mentioned instead that need to be inspected/resolved. */
11540+/* B. There is an issue described in reverse_relocate_test having to do with an
11541+ imprecise is_preceder? check having to do with partially-dirty extents. The code that
11542+ sets preceder hints and computes the preceder is basically untested. Careful testing
11543+ needs to be done that preceder calculations are done correctly, since if it doesn't
11544+ affect correctness we will not catch this stuff during regular testing. */
11545+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11546+ considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11547+ but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11548+ Many of the calls that may produce one of these return values (i.e.,
11549+ longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11550+ values themselves and, for instance, stop flushing instead of resulting in a restart.
11551+ If any of these results are true error conditions then flush will go into a busy-loop,
11552+ as we noticed during testing when a corrupt tree caused find_child_ptr to return
11553+ ENOENT. It needs careful thought and testing of corner conditions.
11554+*/
11555+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11556+ block is assigned a block number then early-flushed to disk. It is dirtied again and
11557+ flush is called again. Concurrently, that block is deleted, and the de-allocation of
11558+ its block number does not need to be deferred, since it is not part of the preserve set
11559+ (i.e., it didn't exist before the transaction). I think there may be a race condition
11560+ where flush writes the dirty, created block after the non-deferred deallocated block
11561+ number is re-allocated, making it possible to write deleted data on top of non-deleted
11562+ data. Its just a theory, but it needs to be thought out. */
11563+/* F. bio_alloc() failure is not handled gracefully. */
11564+/* G. Unallocated children. */
11565+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11566+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11567+
11568+/* JNODE_FLUSH: MAIN ENTRY POINT */
11569+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11570+ neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty
11571+ blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
11572+ a part of transaction commit.
11573+
11574+ Our objective here is to prep and flush the slum the jnode belongs to. We want to
11575+ squish the slum together, and allocate the nodes in it as we squish because allocation
11576+ of children affects squishing of parents.
11577+
11578+ The "argument" @node tells flush where to start. From there, flush finds the left edge
11579+ of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11580+ "better place" to start squalloc first we perform a flush_scan.
11581+
11582+ Flush-scanning may be performed in both left and right directions, but for different
11583+ purposes. When scanning to the left, we are searching for a node that precedes a
11584+ sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11585+ During flush-scanning, we also take the opportunity to count the number of consecutive
11586+ leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11587+ make a decision to reallocate leaf nodes (thus favoring write-optimization).
11588+
11589+ Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11590+ also be dirty nodes to the right of the argument. If the scan-left operation does not
11591+ count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11592+ operation to see whether there is, in fact, enough nodes to meet the relocate
11593+ threshold. Each right- and left-scan operation uses a single flush_scan object.
11594+
11595+ After left-scan and possibly right-scan, we prepare a flush_position object with the
11596+ starting flush point or parent coordinate, which was determined using scan-left.
11597+
11598+ Next we call the main flush routine, squalloc, which iterates along the
11599+ leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11600+
11601+ After squalloc returns we take extra steps to ensure that all the children
11602+ of the final twig node are allocated--this involves repeating squalloc
11603+ until we finish at a twig with no unallocated children.
11604+
11605+ Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11606+ any above-twig nodes during flush_empty_queue that still have unallocated children, we
11607+ flush_unprep them.
11608+
11609+ Flush treats several "failure" cases as non-failures, essentially causing them to start
11610+ over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11611+ probably be handled properly rather than restarting, but there are a bunch of cases to
11612+ audit.
11613+*/
11614+
11615+static int
11616+jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11617+ flush_queue_t * fq, int flags)
11618+{
11619+ long ret = 0;
11620+ flush_scan *right_scan;
11621+ flush_scan *left_scan;
11622+ flush_pos_t *flush_pos;
11623+ int todo;
11624+ struct super_block *sb;
11625+ reiser4_super_info_data *sbinfo;
11626+ jnode *leftmost_in_slum = NULL;
11627+
11628+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11629+ assert("nikita-3022", reiser4_schedulable());
11630+
11631+ assert("nikita-3185",
11632+ get_current_super_private()->delete_mutex_owner != current);
11633+
11634+ /* allocate right_scan, left_scan and flush_pos */
11635+ right_scan =
11636+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11637+ reiser4_ctx_gfp_mask_get());
11638+ if (right_scan == NULL)
11639+ return RETERR(-ENOMEM);
11640+ left_scan = right_scan + 1;
11641+ flush_pos = (flush_pos_t *) (left_scan + 1);
11642+
11643+ sb = reiser4_get_current_sb();
11644+ sbinfo = get_super_private(sb);
11645+
11646+ /* Flush-concurrency debug code */
11647+#if REISER4_DEBUG
11648+ atomic_inc(&flush_cnt);
11649+#endif
11650+
11651+ reiser4_enter_flush(sb);
11652+
11653+ /* Initialize a flush position. */
11654+ pos_init(flush_pos);
11655+
11656+ flush_pos->nr_written = nr_written;
11657+ flush_pos->fq = fq;
11658+ flush_pos->flags = flags;
11659+ flush_pos->nr_to_write = nr_to_write;
11660+
11661+ scan_init(right_scan);
11662+ scan_init(left_scan);
11663+
11664+ /* First scan left and remember the leftmost scan position. If the leftmost
11665+ position is unformatted we remember its parent_coord. We scan until counting
11666+ FLUSH_SCAN_MAXNODES.
11667+
11668+ If starting @node is unformatted, at the beginning of left scan its
11669+ parent (twig level node, containing extent item) will be long term
11670+ locked and lock handle will be stored in the
11671+ @right_scan->parent_lock. This lock is used to start the rightward
11672+ scan without redoing the tree traversal (necessary to find parent)
11673+ and, hence, is kept during leftward scan. As a result, we have to
11674+ use try-lock when taking long term locks during the leftward scan.
11675+ */
11676+ ret = scan_left(left_scan, right_scan,
11677+ node, sbinfo->flush.scan_maxnodes);
11678+ if (ret != 0)
11679+ goto failed;
11680+
11681+ leftmost_in_slum = jref(left_scan->node);
11682+ scan_done(left_scan);
11683+
11684+ /* Then possibly go right to decide if we will use a policy of relocating leaves.
11685+ This is only done if we did not scan past (and count) enough nodes during the
11686+ leftward scan. If we do scan right, we only care to go far enough to establish
11687+ that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11688+ scan limit is the difference between left_scan.count and the threshold. */
11689+
11690+ todo = sbinfo->flush.relocate_threshold - left_scan->count;
11691+ /* scan right is inherently deadlock prone, because we are
11692+ * (potentially) holding a lock on the twig node at this moment.
11693+ * FIXME: this is incorrect comment: lock is not held */
11694+ if (todo > 0) {
11695+ ret = scan_right(right_scan, node, (unsigned)todo);
11696+ if (ret != 0)
11697+ goto failed;
11698+ }
11699+
11700+ /* Only the right-scan count is needed, release any rightward locks right away. */
11701+ scan_done(right_scan);
11702+
11703+ /* ... and the answer is: we should relocate leaf nodes if at least
11704+ FLUSH_RELOCATE_THRESHOLD nodes were found. */
11705+ flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11706+ (left_scan->count + right_scan->count >=
11707+ sbinfo->flush.relocate_threshold);
11708+
11709+ /* Funny business here. We set the 'point' in the flush_position at prior to
11710+ starting squalloc regardless of whether the first point is
11711+ formatted or unformatted. Without this there would be an invariant, in the
11712+ rest of the code, that if the flush_position is unformatted then
11713+ flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11714+ and if the flush_position is formatted then flush_position->point is non-NULL
11715+ and no parent info is set.
11716+
11717+ This seems lazy, but it makes the initial calls to reverse_relocate_test
11718+ (which ask "is it the pos->point the leftmost child of its parent") much easier
11719+ because we know the first child already. Nothing is broken by this, but the
11720+ reasoning is subtle. Holding an extra reference on a jnode during flush can
11721+ cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11722+ removed from sibling lists until they have zero reference count. Flush would
11723+ never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11724+ deleted to the right. So if nothing is broken, why fix it?
11725+
11726+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11727+ point and in any moment, because of the concurrent file system
11728+ activity (for example, truncate). */
11729+
11730+ /* Check jnode state after flush_scan completed. Having a lock on this
11731+ node or its parent (in case of unformatted) helps us in case of
11732+ concurrent flushing. */
11733+ if (jnode_check_flushprepped(leftmost_in_slum)
11734+ && !jnode_convertible(leftmost_in_slum)) {
11735+ ret = 0;
11736+ goto failed;
11737+ }
11738+
11739+ /* Now setup flush_pos using scan_left's endpoint. */
11740+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11741+ if (ret)
11742+ goto failed;
11743+
11744+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11745+ && node_is_empty(flush_pos->coord.node)) {
11746+ znode *empty = flush_pos->coord.node;
11747+
11748+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11749+ ret = delete_empty_node(empty);
11750+ goto failed;
11751+ }
11752+
11753+ if (jnode_check_flushprepped(leftmost_in_slum)
11754+ && !jnode_convertible(leftmost_in_slum)) {
11755+ ret = 0;
11756+ goto failed;
11757+ }
11758+
11759+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11760+ ret = alloc_pos_and_ancestors(flush_pos);
11761+ if (ret)
11762+ goto failed;
11763+
11764+ /* Do the main rightward-bottom-up squeeze and allocate loop. */
11765+ ret = squalloc(flush_pos);
11766+ pos_stop(flush_pos);
11767+ if (ret)
11768+ goto failed;
11769+
11770+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11771+ First, the pos_stop() and pos_valid() routines should be modified
11772+ so that pos_stop() sets a flush_position->stop flag to 1 without
11773+ releasing the current position immediately--instead release it in
11774+ pos_done(). This is a better implementation than the current one anyway.
11775+
11776+ It is not clear that all fields of the flush_position should not be released,
11777+ but at the very least the parent_lock, parent_coord, and parent_load should
11778+ remain held because they are hold the last twig when pos_stop() is
11779+ called.
11780+
11781+ When we reach this point in the code, if the parent_coord is set to after the
11782+ last item then we know that flush reached the end of a twig (and according to
11783+ the new flush queueing design, we will return now). If parent_coord is not
11784+ past the last item, we should check if the current twig has any unallocated
11785+ children to the right (we are not concerned with unallocated children to the
11786+ left--in that case the twig itself should not have been allocated). If the
11787+ twig has unallocated children to the right, set the parent_coord to that
11788+ position and then repeat the call to squalloc.
11789+
11790+ Testing for unallocated children may be defined in two ways: if any internal
11791+ item has a fake block number, it is unallocated; if any extent item is
11792+ unallocated then all of its children are unallocated. But there is a more
11793+ aggressive approach: if there are any dirty children of the twig to the right
11794+ of the current position, we may wish to relocate those nodes now. Checking for
11795+ potential relocation is more expensive as it requires knowing whether there are
11796+ any dirty children that are not unallocated. The extent_needs_allocation
11797+ should be used after setting the correct preceder.
11798+
11799+ When we reach the end of a twig at this point in the code, if the flush can
11800+ continue (when the queue is ready) it will need some information on the future
11801+ starting point. That should be stored away in the flush_handle using a seal, I
11802+ believe. Holding a jref() on the future starting point may break other code
11803+ that deletes that node.
11804+ */
11805+
11806+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11807+ above the twig level. If the VM calls flush above the twig level, do nothing
11808+ and return (but figure out why this happens). The txnmgr should be modified to
11809+ only flush its leaf-level dirty list. This will do all the necessary squeeze
11810+ and allocate steps but leave unallocated branches and possibly unallocated
11811+ twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11812+ level, the remaining unallocated nodes should be given write-optimized
11813+ locations. (Possibly, the remaining unallocated twigs should be allocated just
11814+ before their leftmost child.)
11815+ */
11816+
11817+ /* Any failure reaches this point. */
11818+ failed:
11819+
11820+ switch (ret) {
11821+ case -E_REPEAT:
11822+ case -EINVAL:
11823+ case -E_DEADLOCK:
11824+ case -E_NO_NEIGHBOR:
11825+ case -ENOENT:
11826+ /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11827+ in each case. They already are handled in many cases. */
11828+ /* Something bad happened, but difficult to avoid... Try again! */
11829+ ret = 0;
11830+ }
11831+
11832+ if (leftmost_in_slum)
11833+ jput(leftmost_in_slum);
11834+
11835+ pos_done(flush_pos);
11836+ scan_done(left_scan);
11837+ scan_done(right_scan);
11838+ kfree(right_scan);
11839+
11840+ ON_DEBUG(atomic_dec(&flush_cnt));
11841+
11842+ reiser4_leave_flush(sb);
11843+
11844+ return ret;
11845+}
11846+
11847+/* The reiser4 flush subsystem can be turned into "rapid flush mode" means that
11848+ * flusher should submit all prepped nodes immediately without keeping them in
11849+ * flush queues for long time. The reason for rapid flush mode is to free
11850+ * memory as fast as possible. */
11851+
11852+#if REISER4_USE_RAPID_FLUSH
11853+
11854+/**
11855+ * submit all prepped nodes if rapid flush mode is set,
11856+ * turn rapid flush mode off.
11857+ */
11858+
11859+static int rapid_flush(flush_pos_t * pos)
11860+{
11861+ if (!wbq_available())
11862+ return 0;
11863+
11864+ return write_prepped_nodes(pos);
11865+}
11866+
11867+#else
11868+
11869+#define rapid_flush(pos) (0)
11870+
11871+#endif /* REISER4_USE_RAPID_FLUSH */
11872+
11873+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11874+ flush_queue_t *fq, int *nr_queued,
11875+ int flags)
11876+{
11877+ jnode * node;
11878+
11879+ if (start != NULL) {
11880+ spin_lock_jnode(start);
11881+ if (!jnode_is_flushprepped(start)) {
11882+ assert("zam-1056", start->atom == atom);
11883+ node = start;
11884+ goto enter;
11885+ }
11886+ spin_unlock_jnode(start);
11887+ }
11888+ /*
11889+ * In this loop we process all already prepped (RELOC or OVRWR) and dirtied again
11890+ * nodes. The atom spin lock is not released until all dirty nodes processed or
11891+ * not prepped node found in the atom dirty lists.
11892+ */
11893+ while ((node = find_first_dirty_jnode(atom, flags))) {
11894+ spin_lock_jnode(node);
11895+ enter:
11896+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11897+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11898+
11899+ if (JF_ISSET(node, JNODE_WRITEBACK)) {
11900+ /* move node to the end of atom's writeback list */
11901+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11902+
11903+ /*
11904+ * jnode is not necessarily on dirty list: if it was dirtied when
11905+ * it was on flush queue - it does not get moved to dirty list
11906+ */
11907+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11908+ WB_LIST, 1));
11909+
11910+ } else if (jnode_is_znode(node)
11911+ && znode_above_root(JZNODE(node))) {
11912+ /*
11913+ * A special case for znode-above-root. The above-root (fake)
11914+ * znode is captured and dirtied when the tree height changes or
11915+ * when the root node is relocated. This causes atoms to fuse so
11916+ * that changes at the root are serialized. However, this node is
11917+ * never flushed. This special case used to be in lock.c to
11918+ * prevent the above-root node from ever being captured, but now
11919+ * that it is captured we simply prevent it from flushing. The
11920+ * log-writer code relies on this to properly log superblock
11921+ * modifications of the tree height.
11922+ */
11923+ jnode_make_wander_nolock(node);
11924+ } else if (JF_ISSET(node, JNODE_RELOC)) {
11925+ queue_jnode(fq, node);
11926+ ++(*nr_queued);
11927+ } else
11928+ break;
11929+
11930+ spin_unlock_jnode(node);
11931+ }
11932+ return node;
11933+}
11934+
11935+/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes
11936+ * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return
11937+ * other errors as they are. */
11938+int
11939+flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11940+ txn_atom ** atom, jnode *start)
11941+{
11942+ reiser4_super_info_data *sinfo = get_current_super_private();
11943+ flush_queue_t *fq = NULL;
11944+ jnode *node;
11945+ int nr_queued;
11946+ int ret;
11947+
11948+ assert("zam-889", atom != NULL && *atom != NULL);
11949+ assert_spin_locked(&((*atom)->alock));
11950+ assert("zam-892", get_current_context()->trans->atom == *atom);
11951+
11952+ nr_to_write = LONG_MAX;
11953+ while (1) {
11954+ ret = reiser4_fq_by_atom(*atom, &fq);
11955+ if (ret != -E_REPEAT)
11956+ break;
11957+ *atom = get_current_atom_locked();
11958+ }
11959+ if (ret)
11960+ return ret;
11961+
11962+ assert_spin_locked(&((*atom)->alock));
11963+
11964+ /* parallel flushers limit */
11965+ if (sinfo->tmgr.atom_max_flushers != 0) {
11966+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11967+ /* An reiser4_atom_send_event() call is inside
11968+ reiser4_fq_put_nolock() which is called when flush is
11969+ finished and nr_flushers is decremented. */
11970+ reiser4_atom_wait_event(*atom);
11971+ *atom = get_current_atom_locked();
11972+ }
11973+ }
11974+
11975+ /* count ourself as a flusher */
11976+ (*atom)->nr_flushers++;
11977+
11978+ writeout_mode_enable();
11979+
11980+ nr_queued = 0;
11981+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11982+
11983+ if (node == NULL) {
11984+ if (nr_queued == 0) {
11985+ (*atom)->nr_flushers--;
11986+ reiser4_fq_put_nolock(fq);
11987+ reiser4_atom_send_event(*atom);
11988+ /* current atom remains locked */
11989+ writeout_mode_disable();
11990+ return 0;
11991+ }
11992+ spin_unlock_atom(*atom);
11993+ } else {
11994+ jref(node);
11995+ BUG_ON((*atom)->super != node->tree->super);
11996+ spin_unlock_atom(*atom);
11997+ spin_unlock_jnode(node);
11998+ BUG_ON(nr_to_write == 0);
11999+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
12000+ jput(node);
12001+ }
12002+
12003+ ret =
12004+ reiser4_write_fq(fq, nr_submitted,
12005+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
12006+
12007+ *atom = get_current_atom_locked();
12008+ (*atom)->nr_flushers--;
12009+ reiser4_fq_put_nolock(fq);
12010+ reiser4_atom_send_event(*atom);
12011+ spin_unlock_atom(*atom);
12012+
12013+ writeout_mode_disable();
12014+
12015+ if (ret == 0)
12016+ ret = -E_REPEAT;
12017+
12018+ return ret;
12019+}
12020+
12021+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
12022+
12023+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
12024+ reverse parent-first relocate context. Here all we know is the preceder and the block
12025+ number. Since we are going in reverse, the preceder may still be relocated as well, so
12026+ we can't ask the block allocator "is there a closer block available to relocate?" here.
12027+ In the _forward_ parent-first relocate context (not here) we actually call the block
12028+ allocator to try and find a closer location. */
12029+static int
12030+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
12031+ const reiser4_block_nr * nblk)
12032+{
12033+ reiser4_block_nr dist;
12034+
12035+ assert("jmacd-7710", *pblk != 0 && *nblk != 0);
12036+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
12037+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
12038+
12039+ /* Distance is the absolute value. */
12040+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
12041+
12042+ /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
12043+ block, do not relocate. */
12044+ if (dist <= get_current_super_private()->flush.relocate_distance) {
12045+ return 0;
12046+ }
12047+
12048+ return 1;
12049+}
12050+
12051+/* This function is a predicate that tests for relocation. Always called in the
12052+ reverse-parent-first context, when we are asking whether the current node should be
12053+ relocated in order to expand the flush by dirtying the parent level (and thus
12054+ proceeding to flush that level). When traversing in the forward parent-first direction
12055+ (not here), relocation decisions are handled in two places: allocate_znode() and
12056+ extent_needs_allocation(). */
12057+static int
12058+reverse_relocate_test(jnode * node, const coord_t * parent_coord,
12059+ flush_pos_t * pos)
12060+{
12061+ reiser4_block_nr pblk = 0;
12062+ reiser4_block_nr nblk = 0;
12063+
12064+ assert("jmacd-8989", !jnode_is_root(node));
12065+
12066+ /*
12067+ * This function is called only from the
12068+ * reverse_relocate_check_dirty_parent() and only if the parent
12069+ * node is clean. This implies that the parent has the real (i.e., not
12070+ * fake) block number, and, so does the child, because otherwise the
12071+ * parent would be dirty.
12072+ */
12073+
12074+ /* New nodes are treated as if they are being relocated. */
12075+ if (JF_ISSET (node, JNODE_CREATED) ||
12076+ (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
12077+ return 1;
12078+ }
12079+
12080+ /* Find the preceder. FIXME(B): When the child is an unformatted, previously
12081+ existing node, the coord may be leftmost even though the child is not the
12082+ parent-first preceder of the parent. If the first dirty node appears somewhere
12083+ in the middle of the first extent unit, this preceder calculation is wrong.
12084+ Needs more logic in here. */
12085+ if (coord_is_leftmost_unit(parent_coord)) {
12086+ pblk = *znode_get_block(parent_coord->node);
12087+ } else {
12088+ pblk = pos->preceder.blk;
12089+ }
12090+ check_preceder(pblk);
12091+
12092+ /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
12093+ if (pblk == 0) {
12094+ return 1;
12095+ }
12096+
12097+ nblk = *jnode_get_block(node);
12098+
12099+ if (reiser4_blocknr_is_fake(&nblk))
12100+ /* child is unallocated, mark parent dirty */
12101+ return 1;
12102+
12103+ return reverse_relocate_if_close_enough(&pblk, &nblk);
12104+}
12105+
12106+/* This function calls reverse_relocate_test to make a reverse-parent-first
12107+ relocation decision and then, if yes, it marks the parent dirty. */
12108+static int
12109+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
12110+ flush_pos_t * pos)
12111+{
12112+ int ret;
12113+
12114+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
12115+
12116+ ret = reverse_relocate_test(node, parent_coord, pos);
12117+ if (ret < 0) {
12118+ return ret;
12119+ }
12120+
12121+ /* FIXME-ZAM
12122+ if parent is already relocated - we do not want to grab space, right? */
12123+ if (ret == 1) {
12124+ int grabbed;
12125+
12126+ grabbed = get_current_context()->grabbed_blocks;
12127+ if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
12128+ 0)
12129+ reiser4_panic("umka-1250",
12130+ "No space left during flush.");
12131+
12132+ assert("jmacd-18923",
12133+ znode_is_write_locked(parent_coord->node));
12134+ znode_make_dirty(parent_coord->node);
12135+ grabbed2free_mark(grabbed);
12136+ }
12137+ }
12138+
12139+ return 0;
12140+}
12141+
12142+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
12143+ PARENT-FIRST LOOP BEGINS) */
12144+
12145+/* Get the leftmost child for given coord. */
12146+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
12147+{
12148+ int ret;
12149+
12150+ ret = item_utmost_child(coord, LEFT_SIDE, child);
12151+
12152+ if (ret)
12153+ return ret;
12154+
12155+ if (IS_ERR(*child))
12156+ return PTR_ERR(*child);
12157+
12158+ return 0;
12159+}
12160+
12161+/* This step occurs after the left- and right-scans are completed, before starting the
12162+ forward parent-first traversal. Here we attempt to allocate ancestors of the starting
12163+ flush point, which means continuing in the reverse parent-first direction to the
12164+ parent, grandparent, and so on (as long as the child is a leftmost child). This
12165+ routine calls a recursive process, alloc_one_ancestor, which does the real work,
12166+ except there is special-case handling here for the first ancestor, which may be a twig.
12167+ At each level (here and alloc_one_ancestor), we check for relocation and then, if
12168+ the child is a leftmost child, repeat at the next level. On the way back down (the
12169+ recursion), we allocate the ancestors in parent-first order. */
12170+static int alloc_pos_and_ancestors(flush_pos_t * pos)
12171+{
12172+ int ret = 0;
12173+ lock_handle plock;
12174+ load_count pload;
12175+ coord_t pcoord;
12176+
12177+ if (znode_check_flushprepped(pos->lock.node))
12178+ return 0;
12179+
12180+ coord_init_invalid(&pcoord, NULL);
12181+ init_lh(&plock);
12182+ init_load_count(&pload);
12183+
12184+ if (pos->state == POS_ON_EPOINT) {
12185+ /* a special case for pos on twig level, where we already have
12186+ a lock on parent node. */
12187+ /* The parent may not be dirty, in which case we should decide
12188+ whether to relocate the child now. If decision is made to
12189+ relocate the child, the parent is marked dirty. */
12190+ ret =
12191+ reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
12192+ pos);
12193+ if (ret)
12194+ goto exit;
12195+
12196+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
12197+ is leftmost) and the leaf/child, so recursion is not needed.
12198+ Levels above the twig will be allocated for
12199+ write-optimization before the transaction commits. */
12200+
12201+ /* Do the recursive step, allocating zero or more of our
12202+ * ancestors. */
12203+ ret = alloc_one_ancestor(&pos->coord, pos);
12204+
12205+ } else {
12206+ if (!znode_is_root(pos->lock.node)) {
12207+ /* all formatted nodes except tree root */
12208+ ret =
12209+ reiser4_get_parent(&plock, pos->lock.node,
12210+ ZNODE_WRITE_LOCK);
12211+ if (ret)
12212+ goto exit;
12213+
12214+ ret = incr_load_count_znode(&pload, plock.node);
12215+ if (ret)
12216+ goto exit;
12217+
12218+ ret =
12219+ find_child_ptr(plock.node, pos->lock.node, &pcoord);
12220+ if (ret)
12221+ goto exit;
12222+
12223+ ret =
12224+ reverse_relocate_check_dirty_parent(ZJNODE
12225+ (pos->lock.
12226+ node), &pcoord,
12227+ pos);
12228+ if (ret)
12229+ goto exit;
12230+
12231+ ret = alloc_one_ancestor(&pcoord, pos);
12232+ if (ret)
12233+ goto exit;
12234+ }
12235+
12236+ ret = allocate_znode(pos->lock.node, &pcoord, pos);
12237+ }
12238+ exit:
12239+ done_load_count(&pload);
12240+ done_lh(&plock);
12241+ return ret;
12242+}
12243+
12244+/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
12245+ call to set_preceder, which is the next function described, this checks if the
12246+ child is a leftmost child and returns if it is not. If the child is a leftmost child
12247+ it checks for relocation, possibly dirtying the parent. Then it performs the recursive
12248+ step. */
12249+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
12250+{
12251+ int ret = 0;
12252+ lock_handle alock;
12253+ load_count aload;
12254+ coord_t acoord;
12255+
12256+ /* As we ascend at the left-edge of the region to flush, take this opportunity at
12257+ the twig level to find our parent-first preceder unless we have already set
12258+ it. */
12259+ if (pos->preceder.blk == 0) {
12260+ ret = set_preceder(coord, pos);
12261+ if (ret != 0)
12262+ return ret;
12263+ }
12264+
12265+ /* If the ancestor is clean or already allocated, or if the child is not a
12266+ leftmost child, stop going up, even leaving coord->node not flushprepped. */
12267+ if (znode_check_flushprepped(coord->node)
12268+ || !coord_is_leftmost_unit(coord))
12269+ return 0;
12270+
12271+ init_lh(&alock);
12272+ init_load_count(&aload);
12273+ coord_init_invalid(&acoord, NULL);
12274+
12275+ /* Only ascend to the next level if it is a leftmost child, but write-lock the
12276+ parent in case we will relocate the child. */
12277+ if (!znode_is_root(coord->node)) {
12278+
12279+ ret =
12280+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
12281+ &alock, &aload, ZNODE_WRITE_LOCK,
12282+ 0);
12283+ if (ret != 0) {
12284+ /* FIXME(C): check EINVAL, E_DEADLOCK */
12285+ goto exit;
12286+ }
12287+
12288+ ret =
12289+ reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
12290+ &acoord, pos);
12291+ if (ret != 0) {
12292+ goto exit;
12293+ }
12294+
12295+ /* Recursive call. */
12296+ if (!znode_check_flushprepped(acoord.node)) {
12297+ ret = alloc_one_ancestor(&acoord, pos);
12298+ if (ret)
12299+ goto exit;
12300+ }
12301+ }
12302+
12303+ /* Note: we call allocate with the parent write-locked (except at the root) in
12304+ case we relocate the child, in which case it will modify the parent during this
12305+ call. */
12306+ ret = allocate_znode(coord->node, &acoord, pos);
12307+
12308+ exit:
12309+ done_load_count(&aload);
12310+ done_lh(&alock);
12311+ return ret;
12312+}
12313+
12314+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
12315+ a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
12316+ should this node be relocated (in reverse parent-first context)? We repeat this
12317+ process as long as the child is the leftmost child, eventually reaching an ancestor of
12318+ the flush point that is not a leftmost child. The preceder of that ancestors, which is
12319+ not a leftmost child, is actually on the leaf level. The preceder of that block is the
12320+ left-neighbor of the flush point. The preceder of that block is the rightmost child of
12321+ the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig
12322+ level, it stops momentarily to remember the block of the rightmost child of the twig on
12323+ the left and sets it to the flush_position's preceder_hint.
12324+
12325+ There is one other place where we may set the flush_position's preceder hint, which is
12326+ during scan-left.
12327+*/
12328+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
12329+{
12330+ int ret;
12331+ coord_t coord;
12332+ lock_handle left_lock;
12333+ load_count left_load;
12334+
12335+ coord_dup(&coord, coord_in);
12336+
12337+ init_lh(&left_lock);
12338+ init_load_count(&left_load);
12339+
12340+ /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
12341+ coord_is_leftmost_unit is not the right test if the unformatted child is in the
12342+ middle of the first extent unit. */
12343+ if (!coord_is_leftmost_unit(&coord)) {
12344+ coord_prev_unit(&coord);
12345+ } else {
12346+ ret =
12347+ reiser4_get_left_neighbor(&left_lock, coord.node,
12348+ ZNODE_READ_LOCK, GN_SAME_ATOM);
12349+ if (ret) {
12350+ /* If we fail for any reason it doesn't matter because the
12351+ preceder is only a hint. We are low-priority at this point, so
12352+ this must be the case. */
12353+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
12354+ ret == -ENOENT || ret == -EINVAL
12355+ || ret == -E_DEADLOCK) {
12356+ ret = 0;
12357+ }
12358+ goto exit;
12359+ }
12360+
12361+ ret = incr_load_count_znode(&left_load, left_lock.node);
12362+ if (ret)
12363+ goto exit;
12364+
12365+ coord_init_last_unit(&coord, left_lock.node);
12366+ }
12367+
12368+ ret =
12369+ item_utmost_child_real_block(&coord, RIGHT_SIDE,
12370+ &pos->preceder.blk);
12371+ exit:
12372+ check_preceder(pos->preceder.blk);
12373+ done_load_count(&left_load);
12374+ done_lh(&left_lock);
12375+ return ret;
12376+}
12377+
12378+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
12379+
12380+/* This procedure implements the outer loop of the flush algorithm. To put this in
12381+ context, here is the general list of steps taken by the flush routine as a whole:
12382+
12383+ 1. Scan-left
12384+ 2. Scan-right (maybe)
12385+ 3. Allocate initial flush position and its ancestors
12386+ 4. <handle extents>
12387+ 5. <squeeze and next position and its ancestors to-the-right,
12388+ then update position to-the-right>
12389+ 6. <repeat from #4 until flush is stopped>
12390+
12391+ This procedure implements the loop in steps 4 through 6 in the above listing.
12392+
12393+ Step 4: if the current flush position is an extent item (position on the twig level),
12394+ it allocates the extent (allocate_extent_item_in_place) then shifts to the next
12395+ coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
12396+ If the next coordinate is an internal item, we descend back to the leaf level,
12397+ otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
12398+ brings us past the end of the twig level, then we call
12399+ reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
12400+ step #5 which moves to the right.
12401+
12402+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
12403+ tree to allocate any ancestors of the next-right flush position that are not also
12404+ ancestors of the current position. Those ancestors (in top-down order) are the next in
12405+ parent-first order. We squeeze adjacent nodes on the way up until the right node and
12406+ current node share the same parent, then allocate on the way back down. Finally, this
12407+ step sets the flush position to the next-right node. Then repeat steps 4 and 5.
12408+*/
12409+
12410+/* SQUEEZE CODE */
12411+
12412+/* squalloc_right_twig helper function, cut a range of extent items from
12413+ cut node to->node from the beginning up to coord @to. */
12414+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
12415+ znode * left)
12416+{
12417+ coord_t from;
12418+ reiser4_key from_key;
12419+
12420+ coord_init_first_unit(&from, to->node);
12421+ item_key_by_coord(&from, &from_key);
12422+
12423+ return cut_node_content(&from, to, &from_key, to_key, NULL);
12424+}
12425+
12426+/* Copy as much of the leading extents from @right to @left, allocating
12427+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
12428+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
12429+ internal item it calls shift_one_internal_unit and may then return
12430+ SUBTREE_MOVED. */
12431+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
12432+{
12433+ int ret = SUBTREE_MOVED;
12434+ coord_t coord; /* used to iterate over items */
12435+ reiser4_key stop_key;
12436+
12437+ assert("jmacd-2008", !node_is_empty(right));
12438+ coord_init_first_unit(&coord, right);
12439+
12440+ /* FIXME: can be optimized to cut once */
12441+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12442+ ON_DEBUG(void *vp);
12443+
12444+ assert("vs-1468", coord_is_leftmost_unit(&coord));
12445+ ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12446+
12447+ /* stop_key is used to find what was copied and what to cut */
12448+ stop_key = *reiser4_min_key();
12449+ ret = squalloc_extent(left, &coord, pos, &stop_key);
12450+ if (ret != SQUEEZE_CONTINUE) {
12451+ ON_DEBUG(kfree(vp));
12452+ break;
12453+ }
12454+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
12455+
12456+ /* Helper function to do the cutting. */
12457+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12458+ check_me("vs-1466",
12459+ squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12460+
12461+ ON_DEBUG(shift_check(vp, left, coord.node));
12462+ }
12463+
12464+ if (node_is_empty(coord.node))
12465+ ret = SQUEEZE_SOURCE_EMPTY;
12466+
12467+ if (ret == SQUEEZE_TARGET_FULL) {
12468+ goto out;
12469+ }
12470+
12471+ if (node_is_empty(right)) {
12472+ /* The whole right node was copied into @left. */
12473+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12474+ goto out;
12475+ }
12476+
12477+ coord_init_first_unit(&coord, right);
12478+
12479+ if (!item_is_internal(&coord)) {
12480+ /* we do not want to squeeze anything else to left neighbor because "slum"
12481+ is over */
12482+ ret = SQUEEZE_TARGET_FULL;
12483+ goto out;
12484+ }
12485+ assert("jmacd-433", item_is_internal(&coord));
12486+
12487+ /* Shift an internal unit. The child must be allocated before shifting any more
12488+ extents, so we stop here. */
12489+ ret = shift_one_internal_unit(left, right);
12490+
12491+ out:
12492+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12493+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12494+
12495+ if (ret == SQUEEZE_TARGET_FULL) {
12496+ /* We submit prepped nodes here and expect that this @left twig
12497+ * will not be modified again during this jnode_flush() call. */
12498+ int ret1;
12499+
12500+ /* NOTE: seems like io is done under long term locks. */
12501+ ret1 = write_prepped_nodes(pos);
12502+ if (ret1 < 0)
12503+ return ret1;
12504+ }
12505+
12506+ return ret;
12507+}
12508+
12509+#if REISER4_DEBUG
12510+static void item_convert_invariant(flush_pos_t * pos)
12511+{
12512+ assert("edward-1225", coord_is_existing_item(&pos->coord));
12513+ if (chaining_data_present(pos)) {
12514+ item_plugin *iplug = item_convert_plug(pos);
12515+
12516+ assert("edward-1000",
12517+ iplug == item_plugin_by_coord(&pos->coord));
12518+ assert("edward-1001", iplug->f.convert != NULL);
12519+ } else
12520+ assert("edward-1226", pos->child == NULL);
12521+}
12522+#else
12523+
12524+#define item_convert_invariant(pos) noop
12525+
12526+#endif
12527+
12528+/* Scan node items starting from the first one and apply for each
12529+ item its flush ->convert() method (if any). This method may
12530+ resize/kill the item so the tree will be changed.
12531+*/
12532+static int convert_node(flush_pos_t * pos, znode * node)
12533+{
12534+ int ret = 0;
12535+ item_plugin *iplug;
12536+
12537+ assert("edward-304", pos != NULL);
12538+ assert("edward-305", pos->child == NULL);
12539+ assert("edward-475", znode_convertible(node));
12540+ assert("edward-669", znode_is_wlocked(node));
12541+ assert("edward-1210", !node_is_empty(node));
12542+
12543+ if (znode_get_level(node) != LEAF_LEVEL)
12544+ /* unsupported */
12545+ goto exit;
12546+
12547+ coord_init_first_unit(&pos->coord, node);
12548+
12549+ while (1) {
12550+ ret = 0;
12551+ coord_set_to_left(&pos->coord);
12552+ item_convert_invariant(pos);
12553+
12554+ iplug = item_plugin_by_coord(&pos->coord);
12555+ assert("edward-844", iplug != NULL);
12556+
12557+ if (iplug->f.convert) {
12558+ ret = iplug->f.convert(pos);
12559+ if (ret)
12560+ goto exit;
12561+ }
12562+ assert("edward-307", pos->child == NULL);
12563+
12564+ if (coord_next_item(&pos->coord)) {
12565+ /* node is over */
12566+
12567+ if (!chaining_data_present(pos))
12568+ /* finished this node */
12569+ break;
12570+ if (should_chain_next_node(pos)) {
12571+ /* go to next node */
12572+ move_chaining_data(pos, 0 /* to next node */ );
12573+ break;
12574+ }
12575+ /* repeat this node */
12576+ move_chaining_data(pos, 1 /* this node */ );
12577+ continue;
12578+ }
12579+ /* Node is not over.
12580+ Check if there is attached convert data.
12581+ If so roll one item position back and repeat
12582+ on this node
12583+ */
12584+ if (chaining_data_present(pos)) {
12585+
12586+ if (iplug != item_plugin_by_coord(&pos->coord))
12587+ set_item_convert_count(pos, 0);
12588+
12589+ ret = coord_prev_item(&pos->coord);
12590+ assert("edward-1003", !ret);
12591+
12592+ move_chaining_data(pos, 1 /* this node */ );
12593+ }
12594+ }
12595+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12596+ znode_make_dirty(node);
12597+ exit:
12598+ assert("edward-1004", !ret);
12599+ return ret;
12600+}
12601+
12602+/* Squeeze and allocate the right neighbor. This is called after @left and
12603+ its current children have been squeezed and allocated already. This
12604+ procedure's job is to squeeze and items from @right to @left.
12605+
12606+ If at the leaf level, use the shift_everything_left memcpy-optimized
12607+ version of shifting (squeeze_right_leaf).
12608+
12609+ If at the twig level, extents are allocated as they are shifted from @right
12610+ to @left (squalloc_right_twig).
12611+
12612+ At any other level, shift one internal item and return to the caller
12613+ (squalloc_parent_first) so that the shifted-subtree can be processed in
12614+ parent-first order.
12615+
12616+ When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
12617+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12618+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12619+ is returned.
12620+*/
12621+
12622+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12623+ znode * right)
12624+{
12625+ int ret;
12626+
12627+ /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12628+ * tree owing to error (for example, ENOSPC) in write */
12629+ /* assert("jmacd-9321", !node_is_empty(left)); */
12630+ assert("jmacd-9322", !node_is_empty(right));
12631+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12632+
12633+ switch (znode_get_level(left)) {
12634+ case TWIG_LEVEL:
12635+ /* Shift with extent allocating until either an internal item
12636+ is encountered or everything is shifted or no free space
12637+ left in @left */
12638+ ret = squeeze_right_twig(left, right, pos);
12639+ break;
12640+
12641+ default:
12642+ /* All other levels can use shift_everything until we implement per-item
12643+ flush plugins. */
12644+ ret = squeeze_right_non_twig(left, right);
12645+ break;
12646+ }
12647+
12648+ assert("jmacd-2011", (ret < 0 ||
12649+ ret == SQUEEZE_SOURCE_EMPTY
12650+ || ret == SQUEEZE_TARGET_FULL
12651+ || ret == SUBTREE_MOVED));
12652+ return ret;
12653+}
12654+
12655+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12656+ znode * right)
12657+{
12658+ int ret;
12659+
12660+ ret = squeeze_right_twig(pos->lock.node, right, pos);
12661+ if (ret < 0)
12662+ return ret;
12663+ if (ret > 0) {
12664+ coord_init_after_last_item(&pos->coord, pos->lock.node);
12665+ return ret;
12666+ }
12667+
12668+ coord_init_last_unit(&pos->coord, pos->lock.node);
12669+ return 0;
12670+}
12671+
12672+/* forward declaration */
12673+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12674+
12675+/* do a fast check for "same parents" condition before calling
12676+ * squalloc_upper_levels() */
12677+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12678+ znode * left,
12679+ znode * right)
12680+{
12681+ if (znode_same_parents(left, right))
12682+ return 0;
12683+
12684+ return squalloc_upper_levels(pos, left, right);
12685+}
12686+
12687+/* Check whether the parent of given @right node needs to be processed
12688+ ((re)allocated) prior to processing of the child. If @left and @right do
12689+ not share a parent, then at least the parent of the @right is after the
12690+ @left but before the @right in parent-first order, so we have to
12691+ (re)allocate it before the @right gets (re)allocated. */
12692+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12693+{
12694+ int ret;
12695+
12696+ lock_handle left_parent_lock;
12697+ lock_handle right_parent_lock;
12698+
12699+ load_count left_parent_load;
12700+ load_count right_parent_load;
12701+
12702+ init_lh(&left_parent_lock);
12703+ init_lh(&right_parent_lock);
12704+
12705+ init_load_count(&left_parent_load);
12706+ init_load_count(&right_parent_load);
12707+
12708+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12709+ if (ret)
12710+ goto out;
12711+
12712+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12713+ if (ret)
12714+ goto out;
12715+
12716+ /* Check for same parents */
12717+ if (left_parent_lock.node == right_parent_lock.node)
12718+ goto out;
12719+
12720+ if (znode_check_flushprepped(right_parent_lock.node)) {
12721+ /* Keep parent-first order. In the order, the right parent node stands
12722+ before the @right node. If it is already allocated, we set the
12723+ preceder (next block search start point) to its block number, @right
12724+ node should be allocated after it.
12725+
12726+ However, preceder is set only if the right parent is on twig level.
12727+ The explanation is the following: new branch nodes are allocated over
12728+ already allocated children while the tree grows, it is difficult to
12729+ keep tree ordered, we assume that only leaves and twigs are correctly
12730+ allocated. So, only twigs are used as a preceder for allocating of the
12731+ rest of the slum. */
12732+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12733+ pos->preceder.blk =
12734+ *znode_get_block(right_parent_lock.node);
12735+ check_preceder(pos->preceder.blk);
12736+ }
12737+ goto out;
12738+ }
12739+
12740+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12741+ if (ret)
12742+ goto out;
12743+
12744+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12745+ if (ret)
12746+ goto out;
12747+
12748+ ret =
12749+ squeeze_right_neighbor(pos, left_parent_lock.node,
12750+ right_parent_lock.node);
12751+ /* We stop if error. We stop if some items/units were shifted (ret == 0)
12752+ * and thus @right changed its parent. It means we do not have to process
12753+ * the right_parent node prior to processing of @right. Positive return
12754+ * values say that shifting of items did not happen because of "empty
12755+ * source" or "target full" conditions. */
12756+ if (ret <= 0)
12757+ goto out;
12758+
12759+ /* parent(@left) and parent(@right) may have different parents also. We
12760+ * do a recursive call for checking that. */
12761+ ret =
12762+ check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12763+ right_parent_lock.node);
12764+ if (ret)
12765+ goto out;
12766+
12767+ /* allocate znode when going down */
12768+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12769+
12770+ out:
12771+ done_load_count(&left_parent_load);
12772+ done_load_count(&right_parent_load);
12773+
12774+ done_lh(&left_parent_lock);
12775+ done_lh(&right_parent_lock);
12776+
12777+ return ret;
12778+}
12779+
12780+/* Check the leftmost child "flushprepped" status, also returns true if child
12781+ * node was not found in cache. */
12782+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12783+{
12784+ int ret;
12785+ int prepped;
12786+
12787+ jnode *child;
12788+
12789+ ret = get_leftmost_child_of_unit(coord, &child);
12790+
12791+ if (ret)
12792+ return ret;
12793+
12794+ if (child) {
12795+ prepped = jnode_check_flushprepped(child);
12796+ jput(child);
12797+ } else {
12798+ /* We consider not existing child as a node which slum
12799+ processing should not continue to. Not cached node is clean,
12800+ so it is flushprepped. */
12801+ prepped = 1;
12802+ }
12803+
12804+ return prepped;
12805+}
12806+
12807+/* (re)allocate znode with automated getting parent node */
12808+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12809+{
12810+ int ret;
12811+ lock_handle parent_lock;
12812+ load_count parent_load;
12813+ coord_t pcoord;
12814+
12815+ assert("zam-851", znode_is_write_locked(node));
12816+
12817+ init_lh(&parent_lock);
12818+ init_load_count(&parent_load);
12819+
12820+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12821+ if (ret)
12822+ goto out;
12823+
12824+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
12825+ if (ret)
12826+ goto out;
12827+
12828+ ret = find_child_ptr(parent_lock.node, node, &pcoord);
12829+ if (ret)
12830+ goto out;
12831+
12832+ ret = allocate_znode(node, &pcoord, pos);
12833+
12834+ out:
12835+ done_load_count(&parent_load);
12836+ done_lh(&parent_lock);
12837+ return ret;
12838+}
12839+
12840+/* Process nodes on leaf level until unformatted node or rightmost node in the
12841+ * slum reached. */
12842+static int handle_pos_on_formatted(flush_pos_t * pos)
12843+{
12844+ int ret;
12845+ lock_handle right_lock;
12846+ load_count right_load;
12847+
12848+ init_lh(&right_lock);
12849+ init_load_count(&right_load);
12850+
12851+ if (should_convert_node(pos, pos->lock.node)) {
12852+ ret = convert_node(pos, pos->lock.node);
12853+ if (ret)
12854+ return ret;
12855+ }
12856+
12857+ while (1) {
12858+ ret =
12859+ neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12860+ ZNODE_WRITE_LOCK,
12861+ !should_convert_next_node(pos,
12862+ right_lock.
12863+ node));
12864+ if (ret)
12865+ break;
12866+
12867+ /* we don't prep(allocate) nodes for flushing twice. This can be suboptimal, or it
12868+ * can be optimal. For now we choose to live with the risk that it will
12869+ * be suboptimal because it would be quite complex to code it to be
12870+ * smarter. */
12871+ if (znode_check_flushprepped(right_lock.node)
12872+ && !znode_convertible(right_lock.node)) {
12873+ assert("edward-1005",
12874+ !should_convert_next_node(pos, right_lock.node));
12875+ pos_stop(pos);
12876+ break;
12877+ }
12878+
12879+ ret = incr_load_count_znode(&right_load, right_lock.node);
12880+ if (ret)
12881+ break;
12882+
12883+ if (should_convert_node(pos, right_lock.node)) {
12884+ ret = convert_node(pos, right_lock.node);
12885+ if (ret)
12886+ break;
12887+ if (node_is_empty(right_lock.node)) {
12888+ /* node became empty after converting, repeat */
12889+ done_load_count(&right_load);
12890+ done_lh(&right_lock);
12891+ continue;
12892+ }
12893+ }
12894+
12895+ /* squeeze _before_ going upward. */
12896+ ret =
12897+ squeeze_right_neighbor(pos, pos->lock.node,
12898+ right_lock.node);
12899+ if (ret < 0)
12900+ break;
12901+
12902+ if (znode_check_flushprepped(right_lock.node)) {
12903+ if (should_convert_next_node(pos, right_lock.node)) {
12904+ /* in spite of flushprepped status of the node,
12905+ its right slum neighbor should be converted */
12906+ assert("edward-953", convert_data(pos));
12907+ assert("edward-954", item_convert_data(pos));
12908+
12909+ if (node_is_empty(right_lock.node)) {
12910+ done_load_count(&right_load);
12911+ done_lh(&right_lock);
12912+ } else
12913+ move_flush_pos(pos, &right_lock,
12914+ &right_load, NULL);
12915+ continue;
12916+ }
12917+ pos_stop(pos);
12918+ break;
12919+ }
12920+
12921+ if (node_is_empty(right_lock.node)) {
12922+ /* repeat if right node was squeezed completely */
12923+ done_load_count(&right_load);
12924+ done_lh(&right_lock);
12925+ continue;
12926+ }
12927+
12928+ /* parent(right_lock.node) has to be processed before
12929+ * (right_lock.node) due to "parent-first" allocation order. */
12930+ ret =
12931+ check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12932+ right_lock.node);
12933+ if (ret)
12934+ break;
12935+ /* (re)allocate _after_ going upward */
12936+ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12937+ if (ret)
12938+ break;
12939+
12940+ if (should_terminate_squalloc(pos)) {
12941+ set_item_convert_count(pos, 0);
12942+ break;
12943+ }
12944+
12945+ /* advance the flush position to the right neighbor */
12946+ move_flush_pos(pos, &right_lock, &right_load, NULL);
12947+
12948+ ret = rapid_flush(pos);
12949+ if (ret)
12950+ break;
12951+ }
12952+
12953+ assert("edward-1006", !convert_data(pos) || !item_convert_data(pos));
12954+
12955+ done_load_count(&right_load);
12956+ done_lh(&right_lock);
12957+
12958+ /* This function indicates via pos whether to stop or go to twig or continue on current
12959+ * level. */
12960+ return ret;
12961+
12962+}
12963+
12964+/* Process nodes on leaf level until unformatted node or rightmost node in the
12965+ * slum reached. */
12966+static int handle_pos_on_leaf(flush_pos_t * pos)
12967+{
12968+ int ret;
12969+
12970+ assert("zam-845", pos->state == POS_ON_LEAF);
12971+
12972+ ret = handle_pos_on_formatted(pos);
12973+
12974+ if (ret == -E_NO_NEIGHBOR) {
12975+ /* cannot get right neighbor, go process extents. */
12976+ pos->state = POS_TO_TWIG;
12977+ return 0;
12978+ }
12979+
12980+ return ret;
12981+}
12982+
12983+/* Process slum on level > 1 */
12984+static int handle_pos_on_internal(flush_pos_t * pos)
12985+{
12986+ assert("zam-850", pos->state == POS_ON_INTERNAL);
12987+ return handle_pos_on_formatted(pos);
12988+}
12989+
12990+/* check whether squalloc should stop before processing given extent */
12991+static int squalloc_extent_should_stop(flush_pos_t * pos)
12992+{
12993+ assert("zam-869", item_is_extent(&pos->coord));
12994+
12995+ /* pos->child is a jnode handle_pos_on_extent() should start with
12996+ * instead of the first child of the first extent unit. */
12997+ if (pos->child) {
12998+ int prepped;
12999+
13000+ assert("vs-1383", jnode_is_unformatted(pos->child));
13001+ prepped = jnode_check_flushprepped(pos->child);
13002+ pos->pos_in_unit =
13003+ jnode_get_index(pos->child) -
13004+ extent_unit_index(&pos->coord);
13005+ assert("vs-1470",
13006+ pos->pos_in_unit < extent_unit_width(&pos->coord));
13007+ assert("nikita-3434",
13008+ ergo(extent_is_unallocated(&pos->coord),
13009+ pos->pos_in_unit == 0));
13010+ jput(pos->child);
13011+ pos->child = NULL;
13012+
13013+ return prepped;
13014+ }
13015+
13016+ pos->pos_in_unit = 0;
13017+ if (extent_is_unallocated(&pos->coord))
13018+ return 0;
13019+
13020+ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
13021+}
13022+
13023+/* Handle the case when regular reiser4 tree (znodes connected one to its
13024+ * neighbors by sibling pointers) is interrupted on leaf level by one or more
13025+ * unformatted nodes. By having a lock on twig level and use extent code
13026+ * routines to process unformatted nodes we swim around an irregular part of
13027+ * reiser4 tree. */
13028+static int handle_pos_on_twig(flush_pos_t * pos)
13029+{
13030+ int ret;
13031+
13032+ assert("zam-844", pos->state == POS_ON_EPOINT);
13033+ assert("zam-843", item_is_extent(&pos->coord));
13034+
13035+ /* We decide should we continue slum processing with current extent
13036+ unit: if leftmost child of current extent unit is flushprepped
13037+ (i.e. clean or already processed by flush) we stop squalloc(). There
13038+ is a fast check for unallocated extents which we assume contain all
13039+ not flushprepped nodes. */
13040+ /* FIXME: Here we implement simple check, we are only looking on the
13041+ leftmost child. */
13042+ ret = squalloc_extent_should_stop(pos);
13043+ if (ret != 0) {
13044+ pos_stop(pos);
13045+ return ret;
13046+ }
13047+
13048+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
13049+ && item_is_extent(&pos->coord)) {
13050+ ret = reiser4_alloc_extent(pos);
13051+ if (ret) {
13052+ break;
13053+ }
13054+ coord_next_unit(&pos->coord);
13055+ }
13056+
13057+ if (coord_is_after_rightmost(&pos->coord)) {
13058+ pos->state = POS_END_OF_TWIG;
13059+ return 0;
13060+ }
13061+ if (item_is_internal(&pos->coord)) {
13062+ pos->state = POS_TO_LEAF;
13063+ return 0;
13064+ }
13065+
13066+ assert("zam-860", item_is_extent(&pos->coord));
13067+
13068+ /* "slum" is over */
13069+ pos->state = POS_INVALID;
13070+ return 0;
13071+}
13072+
13073+/* When we about to return flush position from twig to leaf level we can process
13074+ * the right twig node or move position to the leaf. This processes right twig
13075+ * if it is possible and jump to leaf level if not. */
13076+static int handle_pos_end_of_twig(flush_pos_t * pos)
13077+{
13078+ int ret;
13079+ lock_handle right_lock;
13080+ load_count right_load;
13081+ coord_t at_right;
13082+ jnode *child = NULL;
13083+
13084+ assert("zam-848", pos->state == POS_END_OF_TWIG);
13085+ assert("zam-849", coord_is_after_rightmost(&pos->coord));
13086+
13087+ init_lh(&right_lock);
13088+ init_load_count(&right_load);
13089+
13090+ /* We get a lock on the right twig node even if it is not dirty because
13091+ * slum continues or discontinues on leaf level not on next twig. This
13092+ * lock on the right twig is needed for getting its leftmost child. */
13093+ ret =
13094+ reiser4_get_right_neighbor(&right_lock, pos->lock.node,
13095+ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
13096+ if (ret)
13097+ goto out;
13098+
13099+ ret = incr_load_count_znode(&right_load, right_lock.node);
13100+ if (ret)
13101+ goto out;
13102+
13103+ /* right twig could be not dirty */
13104+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
13105+ /* If right twig node is dirty we always attempt to squeeze it
13106+ * content to the left... */
13107+ became_dirty:
13108+ ret =
13109+ squeeze_right_twig_and_advance_coord(pos, right_lock.node);
13110+ if (ret <= 0) {
13111+ /* pos->coord is on internal item, go to leaf level, or
13112+ * we have an error which will be caught in squalloc() */
13113+ pos->state = POS_TO_LEAF;
13114+ goto out;
13115+ }
13116+
13117+ /* If right twig was squeezed completely we have to re-lock
13118+ * right twig. now it is done through the top-level squalloc
13119+ * routine. */
13120+ if (node_is_empty(right_lock.node))
13121+ goto out;
13122+
13123+ /* ... and prep it if it is not yet prepped */
13124+ if (!znode_check_flushprepped(right_lock.node)) {
13125+ /* As usual, process parent before ... */
13126+ ret =
13127+ check_parents_and_squalloc_upper_levels(pos,
13128+ pos->lock.
13129+ node,
13130+ right_lock.
13131+ node);
13132+ if (ret)
13133+ goto out;
13134+
13135+ /* ... processing the child */
13136+ ret =
13137+ lock_parent_and_allocate_znode(right_lock.node,
13138+ pos);
13139+ if (ret)
13140+ goto out;
13141+ }
13142+ } else {
13143+ coord_init_first_unit(&at_right, right_lock.node);
13144+
13145+ /* check first child of next twig, should we continue there ? */
13146+ ret = get_leftmost_child_of_unit(&at_right, &child);
13147+ if (ret || child == NULL || jnode_check_flushprepped(child)) {
13148+ pos_stop(pos);
13149+ goto out;
13150+ }
13151+
13152+ /* check clean twig for possible relocation */
13153+ if (!znode_check_flushprepped(right_lock.node)) {
13154+ ret =
13155+ reverse_relocate_check_dirty_parent(child,
13156+ &at_right, pos);
13157+ if (ret)
13158+ goto out;
13159+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
13160+ goto became_dirty;
13161+ }
13162+ }
13163+
13164+ assert("zam-875", znode_check_flushprepped(right_lock.node));
13165+
13166+ /* Update the preceder by a block number of just processed right twig
13167+ * node. The code above could miss the preceder updating because
13168+ * allocate_znode() could not be called for this node. */
13169+ pos->preceder.blk = *znode_get_block(right_lock.node);
13170+ check_preceder(pos->preceder.blk);
13171+
13172+ coord_init_first_unit(&at_right, right_lock.node);
13173+ assert("zam-868", coord_is_existing_unit(&at_right));
13174+
13175+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
13176+ move_flush_pos(pos, &right_lock, &right_load, &at_right);
13177+
13178+ out:
13179+ done_load_count(&right_load);
13180+ done_lh(&right_lock);
13181+
13182+ if (child)
13183+ jput(child);
13184+
13185+ return ret;
13186+}
13187+
13188+/* Move the pos->lock to leaf node pointed by pos->coord, check should we
13189+ * continue there. */
13190+static int handle_pos_to_leaf(flush_pos_t * pos)
13191+{
13192+ int ret;
13193+ lock_handle child_lock;
13194+ load_count child_load;
13195+ jnode *child;
13196+
13197+ assert("zam-846", pos->state == POS_TO_LEAF);
13198+ assert("zam-847", item_is_internal(&pos->coord));
13199+
13200+ init_lh(&child_lock);
13201+ init_load_count(&child_load);
13202+
13203+ ret = get_leftmost_child_of_unit(&pos->coord, &child);
13204+ if (ret)
13205+ return ret;
13206+ if (child == NULL) {
13207+ pos_stop(pos);
13208+ return 0;
13209+ }
13210+
13211+ if (jnode_check_flushprepped(child)) {
13212+ pos->state = POS_INVALID;
13213+ goto out;
13214+ }
13215+
13216+ ret =
13217+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
13218+ ZNODE_LOCK_LOPRI);
13219+ if (ret)
13220+ goto out;
13221+
13222+ ret = incr_load_count_znode(&child_load, JZNODE(child));
13223+ if (ret)
13224+ goto out;
13225+
13226+ ret = allocate_znode(JZNODE(child), &pos->coord, pos);
13227+ if (ret)
13228+ goto out;
13229+
13230+ /* move flush position to leaf level */
13231+ pos->state = POS_ON_LEAF;
13232+ move_flush_pos(pos, &child_lock, &child_load, NULL);
13233+
13234+ if (node_is_empty(JZNODE(child))) {
13235+ ret = delete_empty_node(JZNODE(child));
13236+ pos->state = POS_INVALID;
13237+ }
13238+ out:
13239+ done_load_count(&child_load);
13240+ done_lh(&child_lock);
13241+ jput(child);
13242+
13243+ return ret;
13244+}
13245+
13246+/* move pos from leaf to twig, and move lock from leaf to twig. */
13247+/* Move pos->lock to upper (twig) level */
13248+static int handle_pos_to_twig(flush_pos_t * pos)
13249+{
13250+ int ret;
13251+
13252+ lock_handle parent_lock;
13253+ load_count parent_load;
13254+ coord_t pcoord;
13255+
13256+ assert("zam-852", pos->state == POS_TO_TWIG);
13257+
13258+ init_lh(&parent_lock);
13259+ init_load_count(&parent_load);
13260+
13261+ ret =
13262+ reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
13263+ if (ret)
13264+ goto out;
13265+
13266+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
13267+ if (ret)
13268+ goto out;
13269+
13270+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
13271+ if (ret)
13272+ goto out;
13273+
13274+ assert("zam-870", item_is_internal(&pcoord));
13275+ coord_next_item(&pcoord);
13276+
13277+ if (coord_is_after_rightmost(&pcoord))
13278+ pos->state = POS_END_OF_TWIG;
13279+ else if (item_is_extent(&pcoord))
13280+ pos->state = POS_ON_EPOINT;
13281+ else {
13282+ /* Here we understand that getting -E_NO_NEIGHBOR in
13283+ * handle_pos_on_leaf() was because of just a reaching edge of
13284+ * slum */
13285+ pos_stop(pos);
13286+ goto out;
13287+ }
13288+
13289+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
13290+
13291+ out:
13292+ done_load_count(&parent_load);
13293+ done_lh(&parent_lock);
13294+
13295+ return ret;
13296+}
13297+
13298+typedef int (*pos_state_handle_t) (flush_pos_t *);
13299+static pos_state_handle_t flush_pos_handlers[] = {
13300+ /* process formatted nodes on leaf level, keep lock on a leaf node */
13301+ [POS_ON_LEAF] = handle_pos_on_leaf,
13302+ /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
13303+ * being processed */
13304+ [POS_ON_EPOINT] = handle_pos_on_twig,
13305+ /* move a lock from leaf node to its parent for further processing of unformatted nodes */
13306+ [POS_TO_TWIG] = handle_pos_to_twig,
13307+ /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
13308+ * pos->coord points to the leaf node we jump to */
13309+ [POS_TO_LEAF] = handle_pos_to_leaf,
13310+ /* after processing last extent in the twig node, attempting to shift items from the twigs
13311+ * right neighbor and process them while shifting */
13312+ [POS_END_OF_TWIG] = handle_pos_end_of_twig,
13313+ /* process formatted nodes on internal level, keep lock on an internal node */
13314+ [POS_ON_INTERNAL] = handle_pos_on_internal
13315+};
13316+
13317+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
13318+ * encrypt) nodes and their ancestors in "parent-first" order */
13319+static int squalloc(flush_pos_t * pos)
13320+{
13321+ int ret = 0;
13322+
13323+ /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
13324+ * greater CPU efficiency? Measure and see.... -Hans */
13325+ while (pos_valid(pos)) {
13326+ ret = flush_pos_handlers[pos->state] (pos);
13327+ if (ret < 0)
13328+ break;
13329+
13330+ ret = rapid_flush(pos);
13331+ if (ret)
13332+ break;
13333+ }
13334+
13335+ /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos*
13336+ routines, -E_NO_NEIGHBOR means that slum edge was reached */
13337+ if (ret > 0 || ret == -E_NO_NEIGHBOR)
13338+ ret = 0;
13339+
13340+ return ret;
13341+}
13342+
13343+static void update_ldkey(znode * node)
13344+{
13345+ reiser4_key ldkey;
13346+
13347+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
13348+ if (node_is_empty(node))
13349+ return;
13350+
13351+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
13352+}
13353+
13354+/* this is to be called after calling of shift node's method to shift data from @right to
13355+ @left. It sets left delimiting keys of @left and @right to keys of first items of @left
13356+ and @right correspondingly and sets right delimiting key of @left to first key of @right */
13357+static void update_znode_dkeys(znode * left, znode * right)
13358+{
13359+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
13360+ assert("vs-1629", (znode_is_write_locked(left) &&
13361+ znode_is_write_locked(right)));
13362+
13363+ /* we need to update left delimiting of left if it was empty before shift */
13364+ update_ldkey(left);
13365+ update_ldkey(right);
13366+ if (node_is_empty(right))
13367+ znode_set_rd_key(left, znode_get_rd_key(right));
13368+ else
13369+ znode_set_rd_key(left, znode_get_ld_key(right));
13370+}
13371+
13372+/* try to shift everything from @right to @left. If everything was shifted -
13373+ @right is removed from the tree. Result is the number of bytes shifted. */
13374+static int
13375+shift_everything_left(znode * right, znode * left, carry_level * todo)
13376+{
13377+ coord_t from;
13378+ node_plugin *nplug;
13379+ carry_plugin_info info;
13380+
13381+ coord_init_after_last_item(&from, right);
13382+
13383+ nplug = node_plugin_by_node(right);
13384+ info.doing = NULL;
13385+ info.todo = todo;
13386+ return nplug->shift(&from, left, SHIFT_LEFT,
13387+ 1 /* delete @right if it becomes empty */ ,
13388+ 1
13389+ /* move coord @from to node @left if everything will be shifted */
13390+ ,
13391+ &info);
13392+}
13393+
13394+/* Shift as much as possible from @right to @left using the memcpy-optimized
13395+ shift_everything_left. @left and @right are formatted neighboring nodes on
13396+ leaf level. */
13397+static int squeeze_right_non_twig(znode * left, znode * right)
13398+{
13399+ int ret;
13400+ carry_pool *pool;
13401+ carry_level *todo;
13402+
13403+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
13404+
13405+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
13406+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
13407+ return SQUEEZE_TARGET_FULL;
13408+
13409+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
13410+ if (IS_ERR(pool))
13411+ return PTR_ERR(pool);
13412+ todo = (carry_level *) (pool + 1);
13413+ init_carry_level(todo, pool);
13414+
13415+ ret = shift_everything_left(right, left, todo);
13416+ if (ret > 0) {
13417+ /* something was shifted */
13418+ reiser4_tree *tree;
13419+ __u64 grabbed;
13420+
13421+ znode_make_dirty(left);
13422+ znode_make_dirty(right);
13423+
13424+ /* update delimiting keys of nodes which participated in
13425+ shift. FIXME: it would be better to have this in shift
13426+ node's operation. But it can not be done there. Nobody
13427+ remembers why, though */
13428+ tree = znode_get_tree(left);
13429+ write_lock_dk(tree);
13430+ update_znode_dkeys(left, right);
13431+ write_unlock_dk(tree);
13432+
13433+ /* Carry is called to update delimiting key and, maybe, to remove empty
13434+ node. */
13435+ grabbed = get_current_context()->grabbed_blocks;
13436+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13437+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13438+ ret = reiser4_carry(todo, NULL /* previous level */ );
13439+ grabbed2free_mark(grabbed);
13440+ } else {
13441+ /* Shifting impossible, we return appropriate result code */
13442+ ret =
13443+ node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13444+ SQUEEZE_TARGET_FULL;
13445+ }
13446+
13447+ done_carry_pool(pool);
13448+
13449+ return ret;
13450+}
13451+
13452+#if REISER4_DEBUG
13453+static int sibling_link_is_ok(const znode *left, const znode *right)
13454+{
13455+ int result;
13456+
13457+ read_lock_tree(znode_get_tree(left));
13458+ result = (left->right == right && left == right->left);
13459+ read_unlock_tree(znode_get_tree(left));
13460+ return result;
13461+}
13462+#endif
13463+
13464+/* Shift first unit of first item if it is an internal one. Return
13465+ SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13466+ SUBTREE_MOVED. */
13467+static int shift_one_internal_unit(znode * left, znode * right)
13468+{
13469+ int ret;
13470+ carry_pool *pool;
13471+ carry_level *todo;
13472+ coord_t *coord;
13473+ carry_plugin_info *info;
13474+ int size, moved;
13475+
13476+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13477+ assert("nikita-2435", znode_is_write_locked(left));
13478+ assert("nikita-2436", znode_is_write_locked(right));
13479+ assert("nikita-2434", sibling_link_is_ok(left, right));
13480+
13481+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13482+ sizeof(*coord) + sizeof(*info)
13483+#if REISER4_DEBUG
13484+ + sizeof(*coord) + 2 * sizeof(reiser4_key)
13485+#endif
13486+ );
13487+ if (IS_ERR(pool))
13488+ return PTR_ERR(pool);
13489+ todo = (carry_level *) (pool + 1);
13490+ init_carry_level(todo, pool);
13491+
13492+ coord = (coord_t *) (todo + 3);
13493+ coord_init_first_unit(coord, right);
13494+ info = (carry_plugin_info *) (coord + 1);
13495+
13496+#if REISER4_DEBUG
13497+ if (!node_is_empty(left)) {
13498+ coord_t *last;
13499+ reiser4_key *right_key;
13500+ reiser4_key *left_key;
13501+
13502+ last = (coord_t *) (info + 1);
13503+ right_key = (reiser4_key *) (last + 1);
13504+ left_key = right_key + 1;
13505+ coord_init_last_unit(last, left);
13506+
13507+ assert("nikita-2463",
13508+ keyle(item_key_by_coord(last, left_key),
13509+ item_key_by_coord(coord, right_key)));
13510+ }
13511+#endif
13512+
13513+ assert("jmacd-2007", item_is_internal(coord));
13514+
13515+ size = item_length_by_coord(coord);
13516+ info->todo = todo;
13517+ info->doing = NULL;
13518+
13519+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13520+ 1
13521+ /* delete @right if it becomes empty */
13522+ ,
13523+ 0
13524+ /* do not move coord @coord to node @left */
13525+ ,
13526+ info);
13527+
13528+ /* If shift returns positive, then we shifted the item. */
13529+ assert("vs-423", ret <= 0 || size == ret);
13530+ moved = (ret > 0);
13531+
13532+ if (moved) {
13533+ /* something was moved */
13534+ reiser4_tree *tree;
13535+ int grabbed;
13536+
13537+ znode_make_dirty(left);
13538+ znode_make_dirty(right);
13539+ tree = znode_get_tree(left);
13540+ write_lock_dk(tree);
13541+ update_znode_dkeys(left, right);
13542+ write_unlock_dk(tree);
13543+
13544+ /* reserve space for delimiting keys after shifting */
13545+ grabbed = get_current_context()->grabbed_blocks;
13546+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13547+ assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13548+
13549+ ret = reiser4_carry(todo, NULL /* previous level */ );
13550+ grabbed2free_mark(grabbed);
13551+ }
13552+
13553+ done_carry_pool(pool);
13554+
13555+ if (ret != 0) {
13556+ /* Shift or carry operation failed. */
13557+ assert("jmacd-7325", ret < 0);
13558+ return ret;
13559+ }
13560+
13561+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13562+}
13563+
13564+/* Make the final relocate/wander decision during forward parent-first squalloc for a
13565+ znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13566+static int
13567+allocate_znode_loaded(znode * node,
13568+ const coord_t * parent_coord, flush_pos_t * pos)
13569+{
13570+ int ret;
13571+ reiser4_super_info_data *sbinfo = get_current_super_private();
13572+ /* FIXME(D): We have the node write-locked and should have checked for !
13573+ allocated() somewhere before reaching this point, but there can be a race, so
13574+ this assertion is bogus. */
13575+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13576+ assert("jmacd-7988", znode_is_write_locked(node));
13577+ assert("jmacd-7989", coord_is_invalid(parent_coord)
13578+ || znode_is_write_locked(parent_coord->node));
13579+
13580+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13581+ znode_is_root(node) ||
13582+ /* We have enough nodes to relocate no matter what. */
13583+ (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13584+ /* No need to decide with new nodes, they are treated the same as
13585+ relocate. If the root node is dirty, relocate. */
13586+ if (pos->preceder.blk == 0) {
13587+ /* preceder is unknown and we have decided to relocate node --
13588+ using of default value for search start is better than search
13589+ from block #0. */
13590+ get_blocknr_hint_default(&pos->preceder.blk);
13591+ check_preceder(pos->preceder.blk);
13592+ }
13593+
13594+ goto best_reloc;
13595+
13596+ } else if (pos->preceder.blk == 0) {
13597+ /* If we don't know the preceder, leave it where it is. */
13598+ jnode_make_wander(ZJNODE(node));
13599+ } else {
13600+ /* Make a decision based on block distance. */
13601+ reiser4_block_nr dist;
13602+ reiser4_block_nr nblk = *znode_get_block(node);
13603+
13604+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13605+ assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13606+ assert("jmacd-6174", pos->preceder.blk != 0);
13607+
13608+ if (pos->preceder.blk == nblk - 1) {
13609+ /* Ideal. */
13610+ jnode_make_wander(ZJNODE(node));
13611+ } else {
13612+
13613+ dist =
13614+ (nblk <
13615+ pos->preceder.blk) ? (pos->preceder.blk -
13616+ nblk) : (nblk -
13617+ pos->preceder.blk);
13618+
13619+ /* See if we can find a closer block (forward direction only). */
13620+ pos->preceder.max_dist =
13621+ min((reiser4_block_nr) sbinfo->flush.
13622+ relocate_distance, dist);
13623+ pos->preceder.level = znode_get_level(node);
13624+
13625+ ret = allocate_znode_update(node, parent_coord, pos);
13626+
13627+ pos->preceder.max_dist = 0;
13628+
13629+ if (ret && (ret != -ENOSPC))
13630+ return ret;
13631+
13632+ if (ret == 0) {
13633+ /* Got a better allocation. */
13634+ znode_make_reloc(node, pos->fq);
13635+ } else if (dist < sbinfo->flush.relocate_distance) {
13636+ /* The present allocation is good enough. */
13637+ jnode_make_wander(ZJNODE(node));
13638+ } else {
13639+ /* Otherwise, try to relocate to the best position. */
13640+ best_reloc:
13641+ ret =
13642+ allocate_znode_update(node, parent_coord,
13643+ pos);
13644+ if (ret != 0)
13645+ return ret;
13646+
13647+ /* set JNODE_RELOC bit _after_ node gets allocated */
13648+ znode_make_reloc(node, pos->fq);
13649+ }
13650+ }
13651+ }
13652+
13653+ /* This is the new preceder. */
13654+ pos->preceder.blk = *znode_get_block(node);
13655+ check_preceder(pos->preceder.blk);
13656+ pos->alloc_cnt += 1;
13657+
13658+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13659+
13660+ return 0;
13661+}
13662+
13663+static int
13664+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13665+{
13666+ /*
13667+ * perform znode allocation with znode pinned in memory to avoid races
13668+ * with asynchronous emergency flush (which plays with
13669+ * JNODE_FLUSH_RESERVED bit).
13670+ */
13671+ return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13672+}
13673+
13674+/* A subroutine of allocate_znode, this is called first to see if there is a close
13675+ position to relocate to. It may return ENOSPC if there is no close position. If there
13676+ is no close position it may not relocate. This takes care of updating the parent node
13677+ with the relocated block address. */
13678+static int
13679+allocate_znode_update(znode * node, const coord_t * parent_coord,
13680+ flush_pos_t * pos)
13681+{
13682+ int ret;
13683+ reiser4_block_nr blk;
13684+ lock_handle uber_lock;
13685+ int flush_reserved_used = 0;
13686+ int grabbed;
13687+ reiser4_context *ctx;
13688+ reiser4_super_info_data *sbinfo;
13689+
13690+ init_lh(&uber_lock);
13691+
13692+ ctx = get_current_context();
13693+ sbinfo = get_super_private(ctx->super);
13694+
13695+ grabbed = ctx->grabbed_blocks;
13696+
13697+ /* discard e-flush allocation */
13698+ ret = zload(node);
13699+ if (ret)
13700+ return ret;
13701+
13702+ if (ZF_ISSET(node, JNODE_CREATED)) {
13703+ assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13704+ pos->preceder.block_stage = BLOCK_UNALLOCATED;
13705+ } else {
13706+ pos->preceder.block_stage = BLOCK_GRABBED;
13707+
13708+ /* The disk space for relocating the @node is already reserved in "flush reserved"
13709+ * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
13710+ * space from whole disk not from only 95%). */
13711+ if (znode_get_level(node) == LEAF_LEVEL) {
13712+ /*
13713+ * earlier (during do_jnode_make_dirty()) we decided
13714+ * that @node can possibly go into overwrite set and
13715+ * reserved block for its wandering location.
13716+ */
13717+ txn_atom *atom = get_current_atom_locked();
13718+ assert("nikita-3449",
13719+ ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13720+ flush_reserved2grabbed(atom, (__u64) 1);
13721+ spin_unlock_atom(atom);
13722+ /*
13723+ * we are trying to move node into relocate
13724+ * set. Allocation of relocated position "uses"
13725+ * reserved block.
13726+ */
13727+ ZF_CLR(node, JNODE_FLUSH_RESERVED);
13728+ flush_reserved_used = 1;
13729+ } else {
13730+ ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13731+ if (ret != 0)
13732+ goto exit;
13733+ }
13734+ }
13735+
13736+ /* We may do not use 5% of reserved disk space here and flush will not pack tightly. */
13737+ ret = reiser4_alloc_block(&pos->preceder, &blk,
13738+ BA_FORMATTED | BA_PERMANENT);
13739+ if (ret)
13740+ goto exit;
13741+
13742+ if (!ZF_ISSET(node, JNODE_CREATED) &&
13743+ (ret =
13744+ reiser4_dealloc_block(znode_get_block(node), 0,
13745+ BA_DEFER | BA_FORMATTED)))
13746+ goto exit;
13747+
13748+ if (likely(!znode_is_root(node))) {
13749+ item_plugin *iplug;
13750+
13751+ iplug = item_plugin_by_coord(parent_coord);
13752+ assert("nikita-2954", iplug->f.update != NULL);
13753+ iplug->f.update(parent_coord, &blk);
13754+
13755+ znode_make_dirty(parent_coord->node);
13756+
13757+ } else {
13758+ reiser4_tree *tree = znode_get_tree(node);
13759+ znode *uber;
13760+
13761+ /* We take a longterm lock on the fake node in order to change
13762+ the root block number. This may cause atom fusion. */
13763+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13764+ &uber_lock);
13765+ /* The fake node cannot be deleted, and we must have priority
13766+ here, and may not be confused with ENOSPC. */
13767+ assert("jmacd-74412",
13768+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13769+
13770+ if (ret)
13771+ goto exit;
13772+
13773+ uber = uber_lock.node;
13774+
13775+ write_lock_tree(tree);
13776+ tree->root_block = blk;
13777+ write_unlock_tree(tree);
13778+
13779+ znode_make_dirty(uber);
13780+ }
13781+
13782+ ret = znode_rehash(node, &blk);
13783+ exit:
13784+ if (ret) {
13785+ /* Get flush reserved block back if something fails, because
13786+ * callers assume that on error block wasn't relocated and its
13787+ * flush reserved block wasn't used. */
13788+ if (flush_reserved_used) {
13789+ /*
13790+ * ok, we failed to move node into relocate
13791+ * set. Restore status quo.
13792+ */
13793+ grabbed2flush_reserved((__u64) 1);
13794+ ZF_SET(node, JNODE_FLUSH_RESERVED);
13795+ }
13796+ }
13797+ zrelse(node);
13798+ done_lh(&uber_lock);
13799+ grabbed2free_mark(grabbed);
13800+ return ret;
13801+}
13802+
13803+/* JNODE INTERFACE */
13804+
13805+/* Lock a node (if formatted) and then get its parent locked, set the child's
13806+ coordinate in the parent. If the child is the root node, the above_root
13807+ znode is returned but the coord is not set. This function may cause atom
13808+ fusion, but it is only used for read locks (at this point) and therefore
13809+ fusion only occurs when the parent is already dirty. */
13810+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13811+ pointer in jnodes. */
13812+static int
13813+jnode_lock_parent_coord(jnode * node,
13814+ coord_t * coord,
13815+ lock_handle * parent_lh,
13816+ load_count * parent_zh,
13817+ znode_lock_mode parent_mode, int try)
13818+{
13819+ int ret;
13820+
13821+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13822+ assert("edward-54", jnode_is_unformatted(node)
13823+ || znode_is_any_locked(JZNODE(node)));
13824+
13825+ if (!jnode_is_znode(node)) {
13826+ reiser4_key key;
13827+ tree_level stop_level = TWIG_LEVEL;
13828+ lookup_bias bias = FIND_EXACT;
13829+
13830+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13831+
13832+ /* The case when node is not znode, but can have parent coord
13833+ (unformatted node, node which represents cluster page,
13834+ etc..). Generate a key for the appropriate entry, search
13835+ in the tree using coord_by_key, which handles locking for
13836+ us. */
13837+
13838+ /*
13839+ * nothing is locked at this moment, so, nothing prevents
13840+ * concurrent truncate from removing jnode from inode. To
13841+ * prevent this spin-lock jnode. jnode can be truncated just
13842+ * after call to the jnode_build_key(), but this is ok,
13843+ * because coord_by_key() will just fail to find appropriate
13844+ * extent.
13845+ */
13846+ spin_lock_jnode(node);
13847+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13848+ jnode_build_key(node, &key);
13849+ ret = 0;
13850+ } else
13851+ ret = RETERR(-ENOENT);
13852+ spin_unlock_jnode(node);
13853+
13854+ if (ret != 0)
13855+ return ret;
13856+
13857+ if (jnode_is_cluster_page(node))
13858+ stop_level = LEAF_LEVEL;
13859+
13860+ assert("jmacd-1812", coord != NULL);
13861+
13862+ ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13863+ parent_mode, bias, stop_level, stop_level,
13864+ CBK_UNIQUE, NULL /*ra_info */ );
13865+ switch (ret) {
13866+ case CBK_COORD_NOTFOUND:
13867+ assert("edward-1038",
13868+ ergo(jnode_is_cluster_page(node),
13869+ JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13870+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13871+ warning("nikita-3177", "Parent not found");
13872+ return ret;
13873+ case CBK_COORD_FOUND:
13874+ if (coord->between != AT_UNIT) {
13875+ /* FIXME: comment needed */
13876+ done_lh(parent_lh);
13877+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13878+ warning("nikita-3178",
13879+ "Found but not happy: %i",
13880+ coord->between);
13881+ }
13882+ return RETERR(-ENOENT);
13883+ }
13884+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13885+ if (ret != 0)
13886+ return ret;
13887+ /* if (jnode_is_cluster_page(node)) {
13888+ races with write() are possible
13889+ check_child_cluster (parent_lh->node);
13890+ }
13891+ */
13892+ break;
13893+ default:
13894+ return ret;
13895+ }
13896+
13897+ } else {
13898+ int flags;
13899+ znode *z;
13900+
13901+ z = JZNODE(node);
13902+ /* Formatted node case: */
13903+ assert("jmacd-2061", !znode_is_root(z));
13904+
13905+ flags = GN_ALLOW_NOT_CONNECTED;
13906+ if (try)
13907+ flags |= GN_TRY_LOCK;
13908+
13909+ ret =
13910+ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13911+ if (ret != 0)
13912+ /* -E_REPEAT is ok here, it is handled by the caller. */
13913+ return ret;
13914+
13915+ /* Make the child's position "hint" up-to-date. (Unless above
13916+ root, which caller must check.) */
13917+ if (coord != NULL) {
13918+
13919+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
13920+ if (ret != 0) {
13921+ warning("jmacd-976812386",
13922+ "incr_load_count_znode failed: %d",
13923+ ret);
13924+ return ret;
13925+ }
13926+
13927+ ret = find_child_ptr(parent_lh->node, z, coord);
13928+ if (ret != 0) {
13929+ warning("jmacd-976812",
13930+ "find_child_ptr failed: %d", ret);
13931+ return ret;
13932+ }
13933+ }
13934+ }
13935+
13936+ return 0;
13937+}
13938+
13939+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13940+ If there is no next neighbor or the neighbor is not in memory or if there is a
13941+ neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13942+ In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */
13943+static int neighbor_in_slum(znode * node, /* starting point */
13944+ lock_handle * lock, /* lock on starting point */
13945+ sideof side, /* left or right direction we seek the next node in */
13946+ znode_lock_mode mode, /* kind of lock we want */
13947+ int check_dirty)
13948+{ /* true if the neighbor should be dirty */
13949+ int ret;
13950+
13951+ assert("jmacd-6334", znode_is_connected(node));
13952+
13953+ ret =
13954+ reiser4_get_neighbor(lock, node, mode,
13955+ GN_SAME_ATOM | (side ==
13956+ LEFT_SIDE ? GN_GO_LEFT : 0));
13957+
13958+ if (ret) {
13959+ /* May return -ENOENT or -E_NO_NEIGHBOR. */
13960+ /* FIXME(C): check EINVAL, E_DEADLOCK */
13961+ if (ret == -ENOENT) {
13962+ ret = RETERR(-E_NO_NEIGHBOR);
13963+ }
13964+
13965+ return ret;
13966+ }
13967+ if (!check_dirty)
13968+ return 0;
13969+ /* Check dirty bit of locked znode, no races here */
13970+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13971+ return 0;
13972+
13973+ done_lh(lock);
13974+ return RETERR(-E_NO_NEIGHBOR);
13975+}
13976+
13977+/* Return true if two znodes have the same parent. This is called with both nodes
13978+ write-locked (for squeezing) so no tree lock is needed. */
13979+static int znode_same_parents(znode * a, znode * b)
13980+{
13981+ int result;
13982+
13983+ assert("jmacd-7011", znode_is_write_locked(a));
13984+ assert("jmacd-7012", znode_is_write_locked(b));
13985+
13986+ /* We lock the whole tree for this check.... I really don't like whole tree
13987+ * locks... -Hans */
13988+ read_lock_tree(znode_get_tree(a));
13989+ result = (znode_parent(a) == znode_parent(b));
13990+ read_unlock_tree(znode_get_tree(a));
13991+ return result;
13992+}
13993+
13994+/* FLUSH SCAN */
13995+
13996+/* Initialize the flush_scan data structure. */
13997+static void scan_init(flush_scan * scan)
13998+{
13999+ memset(scan, 0, sizeof(*scan));
14000+ init_lh(&scan->node_lock);
14001+ init_lh(&scan->parent_lock);
14002+ init_load_count(&scan->parent_load);
14003+ init_load_count(&scan->node_load);
14004+ coord_init_invalid(&scan->parent_coord, NULL);
14005+}
14006+
14007+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
14008+static void scan_done(flush_scan * scan)
14009+{
14010+ done_load_count(&scan->node_load);
14011+ if (scan->node != NULL) {
14012+ jput(scan->node);
14013+ scan->node = NULL;
14014+ }
14015+ done_load_count(&scan->parent_load);
14016+ done_lh(&scan->parent_lock);
14017+ done_lh(&scan->node_lock);
14018+}
14019+
14020+/* Returns true if flush scanning is finished. */
14021+int reiser4_scan_finished(flush_scan * scan)
14022+{
14023+ return scan->stop || (scan->direction == RIGHT_SIDE &&
14024+ scan->count >= scan->max_count);
14025+}
14026+
14027+/* Return true if the scan should continue to the @tonode. True if the node meets the
14028+ same_slum_check condition. If not, deref the "left" node and stop the scan. */
14029+int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
14030+{
14031+ int go = same_slum_check(scan->node, tonode, 1, 0);
14032+
14033+ if (!go) {
14034+ scan->stop = 1;
14035+ jput(tonode);
14036+ }
14037+
14038+ return go;
14039+}
14040+
14041+/* Set the current scan->node, refcount it, increment count by the @add_count (number to
14042+ count, e.g., skipped unallocated nodes), deref previous current, and copy the current
14043+ parent coordinate. */
14044+int
14045+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
14046+ const coord_t * parent)
14047+{
14048+ /* Release the old references, take the new reference. */
14049+ done_load_count(&scan->node_load);
14050+
14051+ if (scan->node != NULL) {
14052+ jput(scan->node);
14053+ }
14054+ scan->node = node;
14055+ scan->count += add_count;
14056+
14057+ /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could
14058+ delay this update step until it finishes and update the parent_coord only once.
14059+ It did that before, but there was a bug and this was the easiest way to make it
14060+ correct. */
14061+ if (parent != NULL) {
14062+ coord_dup(&scan->parent_coord, parent);
14063+ }
14064+
14065+ /* Failure may happen at the incr_load_count call, but the caller can assume the reference
14066+ is safely taken. */
14067+ return incr_load_count_jnode(&scan->node_load, node);
14068+}
14069+
14070+/* Return true if scanning in the leftward direction. */
14071+int reiser4_scanning_left(flush_scan * scan)
14072+{
14073+ return scan->direction == LEFT_SIDE;
14074+}
14075+
14076+/* Performs leftward scanning starting from either kind of node. Counts the starting
14077+ node. The right-scan object is passed in for the left-scan in order to copy the parent
14078+ of an unformatted starting position. This way we avoid searching for the unformatted
14079+ node's parent when scanning in each direction. If we search for the parent once it is
14080+ set in both scan objects. The limit parameter tells flush-scan when to stop.
14081+
14082+ Rapid scanning is used only during scan_left, where we are interested in finding the
14083+ 'leftpoint' where we begin flushing. We are interested in stopping at the left child
14084+ of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
14085+ problem is finding a way to flush only those nodes without unallocated children, and it
14086+ is difficult to solve in the bottom-up flushing algorithm we are currently using. The
14087+ problem can be solved by scanning left at every level as we go upward, but this would
14088+ basically bring us back to using a top-down allocation strategy, which we already tried
14089+ (see BK history from May 2002), and has a different set of problems. The top-down
14090+ strategy makes avoiding unallocated children easier, but makes it difficult to
14091+ propertly flush dirty children with clean parents that would otherwise stop the
14092+ top-down flush, only later to dirty the parent once the children are flushed. So we
14093+ solve the problem in the bottom-up algorithm with a special case for twigs and leaves
14094+ only.
14095+
14096+ The first step in solving the problem is this rapid leftward scan. After we determine
14097+ that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
14098+ are no longer interested in the exact count, we are only interested in finding a the
14099+ best place to start the flush. We could choose one of two possibilities:
14100+
14101+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
14102+ This requires checking one leaf per rapid-scan twig
14103+
14104+ 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
14105+ to the left. This requires checking possibly all of the in-memory children of each
14106+ twig during the rapid scan.
14107+
14108+ For now we implement the first policy.
14109+*/
14110+static int
14111+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
14112+{
14113+ int ret = 0;
14114+
14115+ scan->max_count = limit;
14116+ scan->direction = LEFT_SIDE;
14117+
14118+ ret = scan_set_current(scan, jref(node), 1, NULL);
14119+ if (ret != 0) {
14120+ return ret;
14121+ }
14122+
14123+ ret = scan_common(scan, right);
14124+ if (ret != 0) {
14125+ return ret;
14126+ }
14127+
14128+ /* Before rapid scanning, we need a lock on scan->node so that we can get its
14129+ parent, only if formatted. */
14130+ if (jnode_is_znode(scan->node)) {
14131+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
14132+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
14133+ }
14134+
14135+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
14136+ return ret;
14137+}
14138+
14139+/* Performs rightward scanning... Does not count the starting node. The limit parameter
14140+ is described in scan_left. If the starting node is unformatted then the
14141+ parent_coord was already set during scan_left. The rapid_after parameter is not used
14142+ during right-scanning.
14143+
14144+ scan_right is only called if the scan_left operation does not count at least
14145+ FLUSH_RELOCATE_THRESHOLD nodes for flushing. Otherwise, the limit parameter is set to
14146+ the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
14147+ scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
14148+static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
14149+{
14150+ int ret;
14151+
14152+ scan->max_count = limit;
14153+ scan->direction = RIGHT_SIDE;
14154+
14155+ ret = scan_set_current(scan, jref(node), 0, NULL);
14156+ if (ret != 0) {
14157+ return ret;
14158+ }
14159+
14160+ return scan_common(scan, NULL);
14161+}
14162+
14163+/* Common code to perform left or right scanning. */
14164+static int scan_common(flush_scan * scan, flush_scan * other)
14165+{
14166+ int ret;
14167+
14168+ assert("nikita-2376", scan->node != NULL);
14169+ assert("edward-54", jnode_is_unformatted(scan->node)
14170+ || jnode_is_znode(scan->node));
14171+
14172+ /* Special case for starting at an unformatted node. Optimization: we only want
14173+ to search for the parent (which requires a tree traversal) once. Obviously, we
14174+ shouldn't have to call it once for the left scan and once for the right scan.
14175+ For this reason, if we search for the parent during scan-left we then duplicate
14176+ the coord/lock/load into the scan-right object. */
14177+ if (jnode_is_unformatted(scan->node)) {
14178+ ret = scan_unformatted(scan, other);
14179+ if (ret != 0)
14180+ return ret;
14181+ }
14182+ /* This loop expects to start at a formatted position and performs chaining of
14183+ formatted regions */
14184+ while (!reiser4_scan_finished(scan)) {
14185+
14186+ ret = scan_formatted(scan);
14187+ if (ret != 0) {
14188+ return ret;
14189+ }
14190+ }
14191+
14192+ return 0;
14193+}
14194+
14195+static int scan_unformatted(flush_scan * scan, flush_scan * other)
14196+{
14197+ int ret = 0;
14198+ int try = 0;
14199+
14200+ if (!coord_is_invalid(&scan->parent_coord))
14201+ goto scan;
14202+
14203+ /* set parent coord from */
14204+ if (!jnode_is_unformatted(scan->node)) {
14205+ /* formatted position */
14206+
14207+ lock_handle lock;
14208+ assert("edward-301", jnode_is_znode(scan->node));
14209+ init_lh(&lock);
14210+
14211+ /*
14212+ * when flush starts from unformatted node, first thing it
14213+ * does is tree traversal to find formatted parent of starting
14214+ * node. This parent is then kept lock across scans to the
14215+ * left and to the right. This means that during scan to the
14216+ * left we cannot take left-ward lock, because this is
14217+ * dead-lock prone. So, if we are scanning to the left and
14218+ * there is already lock held by this thread,
14219+ * jnode_lock_parent_coord() should use try-lock.
14220+ */
14221+ try = reiser4_scanning_left(scan)
14222+ && !lock_stack_isclean(get_current_lock_stack());
14223+ /* Need the node locked to get the parent lock, We have to
14224+ take write lock since there is at least one call path
14225+ where this znode is already write-locked by us. */
14226+ ret =
14227+ longterm_lock_znode(&lock, JZNODE(scan->node),
14228+ ZNODE_WRITE_LOCK,
14229+ reiser4_scanning_left(scan) ?
14230+ ZNODE_LOCK_LOPRI :
14231+ ZNODE_LOCK_HIPRI);
14232+ if (ret != 0)
14233+ /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
14234+ scanned too far and can't back out, just start over. */
14235+ return ret;
14236+
14237+ ret = jnode_lock_parent_coord(scan->node,
14238+ &scan->parent_coord,
14239+ &scan->parent_lock,
14240+ &scan->parent_load,
14241+ ZNODE_WRITE_LOCK, try);
14242+
14243+ /* FIXME(C): check EINVAL, E_DEADLOCK */
14244+ done_lh(&lock);
14245+ if (ret == -E_REPEAT) {
14246+ scan->stop = 1;
14247+ return 0;
14248+ }
14249+ if (ret)
14250+ return ret;
14251+
14252+ } else {
14253+ /* unformatted position */
14254+
14255+ ret =
14256+ jnode_lock_parent_coord(scan->node, &scan->parent_coord,
14257+ &scan->parent_lock,
14258+ &scan->parent_load,
14259+ ZNODE_WRITE_LOCK, try);
14260+
14261+ if (IS_CBKERR(ret))
14262+ return ret;
14263+
14264+ if (ret == CBK_COORD_NOTFOUND)
14265+ /* FIXME(C): check EINVAL, E_DEADLOCK */
14266+ return ret;
14267+
14268+ /* parent was found */
14269+ assert("jmacd-8661", other != NULL);
14270+ /* Duplicate the reference into the other flush_scan. */
14271+ coord_dup(&other->parent_coord, &scan->parent_coord);
14272+ copy_lh(&other->parent_lock, &scan->parent_lock);
14273+ copy_load_count(&other->parent_load, &scan->parent_load);
14274+ }
14275+ scan:
14276+ return scan_by_coord(scan);
14277+}
14278+
14279+/* Performs left- or rightward scanning starting from a formatted node. Follow left
14280+ pointers under tree lock as long as:
14281+
14282+ - node->left/right is non-NULL
14283+ - node->left/right is connected, dirty
14284+ - node->left/right belongs to the same atom
14285+ - scan has not reached maximum count
14286+*/
14287+static int scan_formatted(flush_scan * scan)
14288+{
14289+ int ret;
14290+ znode *neighbor = NULL;
14291+
14292+ assert("jmacd-1401", !reiser4_scan_finished(scan));
14293+
14294+ do {
14295+ znode *node = JZNODE(scan->node);
14296+
14297+ /* Node should be connected, but if not stop the scan. */
14298+ if (!znode_is_connected(node)) {
14299+ scan->stop = 1;
14300+ break;
14301+ }
14302+
14303+ /* Lock the tree, check-for and reference the next sibling. */
14304+ read_lock_tree(znode_get_tree(node));
14305+
14306+ /* It may be that a node is inserted or removed between a node and its
14307+ left sibling while the tree lock is released, but the flush-scan count
14308+ does not need to be precise. Thus, we release the tree lock as soon as
14309+ we get the neighboring node. */
14310+ neighbor =
14311+ reiser4_scanning_left(scan) ? node->left : node->right;
14312+ if (neighbor != NULL) {
14313+ zref(neighbor);
14314+ }
14315+
14316+ read_unlock_tree(znode_get_tree(node));
14317+
14318+ /* If neighbor is NULL at the leaf level, need to check for an unformatted
14319+ sibling using the parent--break in any case. */
14320+ if (neighbor == NULL) {
14321+ break;
14322+ }
14323+
14324+ /* Check the condition for going left, break if it is not met. This also
14325+ releases (jputs) the neighbor if false. */
14326+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
14327+ break;
14328+ }
14329+
14330+ /* Advance the flush_scan state to the left, repeat. */
14331+ ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
14332+ if (ret != 0) {
14333+ return ret;
14334+ }
14335+
14336+ } while (!reiser4_scan_finished(scan));
14337+
14338+ /* If neighbor is NULL then we reached the end of a formatted region, or else the
14339+ sibling is out of memory, now check for an extent to the left (as long as
14340+ LEAF_LEVEL). */
14341+ if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
14342+ || reiser4_scan_finished(scan)) {
14343+ scan->stop = 1;
14344+ return 0;
14345+ }
14346+ /* Otherwise, calls scan_by_coord for the right(left)most item of the
14347+ left(right) neighbor on the parent level, then possibly continue. */
14348+
14349+ coord_init_invalid(&scan->parent_coord, NULL);
14350+ return scan_unformatted(scan, NULL);
14351+}
14352+
14353+/* NOTE-EDWARD:
14354+ This scans adjacent items of the same type and calls scan flush plugin for each one.
14355+ Performs left(right)ward scanning starting from a (possibly) unformatted node. If we start
14356+ from unformatted node, then we continue only if the next neighbor is also unformatted.
14357+ When called from scan_formatted, we skip first iteration (to make sure that
14358+ right(left)most item of the left(right) neighbor on the parent level is of the same
14359+ type and set appropriate coord). */
14360+static int scan_by_coord(flush_scan * scan)
14361+{
14362+ int ret = 0;
14363+ int scan_this_coord;
14364+ lock_handle next_lock;
14365+ load_count next_load;
14366+ coord_t next_coord;
14367+ jnode *child;
14368+ item_plugin *iplug;
14369+
14370+ init_lh(&next_lock);
14371+ init_load_count(&next_load);
14372+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
14373+
14374+ /* set initial item id */
14375+ iplug = item_plugin_by_coord(&scan->parent_coord);
14376+
14377+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
14378+ if (scan_this_coord) {
14379+ /* Here we expect that unit is scannable. it would not be so due
14380+ * to race with extent->tail conversion. */
14381+ if (iplug->f.scan == NULL) {
14382+ scan->stop = 1;
14383+ ret = -E_REPEAT;
14384+ /* skip the check at the end. */
14385+ goto race;
14386+ }
14387+
14388+ ret = iplug->f.scan(scan);
14389+ if (ret != 0)
14390+ goto exit;
14391+
14392+ if (reiser4_scan_finished(scan)) {
14393+ checkchild(scan);
14394+ break;
14395+ }
14396+ } else {
14397+ /* the same race against truncate as above is possible
14398+ * here, it seems */
14399+
14400+ /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
14401+ the first coordinate. */
14402+ assert("jmacd-1231",
14403+ item_is_internal(&scan->parent_coord));
14404+ }
14405+
14406+ if (iplug->f.utmost_child == NULL
14407+ || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
14408+ /* stop this coord and continue on parrent level */
14409+ ret =
14410+ scan_set_current(scan,
14411+ ZJNODE(zref
14412+ (scan->parent_coord.node)),
14413+ 1, NULL);
14414+ if (ret != 0)
14415+ goto exit;
14416+ break;
14417+ }
14418+
14419+ /* Either way, the invariant is that scan->parent_coord is set to the
14420+ parent of scan->node. Now get the next unit. */
14421+ coord_dup(&next_coord, &scan->parent_coord);
14422+ coord_sideof_unit(&next_coord, scan->direction);
14423+
14424+ /* If off-the-end of the twig, try the next twig. */
14425+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14426+ /* We take the write lock because we may start flushing from this
14427+ * coordinate. */
14428+ ret =
14429+ neighbor_in_slum(next_coord.node, &next_lock,
14430+ scan->direction, ZNODE_WRITE_LOCK,
14431+ 1 /* check dirty */ );
14432+ if (ret == -E_NO_NEIGHBOR) {
14433+ scan->stop = 1;
14434+ ret = 0;
14435+ break;
14436+ }
14437+
14438+ if (ret != 0) {
14439+ goto exit;
14440+ }
14441+
14442+ ret = incr_load_count_znode(&next_load, next_lock.node);
14443+ if (ret != 0) {
14444+ goto exit;
14445+ }
14446+
14447+ coord_init_sideof_unit(&next_coord, next_lock.node,
14448+ sideof_reverse(scan->direction));
14449+ }
14450+
14451+ iplug = item_plugin_by_coord(&next_coord);
14452+
14453+ /* Get the next child. */
14454+ ret =
14455+ iplug->f.utmost_child(&next_coord,
14456+ sideof_reverse(scan->direction),
14457+ &child);
14458+ if (ret != 0)
14459+ goto exit;
14460+ /* If the next child is not in memory, or, item_utmost_child
14461+ failed (due to race with unlink, most probably), stop
14462+ here. */
14463+ if (child == NULL || IS_ERR(child)) {
14464+ scan->stop = 1;
14465+ checkchild(scan);
14466+ break;
14467+ }
14468+
14469+ assert("nikita-2374", jnode_is_unformatted(child)
14470+ || jnode_is_znode(child));
14471+
14472+ /* See if it is dirty, part of the same atom. */
14473+ if (!reiser4_scan_goto(scan, child)) {
14474+ checkchild(scan);
14475+ break;
14476+ }
14477+
14478+ /* If so, make this child current. */
14479+ ret = scan_set_current(scan, child, 1, &next_coord);
14480+ if (ret != 0)
14481+ goto exit;
14482+
14483+ /* Now continue. If formatted we release the parent lock and return, then
14484+ proceed. */
14485+ if (jnode_is_znode(child))
14486+ break;
14487+
14488+ /* Otherwise, repeat the above loop with next_coord. */
14489+ if (next_load.node != NULL) {
14490+ done_lh(&scan->parent_lock);
14491+ move_lh(&scan->parent_lock, &next_lock);
14492+ move_load_count(&scan->parent_load, &next_load);
14493+ }
14494+ }
14495+
14496+ assert("jmacd-6233",
14497+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
14498+ exit:
14499+ checkchild(scan);
14500+ race: /* skip the above check */
14501+ if (jnode_is_znode(scan->node)) {
14502+ done_lh(&scan->parent_lock);
14503+ done_load_count(&scan->parent_load);
14504+ }
14505+
14506+ done_load_count(&next_load);
14507+ done_lh(&next_lock);
14508+ return ret;
14509+}
14510+
14511+/* FLUSH POS HELPERS */
14512+
14513+/* Initialize the fields of a flush_position. */
14514+static void pos_init(flush_pos_t * pos)
14515+{
14516+ memset(pos, 0, sizeof *pos);
14517+
14518+ pos->state = POS_INVALID;
14519+ coord_init_invalid(&pos->coord, NULL);
14520+ init_lh(&pos->lock);
14521+ init_load_count(&pos->load);
14522+
14523+ reiser4_blocknr_hint_init(&pos->preceder);
14524+}
14525+
14526+/* The flush loop inside squalloc periodically checks pos_valid to
14527+ determine when "enough flushing" has been performed. This will return true until one
14528+ of the following conditions is met:
14529+
14530+ 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14531+ parameter, meaning we have flushed as many blocks as the kernel requested. When
14532+ flushing to commit, this parameter is NULL.
14533+
14534+ 2. pos_stop() is called because squalloc discovers that the "next" node in the
14535+ flush order is either non-existant, not dirty, or not in the same atom.
14536+*/
14537+
14538+static int pos_valid(flush_pos_t * pos)
14539+{
14540+ return pos->state != POS_INVALID;
14541+}
14542+
14543+/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14544+static void pos_done(flush_pos_t * pos)
14545+{
14546+ pos_stop(pos);
14547+ reiser4_blocknr_hint_done(&pos->preceder);
14548+ if (convert_data(pos))
14549+ free_convert_data(pos);
14550+}
14551+
14552+/* Reset the point and parent. Called during flush subroutines to terminate the
14553+ squalloc loop. */
14554+static int pos_stop(flush_pos_t * pos)
14555+{
14556+ pos->state = POS_INVALID;
14557+ done_lh(&pos->lock);
14558+ done_load_count(&pos->load);
14559+ coord_init_invalid(&pos->coord, NULL);
14560+
14561+ if (pos->child) {
14562+ jput(pos->child);
14563+ pos->child = NULL;
14564+ }
14565+
14566+ return 0;
14567+}
14568+
14569+/* Return the flush_position's block allocator hint. */
14570+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14571+{
14572+ return &pos->preceder;
14573+}
14574+
14575+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14576+{
14577+ return pos->fq;
14578+}
14579+
14580+/* Make Linus happy.
14581+ Local variables:
14582+ c-indentation-style: "K&R"
14583+ mode-name: "LC"
14584+ c-basic-offset: 8
14585+ tab-width: 8
14586+ fill-column: 90
14587+ LocalWords: preceder
14588+ End:
14589+*/
14590diff --git a/fs/reiser4/flush.h b/fs/reiser4/flush.h
14591new file mode 100644
14592index 0000000..beab76b
14593--- /dev/null
14594+++ b/fs/reiser4/flush.h
14595@@ -0,0 +1,274 @@
14596+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14597+
14598+/* DECLARATIONS: */
14599+
14600+#if !defined(__REISER4_FLUSH_H__)
14601+#define __REISER4_FLUSH_H__
14602+
14603+#include "plugin/cluster.h"
14604+
14605+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14606+ single level of the tree. A flush-scan is used for counting the number of adjacent
14607+ nodes to flush, which is used to determine whether we should relocate, and it is also
14608+ used to find a starting point for flush. A flush-scan object can scan in both right
14609+ and left directions via the scan_left() and scan_right() interfaces. The
14610+ right- and left-variations are similar but perform different functions. When scanning
14611+ left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14612+ When scanning right we are simply counting the number of adjacent, dirty nodes. */
14613+struct flush_scan {
14614+
14615+ /* The current number of nodes scanned on this level. */
14616+ unsigned count;
14617+
14618+ /* There may be a maximum number of nodes for a scan on any single level. When
14619+ going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14620+ unsigned max_count;
14621+
14622+ /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14623+ sideof direction;
14624+
14625+ /* Initially @stop is set to false then set true once some condition stops the
14626+ search (e.g., we found a clean node before reaching max_count or we found a
14627+ node belonging to another atom). */
14628+ int stop;
14629+
14630+ /* The current scan position. If @node is non-NULL then its reference count has
14631+ been incremented to reflect this reference. */
14632+ jnode *node;
14633+
14634+ /* A handle for zload/zrelse of current scan position node. */
14635+ load_count node_load;
14636+
14637+ /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14638+ node is locked using this lock handle. The endpoint needs to be locked for
14639+ transfer to the flush_position object after scanning finishes. */
14640+ lock_handle node_lock;
14641+
14642+ /* When the position is unformatted, its parent, coordinate, and parent
14643+ zload/zrelse handle. */
14644+ lock_handle parent_lock;
14645+ coord_t parent_coord;
14646+ load_count parent_load;
14647+
14648+ /* The block allocator preceder hint. Sometimes flush_scan determines what the
14649+ preceder is and if so it sets it here, after which it is copied into the
14650+ flush_position. Otherwise, the preceder is computed later. */
14651+ reiser4_block_nr preceder_blk;
14652+};
14653+
14654+typedef struct convert_item_info {
14655+ dc_item_stat d_cur; /* disk cluster state of the current item */
14656+ dc_item_stat d_next; /* disk cluster state of the next slum item */
14657+ struct inode *inode;
14658+ flow_t flow;
14659+} convert_item_info_t;
14660+
14661+typedef struct convert_info {
14662+ int count; /* for squalloc terminating */
14663+ reiser4_cluster_t clust; /* transform cluster */
14664+ item_plugin *iplug; /* current item plugin */
14665+ convert_item_info_t *itm; /* current item info */
14666+} convert_info_t;
14667+
14668+typedef enum flush_position_state {
14669+ POS_INVALID, /* Invalid or stopped pos, do not continue slum
14670+ * processing */
14671+ POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14672+ * leaf level */
14673+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14674+ * to traverse unformatted nodes */
14675+ POS_TO_LEAF, /* pos is being moved to leaf level */
14676+ POS_TO_TWIG, /* pos is being moved to twig level */
14677+ POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14678+ * rightmost unit of the current twig */
14679+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14680+} flushpos_state_t;
14681+
14682+/* An encapsulation of the current flush point and all the parameters that are passed
14683+ through the entire squeeze-and-allocate stage of the flush routine. A single
14684+ flush_position object is constructed after left- and right-scanning finishes. */
14685+struct flush_position {
14686+ flushpos_state_t state;
14687+
14688+ coord_t coord; /* coord to traverse unformatted nodes */
14689+ lock_handle lock; /* current lock we hold */
14690+ load_count load; /* load status for current locked formatted node */
14691+
14692+ jnode *child; /* for passing a reference to unformatted child
14693+ * across pos state changes */
14694+
14695+ reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14696+ int leaf_relocate; /* True if enough leaf-level nodes were
14697+ * found to suggest a relocate policy. */
14698+ int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */
14699+ int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14700+ flush_queue_t *fq;
14701+ long *nr_written; /* number of nodes submitted to disk */
14702+ int flags; /* a copy of jnode_flush flags argument */
14703+
14704+ znode *prev_twig; /* previous parent pointer value, used to catch
14705+ * processing of new twig node */
14706+ convert_info_t *sq; /* convert info */
14707+
14708+ unsigned long pos_in_unit; /* for extents only. Position
14709+ within an extent unit of first
14710+ jnode of slum */
14711+ long nr_to_write; /* number of unformatted nodes to handle on flush */
14712+};
14713+
14714+static inline int item_convert_count(flush_pos_t * pos)
14715+{
14716+ return pos->sq->count;
14717+}
14718+static inline void inc_item_convert_count(flush_pos_t * pos)
14719+{
14720+ pos->sq->count++;
14721+}
14722+static inline void set_item_convert_count(flush_pos_t * pos, int count)
14723+{
14724+ pos->sq->count = count;
14725+}
14726+static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14727+{
14728+ return pos->sq->iplug;
14729+}
14730+
14731+static inline convert_info_t *convert_data(flush_pos_t * pos)
14732+{
14733+ return pos->sq;
14734+}
14735+
14736+static inline convert_item_info_t *item_convert_data(flush_pos_t * pos)
14737+{
14738+ assert("edward-955", convert_data(pos));
14739+ return pos->sq->itm;
14740+}
14741+
14742+static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos)
14743+{
14744+ return &pos->sq->clust.tc;
14745+}
14746+
14747+static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id)
14748+{
14749+ assert("edward-854", pos->sq != NULL);
14750+ return tfm_stream(tfm_cluster_sq(pos), id);
14751+}
14752+
14753+static inline int chaining_data_present(flush_pos_t * pos)
14754+{
14755+ return convert_data(pos) && item_convert_data(pos);
14756+}
14757+
14758+/* Returns true if next node contains next item of the disk cluster
14759+ so item convert data should be moved to the right slum neighbor.
14760+*/
14761+static inline int should_chain_next_node(flush_pos_t * pos)
14762+{
14763+ int result = 0;
14764+
14765+ assert("edward-1007", chaining_data_present(pos));
14766+
14767+ switch (item_convert_data(pos)->d_next) {
14768+ case DC_CHAINED_ITEM:
14769+ result = 1;
14770+ break;
14771+ case DC_AFTER_CLUSTER:
14772+ break;
14773+ default:
14774+ impossible("edward-1009", "bad state of next slum item");
14775+ }
14776+ return result;
14777+}
14778+
14779+/* update item state in a disk cluster to assign conversion mode */
14780+static inline void
14781+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14782+{
14783+
14784+ assert("edward-1010", chaining_data_present(pos));
14785+
14786+ if (this_node == 0) {
14787+ /* next item is on the right neighbor */
14788+ assert("edward-1011",
14789+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14790+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14791+ assert("edward-1012",
14792+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14793+
14794+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14795+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14796+ } else {
14797+ /* next item is on the same node */
14798+ assert("edward-1013",
14799+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14800+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14801+ assert("edward-1227",
14802+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14803+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
14804+
14805+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14806+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
14807+ }
14808+}
14809+
14810+static inline int should_convert_node(flush_pos_t * pos, znode * node)
14811+{
14812+ return znode_convertible(node);
14813+}
14814+
14815+/* true if there is attached convert item info */
14816+static inline int should_convert_next_node(flush_pos_t * pos, znode * node)
14817+{
14818+ return convert_data(pos) && item_convert_data(pos);
14819+}
14820+
14821+#define SQUALLOC_THRESHOLD 256
14822+
14823+static inline int should_terminate_squalloc(flush_pos_t * pos)
14824+{
14825+ return convert_data(pos) &&
14826+ !item_convert_data(pos) &&
14827+ item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14828+}
14829+
14830+void free_convert_data(flush_pos_t * pos);
14831+/* used in extent.c */
14832+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14833+ const coord_t * parent);
14834+int reiser4_scan_finished(flush_scan * scan);
14835+int reiser4_scanning_left(flush_scan * scan);
14836+int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14837+txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14838+int reiser4_alloc_extent(flush_pos_t *flush_pos);
14839+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14840+ reiser4_key *stop_key);
14841+extern int reiser4_init_fqs(void);
14842+extern void reiser4_done_fqs(void);
14843+
14844+#if REISER4_DEBUG
14845+
14846+extern void reiser4_check_fq(const txn_atom *atom);
14847+extern atomic_t flush_cnt;
14848+
14849+#define check_preceder(blk) \
14850+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14851+extern void check_pos(flush_pos_t * pos);
14852+#else
14853+#define check_preceder(b) noop
14854+#define check_pos(pos) noop
14855+#endif
14856+
14857+/* __REISER4_FLUSH_H__ */
14858+#endif
14859+
14860+/* Make Linus happy.
14861+ Local variables:
14862+ c-indentation-style: "K&R"
14863+ mode-name: "LC"
14864+ c-basic-offset: 8
14865+ tab-width: 8
14866+ fill-column: 90
14867+ LocalWords: preceder
14868+ End:
14869+*/
14870diff --git a/fs/reiser4/flush_queue.c b/fs/reiser4/flush_queue.c
14871new file mode 100644
14872index 0000000..f6c5d9a
14873--- /dev/null
14874+++ b/fs/reiser4/flush_queue.c
14875@@ -0,0 +1,680 @@
14876+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14877+
14878+#include "debug.h"
14879+#include "super.h"
14880+#include "txnmgr.h"
14881+#include "jnode.h"
14882+#include "znode.h"
14883+#include "page_cache.h"
14884+#include "wander.h"
14885+#include "vfs_ops.h"
14886+#include "writeout.h"
14887+#include "flush.h"
14888+
14889+#include <linux/bio.h>
14890+#include <linux/mm.h>
14891+#include <linux/pagemap.h>
14892+#include <linux/blkdev.h>
14893+#include <linux/writeback.h>
14894+
14895+/* A flush queue object is an accumulator for keeping jnodes prepared
14896+ by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14897+ kept on the flush queue until memory pressure or atom commit asks
14898+ flush queues to write some or all from their jnodes. */
14899+
14900+/*
14901+ LOCKING:
14902+
14903+ fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped
14904+ list protected by atom spin lock. fq->prepped list uses the following
14905+ locking:
14906+
14907+ two ways to protect fq->prepped list for read-only list traversal:
14908+
14909+ 1. atom spin-lock atom.
14910+ 2. fq is IN_USE, atom->nr_running_queues increased.
14911+
14912+ and one for list modification:
14913+
14914+ 1. atom is spin-locked and one condition is true: fq is IN_USE or
14915+ atom->nr_running_queues == 0.
14916+
14917+ The deadlock-safe order for flush queues and atoms is: first lock atom, then
14918+ lock flush queue, then lock jnode.
14919+*/
14920+
14921+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14922+#define fq_ready(fq) (!fq_in_use(fq))
14923+
14924+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14925+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14926+
14927+/* get lock on atom from locked flush queue object */
14928+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14929+{
14930+ /* This code is similar to jnode_get_atom(), look at it for the
14931+ * explanation. */
14932+ txn_atom *atom;
14933+
14934+ assert_spin_locked(&(fq->guard));
14935+
14936+ while (1) {
14937+ atom = fq->atom;
14938+ if (atom == NULL)
14939+ break;
14940+
14941+ if (spin_trylock_atom(atom))
14942+ break;
14943+
14944+ atomic_inc(&atom->refcount);
14945+ spin_unlock(&(fq->guard));
14946+ spin_lock_atom(atom);
14947+ spin_lock(&(fq->guard));
14948+
14949+ if (fq->atom == atom) {
14950+ atomic_dec(&atom->refcount);
14951+ break;
14952+ }
14953+
14954+ spin_unlock(&(fq->guard));
14955+ atom_dec_and_unlock(atom);
14956+ spin_lock(&(fq->guard));
14957+ }
14958+
14959+ return atom;
14960+}
14961+
14962+txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14963+{
14964+ txn_atom *atom;
14965+
14966+ spin_lock(&(fq->guard));
14967+ atom = atom_locked_by_fq_nolock(fq);
14968+ spin_unlock(&(fq->guard));
14969+ return atom;
14970+}
14971+
14972+static void init_fq(flush_queue_t * fq)
14973+{
14974+ memset(fq, 0, sizeof *fq);
14975+
14976+ atomic_set(&fq->nr_submitted, 0);
14977+
14978+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14979+
14980+ init_waitqueue_head(&fq->wait);
14981+ spin_lock_init(&fq->guard);
14982+}
14983+
14984+/* slab for flush queues */
14985+static struct kmem_cache *fq_slab;
14986+
14987+/**
14988+ * reiser4_init_fqs - create flush queue cache
14989+ *
14990+ * Initializes slab cache of flush queues. It is part of reiser4 module
14991+ * initialization.
14992+ */
14993+int reiser4_init_fqs(void)
14994+{
14995+ fq_slab = kmem_cache_create("fq",
14996+ sizeof(flush_queue_t),
14997+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14998+ if (fq_slab == NULL)
14999+ return RETERR(-ENOMEM);
15000+ return 0;
15001+}
15002+
15003+/**
15004+ * reiser4_done_fqs - delete flush queue cache
15005+ *
15006+ * This is called on reiser4 module unloading or system shutdown.
15007+ */
15008+void reiser4_done_fqs(void)
15009+{
15010+ destroy_reiser4_cache(&fq_slab);
15011+}
15012+
15013+/* create new flush queue object */
15014+static flush_queue_t *create_fq(gfp_t gfp)
15015+{
15016+ flush_queue_t *fq;
15017+
15018+ fq = kmem_cache_alloc(fq_slab, gfp);
15019+ if (fq)
15020+ init_fq(fq);
15021+
15022+ return fq;
15023+}
15024+
15025+/* adjust atom's and flush queue's counters of queued nodes */
15026+static void count_enqueued_node(flush_queue_t * fq)
15027+{
15028+ ON_DEBUG(fq->atom->num_queued++);
15029+}
15030+
15031+static void count_dequeued_node(flush_queue_t * fq)
15032+{
15033+ assert("zam-993", fq->atom->num_queued > 0);
15034+ ON_DEBUG(fq->atom->num_queued--);
15035+}
15036+
15037+/* attach flush queue object to the atom */
15038+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
15039+{
15040+ assert_spin_locked(&(atom->alock));
15041+ list_add(&fq->alink, &atom->flush_queues);
15042+ fq->atom = atom;
15043+ ON_DEBUG(atom->nr_flush_queues++);
15044+}
15045+
15046+static void detach_fq(flush_queue_t * fq)
15047+{
15048+ assert_spin_locked(&(fq->atom->alock));
15049+
15050+ spin_lock(&(fq->guard));
15051+ list_del_init(&fq->alink);
15052+ assert("vs-1456", fq->atom->nr_flush_queues > 0);
15053+ ON_DEBUG(fq->atom->nr_flush_queues--);
15054+ fq->atom = NULL;
15055+ spin_unlock(&(fq->guard));
15056+}
15057+
15058+/* destroy flush queue object */
15059+static void done_fq(flush_queue_t * fq)
15060+{
15061+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
15062+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
15063+
15064+ kmem_cache_free(fq_slab, fq);
15065+}
15066+
15067+/* mark @node as flush-queued and count it in the queue's atom */
15068+static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
15069+{
15070+ JF_SET(node, JNODE_FLUSH_QUEUED);
15071+ count_enqueued_node(fq);
15072+}
15073+
15074+/* Putting jnode into the flush queue. Both atom and jnode should be
15075+ spin-locked. */
15076+void queue_jnode(flush_queue_t * fq, jnode * node)
15077+{
15078+ assert_spin_locked(&(node->guard));
15079+ assert("zam-713", node->atom != NULL);
15080+ assert_spin_locked(&(node->atom->alock));
15081+ assert("zam-716", fq->atom != NULL);
15082+ assert("zam-717", fq->atom == node->atom);
15083+ assert("zam-907", fq_in_use(fq));
15084+
15085+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
15086+ assert("zam-826", JF_ISSET(node, JNODE_RELOC));
15087+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
15088+ assert("vs-1481", NODE_LIST(node) != FQ_LIST);
15089+
15090+ mark_jnode_queued(fq, node);
15091+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
15092+
15093+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
15094+ FQ_LIST, 1));
15095+}
15096+
15097+/* repeatable process for waiting io completion on a flush queue object */
15098+static int wait_io(flush_queue_t * fq, int *nr_io_errors)
15099+{
15100+ assert("zam-738", fq->atom != NULL);
15101+ assert_spin_locked(&(fq->atom->alock));
15102+ assert("zam-736", fq_in_use(fq));
15103+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
15104+
15105+ if (atomic_read(&fq->nr_submitted) != 0) {
15106+ struct super_block *super;
15107+
15108+ spin_unlock_atom(fq->atom);
15109+
15110+ assert("nikita-3013", reiser4_schedulable());
15111+
15112+ super = reiser4_get_current_sb();
15113+
15114+ /* FIXME: this is instead of blk_run_queues() */
15115+ blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
15116+
15117+ if (!(super->s_flags & MS_RDONLY))
15118+ wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
15119+
15120+ /* Ask the caller to re-acquire the locks and call this
15121+ function again. Note: this technique is commonly used in
15122+ the txnmgr code. */
15123+ return -E_REPEAT;
15124+ }
15125+
15126+ *nr_io_errors += atomic_read(&fq->nr_errors);
15127+ return 0;
15128+}
15129+
15130+/* wait on I/O completion, re-submit dirty nodes to write */
15131+static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
15132+{
15133+ int ret;
15134+ txn_atom *atom = fq->atom;
15135+
15136+ assert("zam-801", atom != NULL);
15137+ assert_spin_locked(&(atom->alock));
15138+ assert("zam-762", fq_in_use(fq));
15139+
15140+ ret = wait_io(fq, nr_io_errors);
15141+ if (ret)
15142+ return ret;
15143+
15144+ detach_fq(fq);
15145+ done_fq(fq);
15146+
15147+ reiser4_atom_send_event(atom);
15148+
15149+ return 0;
15150+}
15151+
15152+/* wait for all i/o for given atom to be completed, actually do one iteration
15153+ on that and return -E_REPEAT if more iterations are needed */
15154+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
15155+{
15156+ flush_queue_t *fq;
15157+
15158+ assert_spin_locked(&(atom->alock));
15159+
15160+ if (list_empty_careful(&atom->flush_queues))
15161+ return 0;
15162+
15163+ list_for_each_entry(fq, &atom->flush_queues, alink) {
15164+ if (fq_ready(fq)) {
15165+ int ret;
15166+
15167+ mark_fq_in_use(fq);
15168+ assert("vs-1247", fq->owner == NULL);
15169+ ON_DEBUG(fq->owner = current);
15170+ ret = finish_fq(fq, nr_io_errors);
15171+
15172+ if (*nr_io_errors)
15173+ reiser4_handle_error();
15174+
15175+ if (ret) {
15176+ reiser4_fq_put(fq);
15177+ return ret;
15178+ }
15179+
15180+ spin_unlock_atom(atom);
15181+
15182+ return -E_REPEAT;
15183+ }
15184+ }
15185+
15186+ /* All flush queues are in use; atom remains locked */
15187+ return -EBUSY;
15188+}
15189+
15190+/* wait all i/o for current atom */
15191+int current_atom_finish_all_fq(void)
15192+{
15193+ txn_atom *atom;
15194+ int nr_io_errors = 0;
15195+ int ret = 0;
15196+
15197+ do {
15198+ while (1) {
15199+ atom = get_current_atom_locked();
15200+ ret = finish_all_fq(atom, &nr_io_errors);
15201+ if (ret != -EBUSY)
15202+ break;
15203+ reiser4_atom_wait_event(atom);
15204+ }
15205+ } while (ret == -E_REPEAT);
15206+
15207+ /* we do not need locked atom after this function finishes, SUCCESS or
15208+ -EBUSY are two return codes when atom remains locked after
15209+ finish_all_fq */
15210+ if (!ret)
15211+ spin_unlock_atom(atom);
15212+
15213+ assert_spin_not_locked(&(atom->alock));
15214+
15215+ if (ret)
15216+ return ret;
15217+
15218+ if (nr_io_errors)
15219+ return RETERR(-EIO);
15220+
15221+ return 0;
15222+}
15223+
15224+/* change node->atom field for all jnode from given list */
15225+static void
15226+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
15227+{
15228+ jnode *cur;
15229+
15230+ list_for_each_entry(cur, list, capture_link) {
15231+ spin_lock_jnode(cur);
15232+ cur->atom = atom;
15233+ spin_unlock_jnode(cur);
15234+ }
15235+}
15236+
15237+/* support for atom fusion operation */
15238+void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
15239+{
15240+ flush_queue_t *fq;
15241+
15242+ assert_spin_locked(&(to->alock));
15243+ assert_spin_locked(&(from->alock));
15244+
15245+ list_for_each_entry(fq, &from->flush_queues, alink) {
15246+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
15247+ spin_lock(&(fq->guard));
15248+ fq->atom = to;
15249+ spin_unlock(&(fq->guard));
15250+ }
15251+
15252+ list_splice_init(&from->flush_queues, to->flush_queues.prev);
15253+
15254+#if REISER4_DEBUG
15255+ to->num_queued += from->num_queued;
15256+ to->nr_flush_queues += from->nr_flush_queues;
15257+ from->nr_flush_queues = 0;
15258+#endif
15259+}
15260+
15261+#if REISER4_DEBUG
15262+int atom_fq_parts_are_clean(txn_atom * atom)
15263+{
15264+ assert("zam-915", atom != NULL);
15265+ return list_empty_careful(&atom->flush_queues);
15266+}
15267+#endif
15268+/* Bio i/o completion routine for reiser4 write operations. */
15269+static int
15270+end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
15271+ int err)
15272+{
15273+ int i;
15274+ int nr_errors = 0;
15275+ flush_queue_t *fq;
15276+
15277+ assert("zam-958", bio->bi_rw & WRITE);
15278+
15279+ /* i/o op. is not fully completed */
15280+ if (bio->bi_size != 0)
15281+ return 1;
15282+
15283+ if (err == -EOPNOTSUPP)
15284+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
15285+
15286+ /* we expect that bio->private is set to NULL or fq object which is used
15287+ * for synchronization and error counting. */
15288+ fq = bio->bi_private;
15289+ /* Check all elements of io_vec for correct write completion. */
15290+ for (i = 0; i < bio->bi_vcnt; i += 1) {
15291+ struct page *pg = bio->bi_io_vec[i].bv_page;
15292+
15293+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
15294+ SetPageError(pg);
15295+ nr_errors++;
15296+ }
15297+
15298+ {
15299+ /* jnode WRITEBACK ("write is in progress bit") is
15300+ * atomically cleared here. */
15301+ jnode *node;
15302+
15303+ assert("zam-736", pg != NULL);
15304+ assert("zam-736", PagePrivate(pg));
15305+ node = jprivate(pg);
15306+
15307+ JF_CLR(node, JNODE_WRITEBACK);
15308+ }
15309+
15310+ end_page_writeback(pg);
15311+ page_cache_release(pg);
15312+ }
15313+
15314+ if (fq) {
15315+ /* count i/o error in fq object */
15316+ atomic_add(nr_errors, &fq->nr_errors);
15317+
15318+ /* If all write requests registered in this "fq" are done we up
15319+ * the waiter. */
15320+ if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
15321+ wake_up(&fq->wait);
15322+ }
15323+
15324+ bio_put(bio);
15325+ return 0;
15326+}
15327+
15328+/* Count I/O requests which will be submitted by @bio in the given flush queue
15329+ @fq */
15330+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
15331+{
15332+ bio->bi_private = fq;
15333+ bio->bi_end_io = end_io_handler;
15334+
15335+ if (fq)
15336+ atomic_add(bio->bi_vcnt, &fq->nr_submitted);
15337+}
15338+
15339+/* Move all queued nodes out from @fq->prepped list. */
15340+static void release_prepped_list(flush_queue_t * fq)
15341+{
15342+ txn_atom *atom;
15343+
15344+ assert("zam-904", fq_in_use(fq));
15345+ atom = atom_locked_by_fq(fq);
15346+
15347+ while (!list_empty(ATOM_FQ_LIST(fq))) {
15348+ jnode *cur;
15349+
15350+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
15351+ list_del_init(&cur->capture_link);
15352+
15353+ count_dequeued_node(fq);
15354+ spin_lock_jnode(cur);
15355+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
15356+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
15357+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
15358+ JF_CLR(cur, JNODE_FLUSH_QUEUED);
15359+
15360+ if (JF_ISSET(cur, JNODE_DIRTY)) {
15361+ list_add_tail(&cur->capture_link,
15362+ ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
15363+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15364+ DIRTY_LIST, 1));
15365+ } else {
15366+ list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
15367+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15368+ CLEAN_LIST, 1));
15369+ }
15370+
15371+ spin_unlock_jnode(cur);
15372+ }
15373+
15374+ if (--atom->nr_running_queues == 0)
15375+ reiser4_atom_send_event(atom);
15376+
15377+ spin_unlock_atom(atom);
15378+}
15379+
15380+/* Submit write requests for nodes on the already filled flush queue @fq.
15381+
15382+ @fq: flush queue object which contains jnodes we can (and will) write.
15383+ @return: number of submitted blocks (>=0) if success, otherwise -- an error
15384+ code (<0). */
15385+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
15386+{
15387+ int ret;
15388+ txn_atom *atom;
15389+
15390+ while (1) {
15391+ atom = atom_locked_by_fq(fq);
15392+ assert("zam-924", atom);
15393+ /* do not write fq in parallel. */
15394+ if (atom->nr_running_queues == 0
15395+ || !(flags & WRITEOUT_SINGLE_STREAM))
15396+ break;
15397+ reiser4_atom_wait_event(atom);
15398+ }
15399+
15400+ atom->nr_running_queues++;
15401+ spin_unlock_atom(atom);
15402+
15403+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
15404+ release_prepped_list(fq);
15405+
15406+ return ret;
15407+}
15408+
15409+/* Getting flush queue object for exclusive use by one thread. May require
15410+ several iterations which is indicated by -E_REPEAT return code.
15411+
15412+ This function does not contain code for obtaining an atom lock because an
15413+ atom lock is obtained by different ways in different parts of reiser4,
15414+ usually it is current atom, but we need a possibility for getting fq for the
15415+ atom of given jnode. */
15416+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15417+{
15418+ flush_queue_t *fq;
15419+
15420+ assert_spin_locked(&(atom->alock));
15421+
15422+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15423+ while (&atom->flush_queues != &fq->alink) {
15424+ spin_lock(&(fq->guard));
15425+
15426+ if (fq_ready(fq)) {
15427+ mark_fq_in_use(fq);
15428+ assert("vs-1246", fq->owner == NULL);
15429+ ON_DEBUG(fq->owner = current);
15430+ spin_unlock(&(fq->guard));
15431+
15432+ if (*new_fq)
15433+ done_fq(*new_fq);
15434+
15435+ *new_fq = fq;
15436+
15437+ return 0;
15438+ }
15439+
15440+ spin_unlock(&(fq->guard));
15441+
15442+ fq = list_entry(fq->alink.next, flush_queue_t, alink);
15443+ }
15444+
15445+ /* Use previously allocated fq object */
15446+ if (*new_fq) {
15447+ mark_fq_in_use(*new_fq);
15448+ assert("vs-1248", (*new_fq)->owner == 0);
15449+ ON_DEBUG((*new_fq)->owner = current);
15450+ attach_fq(atom, *new_fq);
15451+
15452+ return 0;
15453+ }
15454+
15455+ spin_unlock_atom(atom);
15456+
15457+ *new_fq = create_fq(gfp);
15458+
15459+ if (*new_fq == NULL)
15460+ return RETERR(-ENOMEM);
15461+
15462+ return RETERR(-E_REPEAT);
15463+}
15464+
15465+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15466+{
15467+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
15468+}
15469+
15470+/* A wrapper around reiser4_fq_by_atom for getting a flush queue
15471+ object for current atom, if success fq->atom remains locked. */
15472+flush_queue_t *get_fq_for_current_atom(void)
15473+{
15474+ flush_queue_t *fq = NULL;
15475+ txn_atom *atom;
15476+ int ret;
15477+
15478+ do {
15479+ atom = get_current_atom_locked();
15480+ ret = reiser4_fq_by_atom(atom, &fq);
15481+ } while (ret == -E_REPEAT);
15482+
15483+ if (ret)
15484+ return ERR_PTR(ret);
15485+ return fq;
15486+}
15487+
15488+/* Releasing flush queue object after exclusive use */
15489+void reiser4_fq_put_nolock(flush_queue_t *fq)
15490+{
15491+ assert("zam-747", fq->atom != NULL);
15492+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15493+ mark_fq_ready(fq);
15494+ assert("vs-1245", fq->owner == current);
15495+ ON_DEBUG(fq->owner = NULL);
15496+}
15497+
15498+void reiser4_fq_put(flush_queue_t * fq)
15499+{
15500+ txn_atom *atom;
15501+
15502+ spin_lock(&(fq->guard));
15503+ atom = atom_locked_by_fq_nolock(fq);
15504+
15505+ assert("zam-746", atom != NULL);
15506+
15507+ reiser4_fq_put_nolock(fq);
15508+ reiser4_atom_send_event(atom);
15509+
15510+ spin_unlock(&(fq->guard));
15511+ spin_unlock_atom(atom);
15512+}
15513+
15514+/* A part of atom object initialization related to the embedded flush queue
15515+ list head */
15516+
15517+void init_atom_fq_parts(txn_atom *atom)
15518+{
15519+ INIT_LIST_HEAD(&atom->flush_queues);
15520+}
15521+
15522+#if REISER4_DEBUG
15523+
15524+void reiser4_check_fq(const txn_atom *atom)
15525+{
15526+ /* check number of nodes on all atom's flush queues */
15527+ flush_queue_t *fq;
15528+ int count;
15529+ struct list_head *pos;
15530+
15531+ count = 0;
15532+ list_for_each_entry(fq, &atom->flush_queues, alink) {
15533+ spin_lock(&(fq->guard));
15534+ /* calculate number of jnodes on fq' list of prepped jnodes */
15535+ list_for_each(pos, ATOM_FQ_LIST(fq))
15536+ count++;
15537+ spin_unlock(&(fq->guard));
15538+ }
15539+ if (count != atom->fq)
15540+ warning("", "fq counter %d, real %d\n", atom->fq, count);
15541+
15542+}
15543+
15544+#endif
15545+
15546+/*
15547+ * Local variables:
15548+ * c-indentation-style: "K&R"
15549+ * mode-name: "LC"
15550+ * c-basic-offset: 8
15551+ * tab-width: 8
15552+ * fill-column: 79
15553+ * scroll-step: 1
15554+ * End:
15555+ */
15556diff --git a/fs/reiser4/forward.h b/fs/reiser4/forward.h
15557new file mode 100644
15558index 0000000..8536833
15559--- /dev/null
15560+++ b/fs/reiser4/forward.h
15561@@ -0,0 +1,256 @@
15562+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15563+
15564+/* Forward declarations. Thank you Kernighan. */
15565+
15566+#if !defined( __REISER4_FORWARD_H__ )
15567+#define __REISER4_FORWARD_H__
15568+
15569+#include <asm/errno.h>
15570+#include <linux/types.h>
15571+
15572+typedef struct zlock zlock;
15573+typedef struct lock_stack lock_stack;
15574+typedef struct lock_handle lock_handle;
15575+typedef struct znode znode;
15576+typedef struct flow flow_t;
15577+typedef struct coord coord_t;
15578+typedef struct tree_access_pointer tap_t;
15579+typedef struct item_coord item_coord;
15580+typedef struct shift_params shift_params;
15581+typedef struct reiser4_object_create_data reiser4_object_create_data;
15582+typedef union reiser4_plugin reiser4_plugin;
15583+typedef __u16 reiser4_plugin_id;
15584+typedef __u64 reiser4_plugin_groups;
15585+typedef struct item_plugin item_plugin;
15586+typedef struct jnode_plugin jnode_plugin;
15587+typedef struct reiser4_item_data reiser4_item_data;
15588+typedef union reiser4_key reiser4_key;
15589+typedef struct reiser4_tree reiser4_tree;
15590+typedef struct carry_cut_data carry_cut_data;
15591+typedef struct carry_kill_data carry_kill_data;
15592+typedef struct carry_tree_op carry_tree_op;
15593+typedef struct carry_tree_node carry_tree_node;
15594+typedef struct carry_plugin_info carry_plugin_info;
15595+typedef struct reiser4_journal reiser4_journal;
15596+typedef struct txn_atom txn_atom;
15597+typedef struct txn_handle txn_handle;
15598+typedef struct txn_mgr txn_mgr;
15599+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15600+typedef struct reiser4_context reiser4_context;
15601+typedef struct carry_level carry_level;
15602+typedef struct blocknr_set_entry blocknr_set_entry;
15603+/* super_block->s_fs_info points to this */
15604+typedef struct reiser4_super_info_data reiser4_super_info_data;
15605+/* next two objects are fields of reiser4_super_info_data */
15606+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15607+typedef struct reiser4_space_allocator reiser4_space_allocator;
15608+
15609+typedef struct flush_scan flush_scan;
15610+typedef struct flush_position flush_pos_t;
15611+
15612+typedef unsigned short pos_in_node_t;
15613+#define MAX_POS_IN_NODE 65535
15614+
15615+typedef struct jnode jnode;
15616+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15617+
15618+typedef struct uf_coord uf_coord_t;
15619+typedef struct hint hint_t;
15620+
15621+typedef struct ktxnmgrd_context ktxnmgrd_context;
15622+
15623+typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
15624+
15625+struct inode;
15626+struct page;
15627+struct file;
15628+struct dentry;
15629+struct super_block;
15630+
15631+/* return values of coord_by_key(). cbk == coord_by_key */
15632+typedef enum {
15633+ CBK_COORD_FOUND = 0,
15634+ CBK_COORD_NOTFOUND = -ENOENT,
15635+} lookup_result;
15636+
15637+/* results of lookup with directory file */
15638+typedef enum {
15639+ FILE_NAME_FOUND = 0,
15640+ FILE_NAME_NOTFOUND = -ENOENT,
15641+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15642+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15643+} file_lookup_result;
15644+
15645+/* behaviors of lookup. If coord we are looking for is actually in a tree,
15646+ both coincide. */
15647+typedef enum {
15648+ /* search exactly for the coord with key given */
15649+ FIND_EXACT,
15650+ /* search for coord with the maximal key not greater than one
15651+ given */
15652+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15653+} lookup_bias;
15654+
15655+typedef enum {
15656+ /* number of leaf level of the tree
15657+ The fake root has (tree_level=0). */
15658+ LEAF_LEVEL = 1,
15659+
15660+ /* number of level one above leaf level of the tree.
15661+
15662+ It is supposed that internal tree used by reiser4 to store file
15663+ system data and meta data will have height 2 initially (when
15664+ created by mkfs).
15665+ */
15666+ TWIG_LEVEL = 2,
15667+} tree_level;
15668+
15669+/* The "real" maximum ztree height is the 0-origin size of any per-level
15670+ array, since the zero'th level is not used. */
15671+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15672+
15673+/* enumeration of possible mutual position of item and coord. This enum is
15674+ return type of ->is_in_item() item plugin method which see. */
15675+typedef enum {
15676+ /* coord is on the left of an item */
15677+ IP_ON_THE_LEFT,
15678+ /* coord is inside item */
15679+ IP_INSIDE,
15680+ /* coord is inside item, but to the right of the rightmost unit of
15681+ this item */
15682+ IP_RIGHT_EDGE,
15683+ /* coord is on the right of an item */
15684+ IP_ON_THE_RIGHT
15685+} interposition;
15686+
15687+/* type of lock to acquire on znode before returning it to caller */
15688+typedef enum {
15689+ ZNODE_NO_LOCK = 0,
15690+ ZNODE_READ_LOCK = 1,
15691+ ZNODE_WRITE_LOCK = 2,
15692+} znode_lock_mode;
15693+
15694+/* type of lock request */
15695+typedef enum {
15696+ ZNODE_LOCK_LOPRI = 0,
15697+ ZNODE_LOCK_HIPRI = (1 << 0),
15698+
15699+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15700+ waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15701+ return the value -E_REPEAT. */
15702+ ZNODE_LOCK_NONBLOCK = (1 << 1),
15703+ /* An option for longterm_lock_znode which prevents atom fusion */
15704+ ZNODE_LOCK_DONT_FUSE = (1 << 2)
15705+} znode_lock_request;
15706+
15707+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15708+
15709+/* used to specify direction of shift. These must be -1 and 1 */
15710+typedef enum {
15711+ SHIFT_LEFT = 1,
15712+ SHIFT_RIGHT = -1
15713+} shift_direction;
15714+
15715+typedef enum {
15716+ LEFT_SIDE,
15717+ RIGHT_SIDE
15718+} sideof;
15719+
15720+#define round_up( value, order ) \
15721+ ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15722+ ~( ( order ) - 1 ) ) )
15723+
15724+/* values returned by squalloc_right_neighbor and its auxiliary functions */
15725+typedef enum {
15726+ /* unit of internal item is moved */
15727+ SUBTREE_MOVED = 0,
15728+ /* nothing else can be squeezed into left neighbor */
15729+ SQUEEZE_TARGET_FULL = 1,
15730+ /* all content of node is squeezed into its left neighbor */
15731+ SQUEEZE_SOURCE_EMPTY = 2,
15732+ /* one more item is copied (this is only returned by
15733+ allocate_and_copy_extent to squalloc_twig)) */
15734+ SQUEEZE_CONTINUE = 3
15735+} squeeze_result;
15736+
15737+/* Do not change items ids. If you do - there will be format change */
15738+typedef enum {
15739+ STATIC_STAT_DATA_ID = 0x0,
15740+ SIMPLE_DIR_ENTRY_ID = 0x1,
15741+ COMPOUND_DIR_ID = 0x2,
15742+ NODE_POINTER_ID = 0x3,
15743+ EXTENT_POINTER_ID = 0x5,
15744+ FORMATTING_ID = 0x6,
15745+ CTAIL_ID = 0x7,
15746+ BLACK_BOX_ID = 0x8,
15747+ LAST_ITEM_ID = 0x9
15748+} item_id;
15749+
15750+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15751+ whether commit() was called or VM memory pressure was applied. */
15752+typedef enum {
15753+ /* submit flush queue to disk at jnode_flush completion */
15754+ JNODE_FLUSH_WRITE_BLOCKS = 1,
15755+
15756+ /* flush is called for commit */
15757+ JNODE_FLUSH_COMMIT = 2,
15758+ /* not implemented */
15759+ JNODE_FLUSH_MEMORY_FORMATTED = 4,
15760+
15761+ /* not implemented */
15762+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15763+} jnode_flush_flags;
15764+
15765+/* Flags to insert/paste carry operations. Currently they only used in
15766+ flushing code, but in future, they can be used to optimize for repetitive
15767+ accesses. */
15768+typedef enum {
15769+ /* carry is not allowed to shift data to the left when trying to find
15770+ free space */
15771+ COPI_DONT_SHIFT_LEFT = (1 << 0),
15772+ /* carry is not allowed to shift data to the right when trying to find
15773+ free space */
15774+ COPI_DONT_SHIFT_RIGHT = (1 << 1),
15775+ /* carry is not allowed to allocate new node(s) when trying to find
15776+ free space */
15777+ COPI_DONT_ALLOCATE = (1 << 2),
15778+ /* try to load left neighbor if its not in a cache */
15779+ COPI_LOAD_LEFT = (1 << 3),
15780+ /* try to load right neighbor if its not in a cache */
15781+ COPI_LOAD_RIGHT = (1 << 4),
15782+ /* shift insertion point to the left neighbor */
15783+ COPI_GO_LEFT = (1 << 5),
15784+ /* shift insertion point to the right neighbor */
15785+ COPI_GO_RIGHT = (1 << 6),
15786+ /* try to step back into original node if insertion into new node
15787+ fails after shifting data there. */
15788+ COPI_STEP_BACK = (1 << 7)
15789+} cop_insert_flag;
15790+
15791+typedef enum {
15792+ SAFE_UNLINK, /* safe-link for unlink */
15793+ SAFE_TRUNCATE /* safe-link for truncate */
15794+} reiser4_safe_link_t;
15795+
15796+/* this is to show on which list of atom jnode is */
15797+typedef enum {
15798+ NOT_CAPTURED,
15799+ DIRTY_LIST,
15800+ CLEAN_LIST,
15801+ FQ_LIST,
15802+ WB_LIST,
15803+ OVRWR_LIST
15804+} atom_list;
15805+
15806+/* __REISER4_FORWARD_H__ */
15807+#endif
15808+
15809+/* Make Linus happy.
15810+ Local variables:
15811+ c-indentation-style: "K&R"
15812+ mode-name: "LC"
15813+ c-basic-offset: 8
15814+ tab-width: 8
15815+ fill-column: 120
15816+ End:
15817+*/
15818diff --git a/fs/reiser4/fsdata.c b/fs/reiser4/fsdata.c
15819new file mode 100644
15820index 0000000..47da01c
15821--- /dev/null
15822+++ b/fs/reiser4/fsdata.c
15823@@ -0,0 +1,804 @@
15824+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15825+ * reiser4/README */
15826+
15827+#include "fsdata.h"
15828+#include "inode.h"
15829+
15830+
15831+/* cache of dir_cursors */
15832+static struct kmem_cache *d_cursor_cache;
15833+static struct shrinker *d_cursor_shrinker;
15834+
15835+/* list of unused cursors */
15836+static LIST_HEAD(cursor_cache);
15837+
15839+/* number of cursors in list of unused cursors */
15839+static unsigned long d_cursor_unused = 0;
15840+
15841+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15842+DEFINE_SPINLOCK(d_lock);
15843+
15844+static reiser4_file_fsdata *create_fsdata(struct file *file);
15845+static int file_is_stateless(struct file *file);
15846+static void free_fsdata(reiser4_file_fsdata *fsdata);
15847+static void kill_cursor(dir_cursor *);
15848+
15849+/**
15850+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15851+ * @nr: number of objects to free
15852+ * @mask: GFP mask
15853+ *
15854+ * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested
15855+ * number. Return number of still freeable cursors.
15856+ */
15857+static int d_cursor_shrink(int nr, gfp_t mask)
15858+{
15859+ if (nr != 0) {
15860+ dir_cursor *scan;
15861+ int killed;
15862+
15863+ killed = 0;
15864+ spin_lock(&d_lock);
15865+ while (!list_empty(&cursor_cache)) {
15866+ scan = list_entry(cursor_cache.next, dir_cursor, alist);
15867+ assert("nikita-3567", scan->ref == 0);
15868+ kill_cursor(scan);
15869+ ++killed;
15870+ --nr;
15871+ if (nr == 0)
15872+ break;
15873+ }
15874+ spin_unlock(&d_lock);
15875+ }
15876+ return d_cursor_unused;
15877+}
15878+
15879+/**
15880+ * reiser4_init_d_cursor - create d_cursor cache
15881+ *
15882+ * Initializes slab cache of d_cursors. It is part of reiser4 module
15883+ * initialization.
15884+ */
15885+int reiser4_init_d_cursor(void)
15886+{
15887+ d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15888+ SLAB_HWCACHE_ALIGN, NULL, NULL);
15889+ if (d_cursor_cache == NULL)
15890+ return RETERR(-ENOMEM);
15891+
15892+ /*
15893+ * actually, d_cursors are "priceless", because there is no way to
15894+ * recover information stored in them. On the other hand, we don't
15895+ * want to consume all kernel memory by them. As a compromise, just
15896+ * assign higher "seeks" value to d_cursor cache, so that it will be
15897+ * shrunk only if system is really tight on memory.
15898+ */
15899+ d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15900+ d_cursor_shrink);
15901+ if (d_cursor_shrinker == NULL) {
15902+ destroy_reiser4_cache(&d_cursor_cache);
15903+ d_cursor_cache = NULL;
15904+ return RETERR(-ENOMEM);
15905+ }
15906+ return 0;
15907+}
15908+
15909+/**
15910+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15911+ *
15912+ * This is called on reiser4 module unloading or system shutdown.
15913+ */
15914+void reiser4_done_d_cursor(void)
15915+{
15916+ BUG_ON(d_cursor_shrinker == NULL);
15917+ remove_shrinker(d_cursor_shrinker);
15918+ d_cursor_shrinker = NULL;
15919+
15920+ destroy_reiser4_cache(&d_cursor_cache);
15921+}
15922+
15923+#define D_CURSOR_TABLE_SIZE (256)
15924+
15925+static inline unsigned long
15926+d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key)
15927+{
15928+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15929+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15930+}
15931+
15932+static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2)
15933+{
15934+ return k1->cid == k2->cid && k1->oid == k2->oid;
15935+}
15936+
15937+/*
15938+ * define functions to manipulate reiser4 super block's hash table of
15939+ * dir_cursors
15940+ */
15941+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15942+#define KFREE(ptr, size) kfree(ptr)
15943+TYPE_SAFE_HASH_DEFINE(d_cursor,
15944+ dir_cursor,
15945+ d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq);
15946+#undef KFREE
15947+#undef KMALLOC
15948+
15949+/**
15950+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15951+ * @super: super block to initialize
15952+ *
15953+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15954+ * of mount.
15955+ */
15956+int reiser4_init_super_d_info(struct super_block *super)
15957+{
15958+ d_cursor_info *p;
15959+
15960+ p = &get_super_private(super)->d_info;
15961+
15962+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15963+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15964+}
15965+
15966+/**
15967+ * reiser4_done_super_d_info - release per-super-block d_cursor resources
15968+ * @super: super block being umounted
15969+ *
15970+ * It is called on umount. Kills all directory cursors attached to super block.
15971+ */
15972+void reiser4_done_super_d_info(struct super_block *super)
15973+{
15974+ d_cursor_info *d_info;
15975+ dir_cursor *cursor, *next;
15976+
15977+ d_info = &get_super_private(super)->d_info;
15978+ for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15979+ kill_cursor(cursor);
15980+
15981+ BUG_ON(d_info->tree.rnode != NULL);
15982+ d_cursor_hash_done(&d_info->table);
15983+}
15984+
15985+/**
15986+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15987+ * @cursor: cursor to free
15988+ *
15989+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15990+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
15991+ * indices, hash table, list of unused cursors and frees it.
15992+ */
15993+static void kill_cursor(dir_cursor *cursor)
15994+{
15995+ unsigned long index;
15996+
15997+ assert("nikita-3566", cursor->ref == 0);
15998+ assert("nikita-3572", cursor->fsdata != NULL);
15999+
16000+ index = (unsigned long)cursor->key.oid;
16001+ list_del_init(&cursor->fsdata->dir.linkage);
16002+ free_fsdata(cursor->fsdata);
16003+ cursor->fsdata = NULL;
16004+
16005+ if (list_empty_careful(&cursor->list))
16006+ /* this is last cursor for a file. Kill radix-tree entry */
16007+ radix_tree_delete(&cursor->info->tree, index);
16008+ else {
16009+ void **slot;
16010+
16011+ /*
16012+ * there are other cursors for the same oid.
16013+ */
16014+
16015+ /*
16016+ * if radix tree point to the cursor being removed, re-target
16017+ * radix tree slot to the next cursor in the (non-empty as was
16018+ * checked above) element of the circular list of all cursors
16019+ * for this oid.
16020+ */
16021+ slot = radix_tree_lookup_slot(&cursor->info->tree, index);
16022+ assert("nikita-3571", *slot != NULL);
16023+ if (*slot == cursor)
16024+ *slot = list_entry(cursor->list.next, dir_cursor, list);
16025+ /* remove cursor from circular list */
16026+ list_del_init(&cursor->list);
16027+ }
16028+ /* remove cursor from the list of unused cursors */
16029+ list_del_init(&cursor->alist);
16030+ /* remove cursor from the hash table */
16031+ d_cursor_hash_remove(&cursor->info->table, cursor);
16032+ /* and free it */
16033+ kmem_cache_free(d_cursor_cache, cursor);
16034+ --d_cursor_unused;
16035+}
16036+
16037+/* possible actions that can be performed on all cursors for the given file */
16038+enum cursor_action {
16039+ /*
16040+ * load all detached state: this is called when stat-data is loaded
16041+ * from the disk to recover information about all pending readdirs
16042+ */
16043+ CURSOR_LOAD,
16044+ /*
16045+ * detach all state from inode, leaving it in the cache. This is called
16046+ * when inode is removed from the memory by memory pressure
16047+ */
16048+ CURSOR_DISPOSE,
16049+ /*
16050+ * detach cursors from the inode, and free them. This is called when
16051+ * inode is destroyed
16052+ */
16053+ CURSOR_KILL
16054+};
16055+
16056+/*
16057+ * return d_cursor data for the file system @inode is in.
16058+ */
16059+static inline d_cursor_info *d_info(struct inode *inode)
16060+{
16061+ return &get_super_private(inode->i_sb)->d_info;
16062+}
16063+
16064+/*
16065+ * lookup d_cursor in the per-super-block radix tree.
16066+ */
16067+static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index)
16068+{
16069+ return (dir_cursor *) radix_tree_lookup(&info->tree, index);
16070+}
16071+
16072+/*
16073+ * attach @cursor to the radix tree. There may be multiple cursors for the
16074+ * same oid, they are chained into circular list.
16075+ */
16076+static void bind_cursor(dir_cursor * cursor, unsigned long index)
16077+{
16078+ dir_cursor *head;
16079+
16080+ head = lookup(cursor->info, index);
16081+ if (head == NULL) {
16082+ /* this is the first cursor for this index */
16083+ INIT_LIST_HEAD(&cursor->list);
16084+ radix_tree_insert(&cursor->info->tree, index, cursor);
16085+ } else {
16086+ /* some cursor already exists. Chain ours */
16087+ list_add(&cursor->list, &head->list);
16088+ }
16089+}
16090+
16091+/*
16092+ * detach fsdata (if detachable) from file descriptor, and put cursor on the
16093+ * "unused" list. Called when file descriptor is no longer in active use.
16094+ */
16095+static void clean_fsdata(struct file *file)
16096+{
16097+ dir_cursor *cursor;
16098+ reiser4_file_fsdata *fsdata;
16099+
16100+ assert("nikita-3570", file_is_stateless(file));
16101+
16102+ fsdata = (reiser4_file_fsdata *) file->private_data;
16103+ if (fsdata != NULL) {
16104+ cursor = fsdata->cursor;
16105+ if (cursor != NULL) {
16106+ spin_lock(&d_lock);
16107+ --cursor->ref;
16108+ if (cursor->ref == 0) {
16109+ list_add_tail(&cursor->alist, &cursor_cache);
16110+ ++d_cursor_unused;
16111+ }
16112+ spin_unlock(&d_lock);
16113+ file->private_data = NULL;
16114+ }
16115+ }
16116+}
16117+
16118+/*
16119+ * global counter used to generate "client ids". These ids are encoded into
16120+ * high bits of fpos.
16121+ */
16122+static __u32 cid_counter = 0;
16123+#define CID_SHIFT (20)
16124+#define CID_MASK (0xfffffull)
16125+
16126+static void free_file_fsdata_nolock(struct file *);
16127+
16128+/**
16129+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
16130+ * @cursor:
16131+ * @file:
16132+ * @inode:
16133+ *
16134+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
16135+ * reiser4 super block's hash table and radix tree.
16136+ * Adds detachable readdir
16137+ * state to @file.
16138+ */
16139+static int insert_cursor(dir_cursor *cursor, struct file *file,
16140+ struct inode *inode)
16141+{
16142+ int result;
16143+ reiser4_file_fsdata *fsdata;
16144+
16145+ memset(cursor, 0, sizeof *cursor);
16146+
16147+ /* this is either first call to readdir, or rewind. Anyway, create new
16148+ * cursor. */
16149+ fsdata = create_fsdata(NULL);
16150+ if (fsdata != NULL) {
16151+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
16152+ if (result == 0) {
16153+ d_cursor_info *info;
16154+ oid_t oid;
16155+
16156+ info = d_info(inode);
16157+ oid = get_inode_oid(inode);
16158+ /* cid occupies higher 12 bits of f->f_pos. Don't
16159+ * allow it to become negative: this confuses
16160+ * nfsd_readdir() */
16161+ cursor->key.cid = (++cid_counter) & 0x7ff;
16162+ cursor->key.oid = oid;
16163+ cursor->fsdata = fsdata;
16164+ cursor->info = info;
16165+ cursor->ref = 1;
16166+
16167+ spin_lock_inode(inode);
16168+ /* install cursor as @f's private_data, discarding old
16169+ * one if necessary */
16170+#if REISER4_DEBUG
16171+ if (file->private_data)
16172+ warning("", "file has fsdata already");
16173+#endif
16174+ clean_fsdata(file);
16175+ free_file_fsdata_nolock(file);
16176+ file->private_data = fsdata;
16177+ fsdata->cursor = cursor;
16178+ spin_unlock_inode(inode);
16179+ spin_lock(&d_lock);
16180+ /* insert cursor into hash table */
16181+ d_cursor_hash_insert(&info->table, cursor);
16182+ /* and chain it into radix-tree */
16183+ bind_cursor(cursor, (unsigned long)oid);
16184+ spin_unlock(&d_lock);
16185+ radix_tree_preload_end();
16186+ file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
16187+ }
16188+ } else
16189+ result = RETERR(-ENOMEM);
16190+ return result;
16191+}
16192+
16193+/**
16194+ * process_cursors - do action on each cursor attached to inode
16195+ * @inode:
16196+ * @act: action to do
16197+ *
16198+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
16199+ * and performs action specified by @act on each of cursors.
16200+ */
16201+static void process_cursors(struct inode *inode, enum cursor_action act)
16202+{
16203+ oid_t oid;
16204+ dir_cursor *start;
16205+ struct list_head *head;
16206+ reiser4_context *ctx;
16207+ d_cursor_info *info;
16208+
16209+ /* this can be called by
16210+ *
16211+ * kswapd->...->prune_icache->..reiser4_destroy_inode
16212+ *
16213+ * without reiser4_context
16214+ */
16215+ ctx = reiser4_init_context(inode->i_sb);
16216+ if (IS_ERR(ctx)) {
16217+ warning("vs-23", "failed to init context");
16218+ return;
16219+ }
16220+
16221+ assert("nikita-3558", inode != NULL);
16222+
16223+ info = d_info(inode);
16224+ oid = get_inode_oid(inode);
16225+ spin_lock_inode(inode);
16226+ head = get_readdir_list(inode);
16227+ spin_lock(&d_lock);
16228+ /* find any cursor for this oid: reference to it is hanging off the radix
16229+ * tree */
16230+ start = lookup(info, (unsigned long)oid);
16231+ if (start != NULL) {
16232+ dir_cursor *scan;
16233+ reiser4_file_fsdata *fsdata;
16234+
16235+ /* process circular list of cursors for this oid */
16236+ scan = start;
16237+ do {
16238+ dir_cursor *next;
16239+
16240+ next = list_entry(scan->list.next, dir_cursor, list);
16241+ fsdata = scan->fsdata;
16242+ assert("nikita-3557", fsdata != NULL);
16243+ if (scan->key.oid == oid) {
16244+ switch (act) {
16245+ case CURSOR_DISPOSE:
16246+ list_del_init(&fsdata->dir.linkage);
16247+ break;
16248+ case CURSOR_LOAD:
16249+ list_add(&fsdata->dir.linkage, head);
16250+ break;
16251+ case CURSOR_KILL:
16252+ kill_cursor(scan);
16253+ break;
16254+ }
16255+ }
16256+ if (scan == next)
16257+ /* last cursor was just killed */
16258+ break;
16259+ scan = next;
16260+ } while (scan != start);
16261+ }
16262+ spin_unlock(&d_lock);
16263+ /* check that we killed 'em all */
16264+ assert("nikita-3568",
16265+ ergo(act == CURSOR_KILL,
16266+ list_empty_careful(get_readdir_list(inode))));
16267+ assert("nikita-3569",
16268+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
16269+ spin_unlock_inode(inode);
16270+ reiser4_exit_context(ctx);
16271+}
16272+
16273+/**
16274+ * reiser4_dispose_cursors - removes cursors from inode's list
16275+ * @inode: inode to dispose cursors of
16276+ *
16277+ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
16278+ * attached to cursor from inode's readdir list. This is called when inode is
16279+ * removed from the memory by memory pressure.
16280+ */
16281+void reiser4_dispose_cursors(struct inode *inode)
16282+{
16283+ process_cursors(inode, CURSOR_DISPOSE);
16284+}
16285+
16286+/**
16287+ * reiser4_load_cursors - attach cursors to inode
16288+ * @inode: inode to load cursors to
16289+ *
16290+ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
16291+ * attached to cursor to inode's readdir list. This is done when inode is
16292+ * loaded into memory.
16293+ */
16294+void reiser4_load_cursors(struct inode *inode)
16295+{
16296+ process_cursors(inode, CURSOR_LOAD);
16297+}
16298+
16299+/**
16300+ * reiser4_kill_cursors - kill all inode cursors
16301+ * @inode: inode to kill cursors of
16302+ *
16303+ * Frees all cursors for this inode. This is called when inode is destroyed.
16304+ */
16305+void reiser4_kill_cursors(struct inode *inode)
16306+{
16307+ process_cursors(inode, CURSOR_KILL);
16308+}
16309+
16310+/**
16311+ * file_is_stateless -
16312+ * @file:
16313+ *
16314+ * true, if file descriptor @file is created by NFS server on "demand" to serve
16315+ * one file system operation. This means that there may be "detached state"
16316+ * for underlying inode.
16317+ */
16318+static int file_is_stateless(struct file *file)
16319+{
16320+ return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
16321+}
16322+
16323+/**
16324+ * reiser4_get_dir_fpos -
16325+ * @dir:
16326+ *
16327+ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
16328+ * in the case of stateless directory operation (readdir-over-nfs), client id
16329+ * was encoded in the high bits of cookie and should be masked off.
16330+ */
16331+loff_t reiser4_get_dir_fpos(struct file *dir)
16332+{
16333+ if (file_is_stateless(dir))
16334+ return dir->f_pos & CID_MASK;
16335+ else
16336+ return dir->f_pos;
16337+}
16338+
16339+/**
16340+ * reiser4_attach_fsdata - try to attach fsdata
16341+ * @file:
16342+ * @inode:
16343+ *
16344+ * Finds or creates cursor for readdir-over-nfs.
16345+ */
16346+int reiser4_attach_fsdata(struct file *file, struct inode *inode)
16347+{
16348+ loff_t pos;
16349+ int result;
16350+ dir_cursor *cursor;
16351+
16352+ /*
16353+ * we are serialized by inode->i_mutex
16354+ */
16355+ if (!file_is_stateless(file))
16356+ return 0;
16357+
16358+ pos = file->f_pos;
16359+ result = 0;
16360+ if (pos == 0) {
16361+ /*
16362+ * first call to readdir (or rewind to the beginning of
16363+ * directory)
16364+ */
16365+ cursor = kmem_cache_alloc(d_cursor_cache,
16366+ reiser4_ctx_gfp_mask_get());
16367+ if (cursor != NULL)
16368+ result = insert_cursor(cursor, file, inode);
16369+ else
16370+ result = RETERR(-ENOMEM);
16371+ } else {
16372+ /* try to find existing cursor */
16373+ d_cursor_key key;
16374+
16375+ key.cid = pos >> CID_SHIFT;
16376+ key.oid = get_inode_oid(inode);
16377+ spin_lock(&d_lock);
16378+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
16379+ if (cursor != NULL) {
16380+ /* cursor was found */
16381+ if (cursor->ref == 0) {
16382+ /* move it from unused list */
16383+ list_del_init(&cursor->alist);
16384+ --d_cursor_unused;
16385+ }
16386+ ++cursor->ref;
16387+ }
16388+ spin_unlock(&d_lock);
16389+ if (cursor != NULL) {
16390+ spin_lock_inode(inode);
16391+ assert("nikita-3556", cursor->fsdata->back == NULL);
16392+ clean_fsdata(file);
16393+ free_file_fsdata_nolock(file);
16394+ file->private_data = cursor->fsdata;
16395+ spin_unlock_inode(inode);
16396+ }
16397+ }
16398+ return result;
16399+}
16400+
16401+/**
16402+ * reiser4_detach_fsdata - ???
16403+ * @file:
16404+ *
16405+ * detach fsdata, if necessary
16406+ */
16407+void reiser4_detach_fsdata(struct file *file)
16408+{
16409+ struct inode *inode;
16410+
16411+ if (!file_is_stateless(file))
16412+ return;
16413+
16414+ inode = file->f_dentry->d_inode;
16415+ spin_lock_inode(inode);
16416+ clean_fsdata(file);
16417+ spin_unlock_inode(inode);
16418+}
16419+
16420+/* slab for reiser4_dentry_fsdata */
16421+static struct kmem_cache *dentry_fsdata_cache;
16422+
16423+/**
16424+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
16425+ *
16426+ * Initializes slab cache of structures attached to denty->d_fsdata. It is
16427+ * part of reiser4 module initialization.
16428+ */
16429+int reiser4_init_dentry_fsdata(void)
16430+{
16431+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16432+ sizeof(reiser4_dentry_fsdata),
16433+ 0,
16434+ SLAB_HWCACHE_ALIGN |
16435+ SLAB_RECLAIM_ACCOUNT, NULL,
16436+ NULL);
16437+ if (dentry_fsdata_cache == NULL)
16438+ return RETERR(-ENOMEM);
16439+ return 0;
16440+}
16441+
16442+/**
16443+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
16444+ *
16445+ * This is called on reiser4 module unloading or system shutdown.
16446+ */
16447+void reiser4_done_dentry_fsdata(void)
16448+{
16449+ destroy_reiser4_cache(&dentry_fsdata_cache);
16450+}
16451+
16452+/**
16453+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
16454+ * @dentry: queried dentry
16455+ *
16456+ * Allocates if necessary and returns per-dentry data that we attach to each
16457+ * dentry.
16458+ */
16459+reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16460+{
16461+ assert("nikita-1365", dentry != NULL);
16462+
16463+ if (dentry->d_fsdata == NULL) {
16464+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16465+ reiser4_ctx_gfp_mask_get());
16466+ if (dentry->d_fsdata == NULL)
16467+ return ERR_PTR(RETERR(-ENOMEM));
16468+ memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata));
16469+ }
16470+ return dentry->d_fsdata;
16471+}
16472+
16473+/**
16474+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16475+ * @dentry: dentry to free fsdata of
16476+ *
16477+ * Detaches and frees fs-specific dentry data
16478+ */
16479+void reiser4_free_dentry_fsdata(struct dentry *dentry)
16480+{
16481+ if (dentry->d_fsdata != NULL) {
16482+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16483+ dentry->d_fsdata = NULL;
16484+ }
16485+}
16486+
16487+/* slab for reiser4_file_fsdata */
16488+static struct kmem_cache *file_fsdata_cache;
16489+
16490+/**
16491+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16492+ *
16493+ * Initializes slab cache of structures attached to file->private_data. It is
16494+ * part of reiser4 module initialization.
16495+ */
16496+int reiser4_init_file_fsdata(void)
16497+{
16498+ file_fsdata_cache = kmem_cache_create("file_fsdata",
16499+ sizeof(reiser4_file_fsdata),
16500+ 0,
16501+ SLAB_HWCACHE_ALIGN |
16502+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
16503+ if (file_fsdata_cache == NULL)
16504+ return RETERR(-ENOMEM);
16505+ return 0;
16506+}
16507+
16508+/**
16509+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16510+ *
16511+ * This is called on reiser4 module unloading or system shutdown.
16512+ */
16513+void reiser4_done_file_fsdata(void)
16514+{
16515+ destroy_reiser4_cache(&file_fsdata_cache);
16516+}
16517+
16518+/**
16519+ * create_fsdata - allocate and initialize reiser4_file_fsdata
16520+ * @file: what to create file_fsdata for, may be NULL
16521+ *
16522+ * Allocates and initializes reiser4_file_fsdata structure.
16523+ */
16524+static reiser4_file_fsdata *create_fsdata(struct file *file)
16525+{
16526+ reiser4_file_fsdata *fsdata;
16527+
16528+ fsdata = kmem_cache_alloc(file_fsdata_cache,
16529+ reiser4_ctx_gfp_mask_get());
16530+ if (fsdata != NULL) {
16531+ memset(fsdata, 0, sizeof *fsdata);
16532+ fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16533+ fsdata->back = file;
16534+ INIT_LIST_HEAD(&fsdata->dir.linkage);
16535+ }
16536+ return fsdata;
16537+}
16538+
16539+/**
16540+ * free_fsdata - free reiser4_file_fsdata
16541+ * @fsdata: object to free
16542+ *
16543+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
16544+ */
16545+static void free_fsdata(reiser4_file_fsdata *fsdata)
16546+{
16547+ BUG_ON(fsdata == NULL);
16548+ kmem_cache_free(file_fsdata_cache, fsdata);
16549+}
16550+
16551+/**
16552+ * reiser4_get_file_fsdata - get fs-specific file data
16553+ * @file: queried file
16554+ *
16555+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16556+ * to @file.
16557+ */
16558+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16559+{
16560+ assert("nikita-1603", file != NULL);
16561+
16562+ if (file->private_data == NULL) {
16563+ reiser4_file_fsdata *fsdata;
16564+ struct inode *inode;
16565+
16566+ fsdata = create_fsdata(file);
16567+ if (fsdata == NULL)
16568+ return ERR_PTR(RETERR(-ENOMEM));
16569+
16570+ inode = file->f_dentry->d_inode;
16571+ spin_lock_inode(inode);
16572+ if (file->private_data == NULL) {
16573+ file->private_data = fsdata;
16574+ fsdata = NULL;
16575+ }
16576+ spin_unlock_inode(inode);
16577+ if (fsdata != NULL)
16578+ /* other thread initialized ->fsdata */
16579+ kmem_cache_free(file_fsdata_cache, fsdata);
16580+ }
16581+ assert("nikita-2665", file->private_data != NULL);
16582+ return file->private_data;
16583+}
16584+
16585+/**
16586+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16587+ * @file:
16588+ *
16589+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16590+ * readdir list, frees if it is not linked to d_cursor object.
16591+ */
16592+static void free_file_fsdata_nolock(struct file *file)
16593+{
16594+ reiser4_file_fsdata *fsdata;
16595+
16596+ assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16597+ fsdata = file->private_data;
16598+ if (fsdata != NULL) {
16599+ list_del_init(&fsdata->dir.linkage);
16600+ if (fsdata->cursor == NULL)
16601+ free_fsdata(fsdata);
16602+ }
16603+ file->private_data = NULL;
16604+}
16605+
16606+/**
16607+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16608+ * @file:
16609+ *
16610+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16611+ */
16612+void reiser4_free_file_fsdata(struct file *file)
16613+{
16614+ spin_lock_inode(file->f_dentry->d_inode);
16615+ free_file_fsdata_nolock(file);
16616+ spin_unlock_inode(file->f_dentry->d_inode);
16617+}
16618+
16619+/*
16620+ * Local variables:
16621+ * c-indentation-style: "K&R"
16622+ * mode-name: "LC"
16623+ * c-basic-offset: 8
16624+ * tab-width: 8
16625+ * fill-column: 79
16626+ * End:
16627+ */
16628diff --git a/fs/reiser4/fsdata.h b/fs/reiser4/fsdata.h
16629new file mode 100644
16630index 0000000..49e8ebf
16631--- /dev/null
16632+++ b/fs/reiser4/fsdata.h
16633@@ -0,0 +1,207 @@
16634+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16635+ * reiser4/README */
16636+
16637+#if !defined( __REISER4_FSDATA_H__ )
16638+#define __REISER4_FSDATA_H__
16639+
16640+#include "debug.h"
16641+#include "kassign.h"
16642+#include "seal.h"
16643+#include "type_safe_hash.h"
16644+#include "plugin/file/file.h"
16645+#include "readahead.h"
16646+
16647+/*
16648+ * comment about reiser4_dentry_fsdata
16649+ *
16650+ *
16651+ */
16652+
16653+/*
16654+ * locking: fields of per file descriptor readdir_pos and ->f_pos are
16655+ * protected by ->i_mutex on inode. Under this lock following invariant
16656+ * holds:
16657+ *
16658+ * file descriptor is "looking" at the entry_no-th directory entry from
16659+ * the beginning of directory. This entry has key dir_entry_key and is
16660+ * pos-th entry with duplicate-key sequence.
16661+ *
16662+ */
16663+
16664+/* logical position within directory */
16665+typedef struct {
16666+ /* key of directory entry (actually, part of a key sufficient to
16667+ identify directory entry) */
16668+ de_id dir_entry_key;
16669+ /* ordinal number of directory entry among all entries with the same
16670+ key. (Starting from 0.) */
16671+ unsigned pos;
16672+} dir_pos;
16673+
16674+typedef struct {
16675+ /* f_pos corresponding to this readdir position */
16676+ __u64 fpos;
16677+ /* logical position within directory */
16678+ dir_pos position;
16679+ /* logical number of directory entry within
16680+ directory */
16681+ __u64 entry_no;
16682+} readdir_pos;
16683+
16684+/*
16685+ * this is used to speed up lookups for directory entry: on initial call to
16686+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
16687+ * in struct dentry and reused later to avoid tree traversals.
16688+ */
16689+typedef struct de_location {
16690+ /* seal covering directory entry */
16691+ seal_t entry_seal;
16692+ /* coord of directory entry */
16693+ coord_t entry_coord;
16694+ /* ordinal number of directory entry among all entries with the same
16695+ key. (Starting from 0.) */
16696+ int pos;
16697+} de_location;
16698+
16699+/**
16700+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16701+ *
16702+ * This is allocated dynamically and released in d_op->d_release()
16703+ *
16704+ * Currently it only contains cached location (hint) of directory entry, but
16705+ * it is expected that other information will be accumulated here.
16706+ */
16707+typedef struct reiser4_dentry_fsdata {
16708+ /*
16709+ * here will go fields filled by ->lookup() to speedup next
16710+ * create/unlink, like blocknr of znode with stat-data, or key of
16711+ * stat-data.
16712+ */
16713+ de_location dec;
16714+ int stateless; /* created through reiser4_decode_fh, needs special
16715+ * treatment in readdir. */
16716+} reiser4_dentry_fsdata;
16717+
16718+extern int reiser4_init_dentry_fsdata(void);
16719+extern void reiser4_done_dentry_fsdata(void);
16720+extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16721+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16722+
16723+/**
16724+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16725+ *
16726+ * This is allocated dynamically and released in inode->i_fop->release
16727+ */
16728+typedef struct reiser4_file_fsdata {
16729+ /*
16730+ * pointer back to the struct file which this reiser4_file_fsdata is
16731+ * part of
16732+ */
16733+ struct file *back;
16734+ /* detached cursor for stateless readdir. */
16735+ struct dir_cursor *cursor;
16736+ /*
16737+ * We need both directory and regular file parts here, because there
16738+ * are file system objects that are files and directories.
16739+ */
16740+ struct {
16741+ /*
16742+ * position in directory. It is updated each time directory is
16743+ * modified
16744+ */
16745+ readdir_pos readdir;
16746+ /* head of this list is reiser4_inode->lists.readdir_list */
16747+ struct list_head linkage;
16748+ } dir;
16749+ /* hints to speed up operations with regular files: read and write. */
16750+ struct {
16751+ hint_t hint;
16752+ } reg;
16753+ struct reiser4_file_ra_state ra1;
16754+
16755+} reiser4_file_fsdata;
16756+
16757+extern int reiser4_init_file_fsdata(void);
16758+extern void reiser4_done_file_fsdata(void);
16759+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16760+extern void reiser4_free_file_fsdata(struct file *);
16761+
16762+/*
16763+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16764+ * used to address problem reiser4 has with readdir accesses via NFS. See
16765+ * plugin/file_ops_readdir.c for more details.
16766+ */
16767+typedef struct {
16768+ __u16 cid;
16769+ __u64 oid;
16770+} d_cursor_key;
16771+
16772+/*
16773+ * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16774+ * maintain hash table of dir_cursor-s in reiser4's super block
16775+ */
16776+typedef struct dir_cursor dir_cursor;
16777+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16778+
16779+typedef struct d_cursor_info d_cursor_info;
16780+
16781+struct dir_cursor {
16782+ int ref;
16783+ reiser4_file_fsdata *fsdata;
16784+
16785+ /* link to reiser4 super block hash table of cursors */
16786+ d_cursor_hash_link hash;
16787+
16788+ /*
16789+ * this is to link cursors to reiser4 super block's radix tree of
16790+ * cursors if there are more than one cursor of the same objectid
16791+ */
16792+ struct list_head list;
16793+ d_cursor_key key;
16794+ d_cursor_info *info;
16795+ /* list of unused cursors */
16796+ struct list_head alist;
16797+};
16798+
16799+extern int reiser4_init_d_cursor(void);
16800+extern void reiser4_done_d_cursor(void);
16801+
16802+extern int reiser4_init_super_d_info(struct super_block *);
16803+extern void reiser4_done_super_d_info(struct super_block *);
16804+
16805+extern loff_t reiser4_get_dir_fpos(struct file *);
16806+extern int reiser4_attach_fsdata(struct file *, struct inode *);
16807+extern void reiser4_detach_fsdata(struct file *);
16808+
16809+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16810+ more details */
16811+void reiser4_dispose_cursors(struct inode *inode);
16812+void reiser4_load_cursors(struct inode *inode);
16813+void reiser4_kill_cursors(struct inode *inode);
16814+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16815+ int offset, int adj);
16816+
16817+/*
16818+ * this structure is embedded to reise4_super_info_data. It maintains d_cursors
16819+ * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16820+ */
16821+struct d_cursor_info {
16822+ d_cursor_hash_table table;
16823+ struct radix_tree_root tree;
16824+};
16825+
16826+/* spinlock protecting readdir cursors */
16827+extern spinlock_t d_lock;
16828+
16829+/* __REISER4_FSDATA_H__ */
16830+#endif
16831+
16832+/*
16833+ * Local variables:
16834+ * c-indentation-style: "K&R"
16835+ * mode-name: "LC"
16836+ * c-basic-offset: 8
16837+ * tab-width: 8
16838+ * fill-column: 120
16839+ * End:
16840+ */
16841diff --git a/fs/reiser4/init_super.c b/fs/reiser4/init_super.c
16842new file mode 100644
16843index 0000000..3513d5f
16844--- /dev/null
16845+++ b/fs/reiser4/init_super.c
16846@@ -0,0 +1,750 @@
16847+/* Copyright by Hans Reiser, 2003 */
16848+
16849+#include "super.h"
16850+#include "inode.h"
16851+#include "plugin/plugin_set.h"
16852+
16853+#include <linux/swap.h>
16854+
16855+/**
16856+ * init_fs_info - allocate reiser4 specific super block
16857+ * @super: super block of filesystem
16858+ *
16859+ * Allocates and initialize reiser4_super_info_data, attaches it to
16860+ * super->s_fs_info, initializes structures maintaining d_cursor-s.
16861+ */
16862+int reiser4_init_fs_info(struct super_block *super)
16863+{
16864+ reiser4_super_info_data *sbinfo;
16865+
16866+ sbinfo = kmalloc(sizeof(reiser4_super_info_data),
16867+ reiser4_ctx_gfp_mask_get());
16868+ if (!sbinfo)
16869+ return RETERR(-ENOMEM);
16870+
16871+ super->s_fs_info = sbinfo;
16872+ super->s_op = NULL;
16873+ memset(sbinfo, 0, sizeof(*sbinfo));
16874+
16875+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16876+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16877+
16878+ mutex_init(&sbinfo->delete_mutex);
16879+ spin_lock_init(&(sbinfo->guard));
16880+
16881+ /* initialize per-super-block d_cursor resources */
16882+ reiser4_init_super_d_info(super);
16883+
16884+ return 0;
16885+}
16886+
16887+/**
16888+ * reiser4_done_fs_info - free reiser4 specific super block
16889+ * @super: super block of filesystem
16890+ *
16891+ * Performs some sanity checks, releases structures maintaining d_cursor-s,
16892+ * frees reiser4_super_info_data.
16893+ */
16894+void reiser4_done_fs_info(struct super_block *super)
16895+{
16896+ assert("zam-990", super->s_fs_info != NULL);
16897+
16898+ /* release per-super-block d_cursor resources */
16899+ reiser4_done_super_d_info(super);
16900+
16901+ /* make sure that there are not jnodes already */
16902+ assert("", list_empty(&get_super_private(super)->all_jnodes));
16903+ assert("", get_current_context()->trans->atom == NULL);
16904+ reiser4_check_block_counters(super);
16905+ kfree(super->s_fs_info);
16906+ super->s_fs_info = NULL;
16907+}
16908+
16909+/* type of option parseable by parse_option() */
16910+typedef enum {
16911+ /* value of option is arbitrary string */
16912+ OPT_STRING,
16913+
16914+ /*
16915+ * option specifies bit in a bitmask. When option is set - bit in
16916+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16917+ * dont_load_bitmap, atomic_write.
16918+ */
16919+ OPT_BIT,
16920+
16921+ /*
16922+ * value of option should conform to sprintf() format. Examples are
16923+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16924+ */
16925+ OPT_FORMAT,
16926+
16927+ /*
16928+ * option can take one of predefined values. Example is onerror=panic or
16929+ * onerror=remount-ro
16930+ */
16931+ OPT_ONEOF,
16932+} opt_type_t;
16933+
16934+typedef struct opt_bitmask_bit {
16935+ const char *bit_name;
16936+ int bit_nr;
16937+} opt_bitmask_bit;
16938+
16939+/* description of option parseable by parse_option() */
16940+typedef struct opt_desc {
16941+ /* option name.
16942+
16943+ parsed portion of string has a form "name=value".
16944+ */
16945+ const char *name;
16946+ /* type of option */
16947+ opt_type_t type;
16948+ union {
16949+ /* where to store value of string option (type == OPT_STRING) */
16950+ char **string;
16951+ /* description of bits for bit option (type == OPT_BIT) */
16952+ struct {
16953+ int nr;
16954+ void *addr;
16955+ } bit;
16956+ /* description of format and targets for format option (type
16957+ == OPT_FORMAT) */
16958+ struct {
16959+ const char *format;
16960+ int nr_args;
16961+ void *arg1;
16962+ void *arg2;
16963+ void *arg3;
16964+ void *arg4;
16965+ } f;
16966+ struct {
16967+ int *result;
16968+ const char *list[10];
16969+ } oneof;
16970+ struct {
16971+ void *addr;
16972+ int nr_bits;
16973+ opt_bitmask_bit *bits;
16974+ } bitmask;
16975+ } u;
16976+} opt_desc_t;
16977+
16978+/**
16979+ * parse_option - parse one option
16980+ * @opt_strin: starting point of parsing
16981+ * @opt: option description
16982+ *
16983+ * foo=bar,
16984+ * ^ ^ ^
16985+ * | | +-- replaced to '\0'
16986+ * | +-- val_start
16987+ * +-- opt_string
16988+ * Figures out option type and handles option correspondingly.
16989+ */
16990+static int parse_option(char *opt_string, opt_desc_t *opt)
16991+{
16992+ char *val_start;
16993+ int result;
16994+ const char *err_msg;
16995+
16996+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16997+
16998+ val_start = strchr(opt_string, '=');
16999+ if (val_start != NULL) {
17000+ *val_start = '\0';
17001+ ++val_start;
17002+ }
17003+
17004+ err_msg = NULL;
17005+ result = 0;
17006+ switch (opt->type) {
17007+ case OPT_STRING:
17008+ if (val_start == NULL) {
17009+ err_msg = "String arg missing";
17010+ result = RETERR(-EINVAL);
17011+ } else
17012+ *opt->u.string = val_start;
17013+ break;
17014+ case OPT_BIT:
17015+ if (val_start != NULL)
17016+ err_msg = "Value ignored";
17017+ else
17018+ set_bit(opt->u.bit.nr, opt->u.bit.addr);
17019+ break;
17020+ case OPT_FORMAT:
17021+ if (val_start == NULL) {
17022+ err_msg = "Formatted arg missing";
17023+ result = RETERR(-EINVAL);
17024+ break;
17025+ }
17026+ if (sscanf(val_start, opt->u.f.format,
17027+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
17028+ opt->u.f.arg4) != opt->u.f.nr_args) {
17029+ err_msg = "Wrong conversion";
17030+ result = RETERR(-EINVAL);
17031+ }
17032+ break;
17033+ case OPT_ONEOF:
17034+ {
17035+ int i = 0;
17036+
17037+ if (val_start == NULL) {
17038+ err_msg = "Value is missing";
17039+ result = RETERR(-EINVAL);
17040+ break;
17041+ }
17042+ err_msg = "Wrong option value";
17043+ result = RETERR(-EINVAL);
17044+ while (opt->u.oneof.list[i]) {
17045+ if (!strcmp(opt->u.oneof.list[i], val_start)) {
17046+ result = 0;
17047+ err_msg = NULL;
17048+ *opt->u.oneof.result = i;
17049+ break;
17050+ }
17051+ i++;
17052+ }
17053+ break;
17054+ }
17055+ default:
17056+ wrong_return_value("nikita-2100", "opt -> type");
17057+ break;
17058+ }
17059+ if (err_msg != NULL) {
17060+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
17061+ err_msg, opt->name, val_start ? "=" : "",
17062+ val_start ? : "");
17063+ }
17064+ return result;
17065+}
17066+
17067+/**
17068+ * parse_options - parse reiser4 mount options
17069+ * @opt_string: starting point
17070+ * @opts: array of option description
17071+ * @nr_opts: number of elements in @opts
17072+ *
17073+ * Parses comma separated list of reiser4 mount options.
17074+ */
17075+static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts)
17076+{
17077+ int result;
17078+
17079+ result = 0;
17080+ while ((result == 0) && opt_string && *opt_string) {
17081+ int j;
17082+ char *next;
17083+
17084+ next = strchr(opt_string, ',');
17085+ if (next != NULL) {
17086+ *next = '\0';
17087+ ++next;
17088+ }
17089+ for (j = 0; j < nr_opts; ++j) {
17090+ if (!strncmp(opt_string, opts[j].name,
17091+ strlen(opts[j].name))) {
17092+ result = parse_option(opt_string, &opts[j]);
17093+ break;
17094+ }
17095+ }
17096+ if (j == nr_opts) {
17097+ warning("nikita-2307", "Unrecognized option: \"%s\"",
17098+ opt_string);
17099+ /* traditionally, -EINVAL is returned on wrong mount
17100+ option */
17101+ result = RETERR(-EINVAL);
17102+ }
17103+ opt_string = next;
17104+ }
17105+ return result;
17106+}
17107+
17108+#define NUM_OPT( label, fmt, addr ) \
17109+ { \
17110+ .name = ( label ), \
17111+ .type = OPT_FORMAT, \
17112+ .u = { \
17113+ .f = { \
17114+ .format = ( fmt ), \
17115+ .nr_args = 1, \
17116+ .arg1 = ( addr ), \
17117+ .arg2 = NULL, \
17118+ .arg3 = NULL, \
17119+ .arg4 = NULL \
17120+ } \
17121+ } \
17122+ }
17123+
17124+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
17125+
17126+#define BIT_OPT(label, bitnr) \
17127+ { \
17128+ .name = label, \
17129+ .type = OPT_BIT, \
17130+ .u = { \
17131+ .bit = { \
17132+ .nr = bitnr, \
17133+ .addr = &sbinfo->fs_flags \
17134+ } \
17135+ } \
17136+ }
17137+
17138+#define MAX_NR_OPTIONS (30)
17139+
17140+/**
17141+ * reiser4_init_super_data - initialize reiser4 private super block
17142+ * @super: super block to initialize
17143+ * @opt_string: list of reiser4 mount options
17144+ *
17145+ * Sets various reiser4 parameters to default values. Parses mount options and
17146+ * overwrites default settings.
17147+ */
17148+int reiser4_init_super_data(struct super_block *super, char *opt_string)
17149+{
17150+ int result;
17151+ opt_desc_t *opts, *p;
17152+ reiser4_super_info_data *sbinfo = get_super_private(super);
17153+
17154+ /* initialize super, export, dentry operations */
17155+ sbinfo->ops.super = reiser4_super_operations;
17156+ sbinfo->ops.export = reiser4_export_operations;
17157+ sbinfo->ops.dentry = reiser4_dentry_operations;
17158+ super->s_op = &sbinfo->ops.super;
17159+ super->s_export_op = &sbinfo->ops.export;
17160+
17161+ /* initialize transaction manager parameters to default values */
17162+ sbinfo->tmgr.atom_max_size = totalram_pages / 4;
17163+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
17164+ sbinfo->tmgr.atom_min_size = 256;
17165+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
17166+
17167+ /* initialize cbk cache parameter */
17168+ sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
17169+
17170+ /* initialize flush parameters */
17171+ sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
17172+ sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
17173+ sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
17174+ sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
17175+
17176+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
17177+
17178+ /* preliminary tree initializations */
17179+ sbinfo->tree.super = super;
17180+ sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
17181+ sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
17182+ sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
17183+ sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
17184+ rwlock_init(&(sbinfo->tree.tree_lock));
17185+ spin_lock_init(&(sbinfo->tree.epoch_lock));
17186+
17187+ /* initialize default readahead params */
17188+ sbinfo->ra_params.max = num_physpages / 4;
17189+ sbinfo->ra_params.flags = 0;
17190+
17191+ /* allocate memory for structure describing reiser4 mount options */
17192+ opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS,
17193+ reiser4_ctx_gfp_mask_get());
17194+ if (opts == NULL)
17195+ return RETERR(-ENOMEM);
17196+
17197+ /* initialize structure describing reiser4 mount options */
17198+ p = opts;
17199+
17200+#if REISER4_DEBUG
17201+# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \
17202+ warning ("zam-1046", "opt array is overloaded"); break; \
17203+ }
17204+#else
17205+# define OPT_ARRAY_CHECK noop
17206+#endif
17207+
17208+#define PUSH_OPT(...) \
17209+do { \
17210+ opt_desc_t o = __VA_ARGS__; \
17211+ OPT_ARRAY_CHECK; \
17212+ *p ++ = o; \
17213+} while (0)
17214+
17215+#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
17216+#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
17217+
17218+ /*
17219+ * tmgr.atom_max_size=N
17220+ * Atoms containing more than N blocks will be forced to commit. N is
17221+ * decimal.
17222+ */
17223+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
17224+ /*
17225+ * tmgr.atom_max_age=N
17226+ * Atoms older than N seconds will be forced to commit. N is decimal.
17227+ */
17228+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
17229+ /*
17230+ * tmgr.atom_min_size=N
17231+ * In committing an atom to free dirty pages, force the atom less than
17232+ * N in size to fuse with another one.
17233+ */
17234+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
17235+ /*
17236+ * tmgr.atom_max_flushers=N
17237+ * limit of concurrent flushers for one atom. 0 means no limit.
17238+ */
17239+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
17240+ /*
17241+ * tree.cbk_cache_slots=N
17242+ * Number of slots in the cbk cache.
17243+ */
17244+ PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
17245+ /*
17246+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
17247+ * leaf-level blocks it will force them to be relocated.
17248+ */
17249+ PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
17250+ /*
17251+ * If flush finds can find a block allocation closer than at most
17252+ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
17253+ * position.
17254+ */
17255+ PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
17256+ /*
17257+ * If we have written this much or more blocks before encountering busy
17258+ * jnode in flush list - abort flushing hoping that next time we get
17259+ * called this jnode will be clean already, and we will save some
17260+ * seeks.
17261+ */
17262+ PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
17263+ /* The maximum number of nodes to scan left on a level during flush. */
17264+ PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
17265+ /* preferred IO size */
17266+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
17267+ /* carry flags used for insertion of new nodes */
17268+ PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
17269+ /* carry flags used for insertion of new extents */
17270+ PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
17271+ /* carry flags used for paste operations */
17272+ PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
17273+ /* carry flags used for insert operations */
17274+ PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
17275+
17276+#ifdef CONFIG_REISER4_BADBLOCKS
17277+ /*
17278+ * Alternative master superblock location in case if it's original
17279+ * location is not writeable/accessable. This is offset in BYTES.
17280+ */
17281+ PUSH_SB_FIELD_OPT(altsuper, "%lu");
17282+#endif
17283+
17284+ /* turn on BSD-style gid assignment */
17285+ PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
17286+ /* turn on 32 bit times */
17287+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
17288+ /*
17289+ * Don't load all bitmap blocks at mount time, it is useful for
17290+ * machines with tiny RAM and large disks.
17291+ */
17292+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
17293+ /* disable transaction commits during write() */
17294+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
17295+ /* disable use of write barriers in the reiser4 log writer. */
17296+ PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
17297+
17298+ PUSH_OPT(
17299+ {
17300+ /*
17301+ * tree traversal readahead parameters:
17302+ * -o readahead:MAXNUM:FLAGS
17303+ * MAXNUM - max number fo nodes to request readahead for: -1UL
17304+ * will set it to max_sane_readahead()
17305+ * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
17306+ * CONTINUE_ON_PRESENT
17307+ */
17308+ .name = "readahead",
17309+ .type = OPT_FORMAT,
17310+ .u = {
17311+ .f = {
17312+ .format = "%u:%u",
17313+ .nr_args = 2,
17314+ .arg1 = &sbinfo->ra_params.max,
17315+ .arg2 = &sbinfo->ra_params.flags,
17316+ .arg3 = NULL,
17317+ .arg4 = NULL
17318+ }
17319+ }
17320+ }
17321+ );
17322+
17323+ /* What to do in case of fs error */
17324+ PUSH_OPT(
17325+ {
17326+ .name = "onerror",
17327+ .type = OPT_ONEOF,
17328+ .u = {
17329+ .oneof = {
17330+ .result = &sbinfo->onerror,
17331+ .list = {
17332+ "panic", "remount-ro", NULL
17333+ },
17334+ }
17335+ }
17336+ }
17337+ );
17338+
17339+ /* modify default settings to values set by mount options */
17340+ result = parse_options(opt_string, opts, p - opts);
17341+ kfree(opts);
17342+ if (result != 0)
17343+ return result;
17344+
17345+ /* correct settings to sanity values */
17346+ sbinfo->tmgr.atom_max_age *= HZ;
17347+ if (sbinfo->tmgr.atom_max_age <= 0)
17348+ /* overflow */
17349+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
17350+
17351+ /* round optimal io size up to 512 bytes */
17352+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
17353+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
17354+ if (sbinfo->optimal_io_size == 0) {
17355+ warning("nikita-2497", "optimal_io_size is too small");
17356+ return RETERR(-EINVAL);
17357+ }
17358+ return result;
17359+}
17360+
17361+/**
17362+ * reiser4_init_read_super - read reiser4 master super block
17363+ * @super: super block to fill
17364+ * @silent: if 0 - print warnings
17365+ *
17366+ * Reads reiser4 master super block either from predefined location or from
17367+ * location specified by altsuper mount option, initializes disk format plugin.
17368+ */
17369+int reiser4_init_read_super(struct super_block *super, int silent)
17370+{
17371+ struct buffer_head *super_bh;
17372+ struct reiser4_master_sb *master_sb;
17373+ reiser4_super_info_data *sbinfo = get_super_private(super);
17374+ unsigned long blocksize;
17375+
17376+ read_super_block:
17377+#ifdef CONFIG_REISER4_BADBLOCKS
17378+ if (sbinfo->altsuper)
17379+ /*
17380+ * read reiser4 master super block at position specified by
17381+ * mount option
17382+ */
17383+ super_bh = sb_bread(super,
17384+ (sector_t)(sbinfo->altsuper / super->s_blocksize));
17385+ else
17386+#endif
17387+ /* read reiser4 master super block at 16-th 4096 block */
17388+ super_bh = sb_bread(super,
17389+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
17390+ if (!super_bh)
17391+ return RETERR(-EIO);
17392+
17393+ master_sb = (struct reiser4_master_sb *)super_bh->b_data;
17394+ /* check reiser4 magic string */
17395+ if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
17396+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
17397+ /* reiser4 master super block contains filesystem blocksize */
17398+ blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
17399+
17400+ if (blocksize != PAGE_CACHE_SIZE) {
17401+ /*
17402+ * currenly reiser4's blocksize must be equal to
17403+ * pagesize
17404+ */
17405+ if (!silent)
17406+ warning("nikita-2609",
17407+ "%s: wrong block size %ld\n", super->s_id,
17408+ blocksize);
17409+ brelse(super_bh);
17410+ return RETERR(-EINVAL);
17411+ }
17412+ if (blocksize != super->s_blocksize) {
17413+ /*
17414+ * filesystem uses different blocksize. Reread master
17415+ * super block with correct blocksize
17416+ */
17417+ brelse(super_bh);
17418+ if (!sb_set_blocksize(super, (int)blocksize))
17419+ return RETERR(-EINVAL);
17420+ goto read_super_block;
17421+ }
17422+
17423+ sbinfo->df_plug =
17424+ disk_format_plugin_by_id(
17425+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17426+ if (sbinfo->df_plug == NULL) {
17427+ if (!silent)
17428+ warning("nikita-26091",
17429+ "%s: unknown disk format plugin %d\n",
17430+ super->s_id,
17431+ le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17432+ brelse(super_bh);
17433+ return RETERR(-EINVAL);
17434+ }
17435+ sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17436+ brelse(super_bh);
17437+ return 0;
17438+ }
17439+
17440+ /* there is no reiser4 on the device */
17441+ if (!silent)
17442+ warning("nikita-2608",
17443+ "%s: wrong master super block magic", super->s_id);
17444+ brelse(super_bh);
17445+ return RETERR(-EINVAL);
17446+}
17447+
17448+static struct {
17449+ reiser4_plugin_type type;
17450+ reiser4_plugin_id id;
17451+} default_plugins[PSET_LAST] = {
17452+ [PSET_FILE] = {
17453+ .type = REISER4_FILE_PLUGIN_TYPE,
17454+ .id = UNIX_FILE_PLUGIN_ID
17455+ },
17456+ [PSET_DIR] = {
17457+ .type = REISER4_DIR_PLUGIN_TYPE,
17458+ .id = HASHED_DIR_PLUGIN_ID
17459+ },
17460+ [PSET_HASH] = {
17461+ .type = REISER4_HASH_PLUGIN_TYPE,
17462+ .id = R5_HASH_ID
17463+ },
17464+ [PSET_FIBRATION] = {
17465+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
17466+ .id = FIBRATION_DOT_O
17467+ },
17468+ [PSET_PERM] = {
17469+ .type = REISER4_PERM_PLUGIN_TYPE,
17470+ .id = NULL_PERM_ID
17471+ },
17472+ [PSET_FORMATTING] = {
17473+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
17474+ .id = SMALL_FILE_FORMATTING_ID
17475+ },
17476+ [PSET_SD] = {
17477+ .type = REISER4_ITEM_PLUGIN_TYPE,
17478+ .id = STATIC_STAT_DATA_ID
17479+ },
17480+ [PSET_DIR_ITEM] = {
17481+ .type = REISER4_ITEM_PLUGIN_TYPE,
17482+ .id = COMPOUND_DIR_ID
17483+ },
17484+ [PSET_CIPHER] = {
17485+ .type = REISER4_CIPHER_PLUGIN_TYPE,
17486+ .id = NONE_CIPHER_ID
17487+ },
17488+ [PSET_DIGEST] = {
17489+ .type = REISER4_DIGEST_PLUGIN_TYPE,
17490+ .id = SHA256_32_DIGEST_ID
17491+ },
17492+ [PSET_COMPRESSION] = {
17493+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17494+ .id = LZO1_COMPRESSION_ID
17495+ },
17496+ [PSET_COMPRESSION_MODE] = {
17497+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17498+ .id = CONVX_COMPRESSION_MODE_ID
17499+ },
17500+ [PSET_CLUSTER] = {
17501+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
17502+ .id = CLUSTER_64K_ID
17503+ },
17504+ [PSET_CREATE] = {
17505+ .type = REISER4_FILE_PLUGIN_TYPE,
17506+ .id = UNIX_FILE_PLUGIN_ID
17507+ }
17508+};
17509+
17510+/* access to default plugin table */
17511+reiser4_plugin *get_default_plugin(pset_member memb)
17512+{
17513+ return plugin_by_id(default_plugins[memb].type,
17514+ default_plugins[memb].id);
17515+}
17516+
17517+/**
17518+ * reiser4_init_root_inode - obtain inode of root directory
17519+ * @super: super block of filesystem
17520+ *
17521+ * Obtains inode of root directory (reading it from disk), initializes plugin
17522+ * set it was not initialized.
17523+ */
17524+int reiser4_init_root_inode(struct super_block *super)
17525+{
17526+ reiser4_super_info_data *sbinfo = get_super_private(super);
17527+ struct inode *inode;
17528+ int result = 0;
17529+
17530+ inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17531+ if (IS_ERR(inode))
17532+ return RETERR(PTR_ERR(inode));
17533+
17534+ super->s_root = d_alloc_root(inode);
17535+ if (!super->s_root) {
17536+ iput(inode);
17537+ return RETERR(-ENOMEM);
17538+ }
17539+
17540+ super->s_root->d_op = &sbinfo->ops.dentry;
17541+
17542+ if (!is_inode_loaded(inode)) {
17543+ pset_member memb;
17544+ plugin_set *pset;
17545+
17546+ pset = reiser4_inode_data(inode)->pset;
17547+ for (memb = 0; memb < PSET_LAST; ++memb) {
17548+
17549+ if (aset_get(pset, memb) != NULL)
17550+ continue;
17551+
17552+ result = grab_plugin_pset(inode, NULL, memb);
17553+ if (result != 0)
17554+ break;
17555+
17556+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17557+ }
17558+
17559+ if (result == 0) {
17560+ if (REISER4_DEBUG) {
17561+ for (memb = 0; memb < PSET_LAST; ++memb)
17562+ assert("nikita-3500",
17563+ aset_get(pset, memb) != NULL);
17564+ }
17565+ } else
17566+ warning("nikita-3448", "Cannot set plugins of root: %i",
17567+ result);
17568+ reiser4_iget_complete(inode);
17569+
17570+ /* As the default pset kept in the root dir may has been changed
17571+ (length is unknown), call update_sd. */
17572+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17573+ result = reiser4_grab_space(
17574+ inode_file_plugin(inode)->estimate.update(inode),
17575+ BA_CAN_COMMIT);
17576+
17577+ if (result == 0)
17578+ result = reiser4_update_sd(inode);
17579+
17580+ all_grabbed2free();
17581+ }
17582+ }
17583+
17584+ super->s_maxbytes = MAX_LFS_FILESIZE;
17585+ return result;
17586+}
17587+
17588+/*
17589+ * Local variables:
17590+ * c-indentation-style: "K&R"
17591+ * mode-name: "LC"
17592+ * c-basic-offset: 8
17593+ * tab-width: 8
17594+ * fill-column: 79
17595+ * End:
17596+ */
17597diff --git a/fs/reiser4/inode.c b/fs/reiser4/inode.c
17598new file mode 100644
17599index 0000000..2429ac1
17600--- /dev/null
17601+++ b/fs/reiser4/inode.c
17602@@ -0,0 +1,709 @@
17603+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17604+
17605+/* Inode specific operations. */
17606+
17607+#include "forward.h"
17608+#include "debug.h"
17609+#include "key.h"
17610+#include "kassign.h"
17611+#include "coord.h"
17612+#include "seal.h"
17613+#include "dscale.h"
17614+#include "plugin/item/item.h"
17615+#include "plugin/security/perm.h"
17616+#include "plugin/plugin.h"
17617+#include "plugin/object.h"
17618+#include "znode.h"
17619+#include "vfs_ops.h"
17620+#include "inode.h"
17621+#include "super.h"
17622+#include "reiser4.h"
17623+
17624+#include <linux/fs.h> /* for struct super_block, address_space */
17625+
17626+/* return reiser4 internal tree which inode belongs to */
17627+/* Audited by: green(2002.06.17) */
17628+reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17629+{
17630+ assert("nikita-256", inode != NULL);
17631+ assert("nikita-257", inode->i_sb != NULL);
17632+ return reiser4_get_tree(inode->i_sb);
17633+}
17634+
17635+/* return reiser4-specific inode flags */
17636+static inline unsigned long *inode_flags(const struct inode *const inode)
17637+{
17638+ assert("nikita-2842", inode != NULL);
17639+ return &reiser4_inode_data(inode)->flags;
17640+}
17641+
17642+/* set reiser4-specific flag @f in @inode */
17643+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17644+{
17645+ assert("nikita-2248", inode != NULL);
17646+ set_bit((int)f, inode_flags(inode));
17647+}
17648+
17649+/* clear reiser4-specific flag @f in @inode */
17650+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17651+{
17652+ assert("nikita-2250", inode != NULL);
17653+ clear_bit((int)f, inode_flags(inode));
17654+}
17655+
17656+/* true if reiser4-specific flag @f is set in @inode */
17657+int reiser4_inode_get_flag(const struct inode *inode,
17658+ reiser4_file_plugin_flags f)
17659+{
17660+ assert("nikita-2251", inode != NULL);
17661+ return test_bit((int)f, inode_flags(inode));
17662+}
17663+
17664+/* convert oid to inode number */
17665+ino_t oid_to_ino(oid_t oid)
17666+{
17667+ return (ino_t) oid;
17668+}
17669+
17670+/* convert oid to user visible inode number */
17671+ino_t oid_to_uino(oid_t oid)
17672+{
17673+ /* reiser4 object is uniquely identified by oid which is 64 bit
17674+ quantity. Kernel in-memory inode is indexed (in the hash table) by
17675+ 32 bit i_ino field, but this is not a problem, because there is a
17676+ way to further distinguish inodes with identical inode numbers
17677+ (find_actor supplied to iget()).
17678+
17679+ But user space expects unique 32 bit inode number. Obviously this
17680+ is impossible. Work-around is to somehow hash oid into user visible
17681+ inode number.
17682+ */
17683+ oid_t max_ino = (ino_t) ~ 0;
17684+
17685+ if (REISER4_INO_IS_OID || (oid <= max_ino))
17686+ return oid;
17687+ else
17688+ /* this is remotely similar to algorithm used to find next pid
17689+ to use for process: after wrap-around start from some
17690+ offset rather than from 0. Idea is that there are some long
17691+ living objects with which we don't want to collide.
17692+ */
17693+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17694+}
17695+
17696+/* check that "inode" is on reiser4 file-system */
17697+int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17698+{
17699+ return inode != NULL && is_reiser4_super(inode->i_sb);
17700+}
17701+
17702+/* Maximal length of a name that can be stored in directory @inode.
17703+
17704+ This is used in check during file creation and lookup. */
17705+int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17706+{
17707+ assert("nikita-287", is_reiser4_inode(inode));
17708+ assert("nikita-1710", inode_dir_item_plugin(inode));
17709+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17710+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17711+ else
17712+ return 255;
17713+}
17714+
17715+#if REISER4_USE_COLLISION_LIMIT
17716+/* Maximal number of hash collisions for this directory. */
17717+int max_hash_collisions(const struct inode *dir /* inode queried */ )
17718+{
17719+ assert("nikita-1711", dir != NULL);
17720+ return reiser4_inode_data(dir)->plugin.max_collisions;
17721+}
17722+#endif /* REISER4_USE_COLLISION_LIMIT */
17723+
17724+/* Install file, inode, and address_space operation on @inode, depending on
17725+ its mode. */
17726+int setup_inode_ops(struct inode *inode /* inode to intialize */ ,
17727+ reiser4_object_create_data * data /* parameters to create
17728+ * object */ )
17729+{
17730+ reiser4_super_info_data *sinfo;
17731+ file_plugin *fplug;
17732+ dir_plugin *dplug;
17733+
17734+ fplug = inode_file_plugin(inode);
17735+ dplug = inode_dir_plugin(inode);
17736+
17737+ sinfo = get_super_private(inode->i_sb);
17738+
17739+ switch (inode->i_mode & S_IFMT) {
17740+ case S_IFSOCK:
17741+ case S_IFBLK:
17742+ case S_IFCHR:
17743+ case S_IFIFO:
17744+ {
17745+ dev_t rdev; /* to keep gcc happy */
17746+
17747+ assert("vs-46", fplug != NULL);
17748+ /* ugly hack with rdev */
17749+ if (data == NULL) {
17750+ rdev = inode->i_rdev;
17751+ inode->i_rdev = 0;
17752+ } else
17753+ rdev = data->rdev;
17754+ inode->i_blocks = 0;
17755+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17756+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17757+ /* initialize inode->i_fop and inode->i_rdev for block and char
17758+ devices */
17759+ init_special_inode(inode, inode->i_mode, rdev);
17760+ /* all address space operations are null */
17761+ inode->i_mapping->a_ops =
17762+ &file_plugins[fplug->h.id].as_ops;
17763+ break;
17764+ }
17765+ case S_IFLNK:
17766+ assert("vs-46", fplug != NULL);
17767+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17768+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17769+ inode->i_fop = NULL;
17770+ /* all address space operations are null */
17771+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17772+ break;
17773+ case S_IFDIR:
17774+ assert("vs-46", dplug != NULL);
17775+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17776+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17777+ inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17778+ inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17779+ inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17780+ break;
17781+ case S_IFREG:
17782+ assert("vs-46", fplug != NULL);
17783+ assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17784+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17785+ inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17786+ inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17787+ inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17788+ break;
17789+ default:
17790+ warning("nikita-291", "wrong file mode: %o for %llu",
17791+ inode->i_mode,
17792+ (unsigned long long)get_inode_oid(inode));
17793+ reiser4_make_bad_inode(inode);
17794+ return RETERR(-EINVAL);
17795+ }
17796+ return 0;
17797+}
17798+
17799+/* Initialize inode from disk data. Called with inode locked.
17800+ Return inode locked. */
17801+static int init_inode(struct inode *inode /* inode to intialise */ ,
17802+ coord_t * coord /* coord of stat data */ )
17803+{
17804+ int result;
17805+ item_plugin *iplug;
17806+ void *body;
17807+ int length;
17808+ reiser4_inode *state;
17809+
17810+ assert("nikita-292", coord != NULL);
17811+ assert("nikita-293", inode != NULL);
17812+
17813+ coord_clear_iplug(coord);
17814+ result = zload(coord->node);
17815+ if (result)
17816+ return result;
17817+ iplug = item_plugin_by_coord(coord);
17818+ body = item_body_by_coord(coord);
17819+ length = item_length_by_coord(coord);
17820+
17821+ assert("nikita-295", iplug != NULL);
17822+ assert("nikita-296", body != NULL);
17823+ assert("nikita-297", length > 0);
17824+
17825+ /* inode is under I_LOCK now */
17826+
17827+ state = reiser4_inode_data(inode);
17828+ /* call stat-data plugin method to load sd content into inode */
17829+ result = iplug->s.sd.init_inode(inode, body, length);
17830+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17831+ if (result == 0) {
17832+ result = setup_inode_ops(inode, NULL);
17833+ if (result == 0 && inode->i_sb->s_root &&
17834+ inode->i_sb->s_root->d_inode)
17835+ result = finish_pset(inode);
17836+ }
17837+ zrelse(coord->node);
17838+ return result;
17839+}
17840+
17841+/* read `inode' from the disk. This is what was previously in
17842+ reiserfs_read_inode2().
17843+
17844+ Must be called with inode locked. Return inode still locked.
17845+*/
17846+static int read_inode(struct inode *inode /* inode to read from disk */ ,
17847+ const reiser4_key * key /* key of stat data */ ,
17848+ int silent)
17849+{
17850+ int result;
17851+ lock_handle lh;
17852+ reiser4_inode *info;
17853+ coord_t coord;
17854+
17855+ assert("nikita-298", inode != NULL);
17856+ assert("nikita-1945", !is_inode_loaded(inode));
17857+
17858+ info = reiser4_inode_data(inode);
17859+ assert("nikita-300", info->locality_id != 0);
17860+
17861+ coord_init_zero(&coord);
17862+ init_lh(&lh);
17863+ /* locate stat-data in a tree and return znode locked */
17864+ result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17865+ assert("nikita-301", !is_inode_loaded(inode));
17866+ if (result == 0) {
17867+ /* use stat-data plugin to load sd into inode. */
17868+ result = init_inode(inode, &coord);
17869+ if (result == 0) {
17870+ /* initialize stat-data seal */
17871+ spin_lock_inode(inode);
17872+ reiser4_seal_init(&info->sd_seal, &coord, key);
17873+ info->sd_coord = coord;
17874+ spin_unlock_inode(inode);
17875+
17876+ /* call file plugin's method to initialize plugin
17877+ * specific part of inode */
17878+ if (inode_file_plugin(inode)->init_inode_data)
17879+ inode_file_plugin(inode)->init_inode_data(inode,
17880+ NULL,
17881+ 0);
17882+ /* load detached directory cursors for stateless
17883+ * directory readers (NFS). */
17884+ reiser4_load_cursors(inode);
17885+
17886+ /* Check the opened inode for consistency. */
17887+ result =
17888+ get_super_private(inode->i_sb)->df_plug->
17889+ check_open(inode);
17890+ }
17891+ }
17892+ /* lookup_sd() doesn't release coord because we want znode
17893+ stay read-locked while stat-data fields are accessed in
17894+ init_inode() */
17895+ done_lh(&lh);
17896+
17897+ if (result != 0)
17898+ reiser4_make_bad_inode(inode);
17899+ return result;
17900+}
17901+
17902+/* initialise new reiser4 inode being inserted into hash table. */
17903+static int init_locked_inode(struct inode *inode /* new inode */ ,
17904+ void *opaque /* key of stat data passed to the
17905+ * iget5_locked as cookie */ )
17906+{
17907+ reiser4_key *key;
17908+
17909+ assert("nikita-1995", inode != NULL);
17910+ assert("nikita-1996", opaque != NULL);
17911+ key = opaque;
17912+ set_inode_oid(inode, get_key_objectid(key));
17913+ reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17914+ return 0;
17915+}
17916+
17917+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17918+
17919+ This function is called by iget5_locked() to distinguish reiser4 inodes
17920+ having the same inode numbers. Such inodes can only exist due to some error
17921+ condition. One of them should be bad. Inodes with identical inode numbers
17922+ (objectids) are distinguished by their packing locality.
17923+
17924+*/
17925+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17926+ * check */ ,
17927+ void *opaque /* "cookie" passed to
17928+ * iget5_locked(). This is stat data
17929+ * key */ )
17930+{
17931+ reiser4_key *key;
17932+
17933+ key = opaque;
17934+ return
17935+ /* oid is unique, so first term is enough, actually. */
17936+ get_inode_oid(inode) == get_key_objectid(key) &&
17937+ /*
17938+ * also, locality should be checked, but locality is stored in
17939+ * the reiser4-specific part of the inode, and actor can be
17940+ * called against arbitrary inode that happened to be in this
17941+ * hash chain. Hence we first have to check that this is
17942+ * reiser4 inode at least. is_reiser4_inode() is probably too
17943+ * early to call, as inode may have ->i_op not yet
17944+ * initialised.
17945+ */
17946+ is_reiser4_super(inode->i_sb) &&
17947+ /*
17948+ * usually objectid is unique, but pseudo files use counter to
17949+ * generate objectid. All pseudo files are placed into special
17950+ * (otherwise unused) locality.
17951+ */
17952+ reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17953+}
17954+
17955+/* hook for kmem_cache_create */
17956+void loading_init_once(reiser4_inode * info)
17957+{
17958+ mutex_init(&info->loading);
17959+}
17960+
17961+/* for reiser4_alloc_inode */
17962+void loading_alloc(reiser4_inode * info)
17963+{
17964+ assert("vs-1717", !mutex_is_locked(&info->loading));
17965+}
17966+
17967+/* for reiser4_destroy */
17968+void loading_destroy(reiser4_inode * info)
17969+{
17970+ assert("vs-1717a", !mutex_is_locked(&info->loading));
17971+}
17972+
17973+static void loading_begin(reiser4_inode * info)
17974+{
17975+ mutex_lock(&info->loading);
17976+}
17977+
17978+static void loading_end(reiser4_inode * info)
17979+{
17980+ mutex_unlock(&info->loading);
17981+}
17982+
17983+/**
17984+ * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17985+ * @super: super block of filesystem
17986+ * @key: key of inode's stat-data
17987+ * @silent:
17988+ *
17989+ * This is our helper function a la iget(). This is be called by
17990+ * lookup_common() and reiser4_read_super(). Return inode locked or error
17991+ * encountered.
17992+ */
17993+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17994+ int silent)
17995+{
17996+ struct inode *inode;
17997+ int result;
17998+ reiser4_inode *info;
17999+
18000+ assert("nikita-302", super != NULL);
18001+ assert("nikita-303", key != NULL);
18002+
18003+ result = 0;
18004+
18005+ /* call iget(). Our ->read_inode() is dummy, so this will either
18006+ find inode in cache or return uninitialised inode */
18007+ inode = iget5_locked(super,
18008+ (unsigned long)get_key_objectid(key),
18009+ reiser4_inode_find_actor,
18010+ init_locked_inode, (reiser4_key *) key);
18011+ if (inode == NULL)
18012+ return ERR_PTR(RETERR(-ENOMEM));
18013+ if (is_bad_inode(inode)) {
18014+ warning("nikita-304", "Bad inode found");
18015+ reiser4_print_key("key", key);
18016+ iput(inode);
18017+ return ERR_PTR(RETERR(-EIO));
18018+ }
18019+
18020+ info = reiser4_inode_data(inode);
18021+
18022+ /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
18023+ loaded and initialized inode from just allocated inode. If
18024+ REISER4_LOADED bit is not set, reiser4_iget() completes loading under
18025+ info->loading. The place in reiser4 which uses not initialized inode
18026+ is the reiser4 repacker, see repacker-related functions in
18027+ plugin/item/extent.c */
18028+ if (!is_inode_loaded(inode)) {
18029+ loading_begin(info);
18030+ if (!is_inode_loaded(inode)) {
18031+ /* locking: iget5_locked returns locked inode */
18032+ assert("nikita-1941", !is_inode_loaded(inode));
18033+ assert("nikita-1949",
18034+ reiser4_inode_find_actor(inode,
18035+ (reiser4_key *) key));
18036+ /* now, inode has objectid as ->i_ino and locality in
18037+ reiser4-specific part. This is enough for
18038+ read_inode() to read stat data from the disk */
18039+ result = read_inode(inode, key, silent);
18040+ } else
18041+ loading_end(info);
18042+ }
18043+
18044+ if (inode->i_state & I_NEW)
18045+ unlock_new_inode(inode);
18046+
18047+ if (is_bad_inode(inode)) {
18048+ assert("vs-1717", result != 0);
18049+ loading_end(info);
18050+ iput(inode);
18051+ inode = ERR_PTR(result);
18052+ } else if (REISER4_DEBUG) {
18053+ reiser4_key found_key;
18054+
18055+ assert("vs-1717", result == 0);
18056+ build_sd_key(inode, &found_key);
18057+ if (!keyeq(&found_key, key)) {
18058+ warning("nikita-305", "Wrong key in sd");
18059+ reiser4_print_key("sought for", key);
18060+ reiser4_print_key("found", &found_key);
18061+ }
18062+ if (inode->i_nlink == 0) {
18063+ warning("nikita-3559", "Unlinked inode found: %llu\n",
18064+ (unsigned long long)get_inode_oid(inode));
18065+ }
18066+ }
18067+ return inode;
18068+}
18069+
18070+/* reiser4_iget() may return not fully initialized inode, this function should
18071+ * be called after one completes reiser4 inode initializing. */
18072+void reiser4_iget_complete(struct inode *inode)
18073+{
18074+ assert("zam-988", is_reiser4_inode(inode));
18075+
18076+ if (!is_inode_loaded(inode)) {
18077+ reiser4_inode_set_flag(inode, REISER4_LOADED);
18078+ loading_end(reiser4_inode_data(inode));
18079+ }
18080+}
18081+
18082+void reiser4_make_bad_inode(struct inode *inode)
18083+{
18084+ assert("nikita-1934", inode != NULL);
18085+
18086+ /* clear LOADED bit */
18087+ reiser4_inode_clr_flag(inode, REISER4_LOADED);
18088+ make_bad_inode(inode);
18089+ return;
18090+}
18091+
18092+file_plugin *inode_file_plugin(const struct inode * inode)
18093+{
18094+ assert("nikita-1997", inode != NULL);
18095+ return reiser4_inode_data(inode)->pset->file;
18096+}
18097+
18098+dir_plugin *inode_dir_plugin(const struct inode * inode)
18099+{
18100+ assert("nikita-1998", inode != NULL);
18101+ return reiser4_inode_data(inode)->pset->dir;
18102+}
18103+
18104+formatting_plugin *inode_formatting_plugin(const struct inode * inode)
18105+{
18106+ assert("nikita-2000", inode != NULL);
18107+ return reiser4_inode_data(inode)->pset->formatting;
18108+}
18109+
18110+hash_plugin *inode_hash_plugin(const struct inode * inode)
18111+{
18112+ assert("nikita-2001", inode != NULL);
18113+ return reiser4_inode_data(inode)->pset->hash;
18114+}
18115+
18116+fibration_plugin *inode_fibration_plugin(const struct inode * inode)
18117+{
18118+ assert("nikita-2001", inode != NULL);
18119+ return reiser4_inode_data(inode)->pset->fibration;
18120+}
18121+
18122+cipher_plugin *inode_cipher_plugin(const struct inode * inode)
18123+{
18124+ assert("edward-36", inode != NULL);
18125+ return reiser4_inode_data(inode)->pset->cipher;
18126+}
18127+
18128+compression_plugin *inode_compression_plugin(const struct inode * inode)
18129+{
18130+ assert("edward-37", inode != NULL);
18131+ return reiser4_inode_data(inode)->pset->compression;
18132+}
18133+
18134+compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
18135+ inode)
18136+{
18137+ assert("edward-1330", inode != NULL);
18138+ return reiser4_inode_data(inode)->pset->compression_mode;
18139+}
18140+
18141+cluster_plugin *inode_cluster_plugin(const struct inode * inode)
18142+{
18143+ assert("edward-1328", inode != NULL);
18144+ return reiser4_inode_data(inode)->pset->cluster;
18145+}
18146+
18147+file_plugin *inode_create_plugin(const struct inode * inode)
18148+{
18149+ assert("edward-1329", inode != NULL);
18150+ return reiser4_inode_data(inode)->pset->create;
18151+}
18152+
18153+digest_plugin *inode_digest_plugin(const struct inode * inode)
18154+{
18155+ assert("edward-86", inode != NULL);
18156+ return reiser4_inode_data(inode)->pset->digest;
18157+}
18158+
18159+item_plugin *inode_sd_plugin(const struct inode * inode)
18160+{
18161+ assert("vs-534", inode != NULL);
18162+ return reiser4_inode_data(inode)->pset->sd;
18163+}
18164+
18165+item_plugin *inode_dir_item_plugin(const struct inode * inode)
18166+{
18167+ assert("vs-534", inode != NULL);
18168+ return reiser4_inode_data(inode)->pset->dir_item;
18169+}
18170+
18171+file_plugin *child_create_plugin(const struct inode * inode)
18172+{
18173+ assert("edward-1329", inode != NULL);
18174+ return reiser4_inode_data(inode)->hset->create;
18175+}
18176+
18177+void inode_set_extension(struct inode *inode, sd_ext_bits ext)
18178+{
18179+ reiser4_inode *state;
18180+
18181+ assert("nikita-2716", inode != NULL);
18182+ assert("nikita-2717", ext < LAST_SD_EXTENSION);
18183+ assert("nikita-3491", spin_inode_is_locked(inode));
18184+
18185+ state = reiser4_inode_data(inode);
18186+ state->extmask |= 1 << ext;
18187+ /* force re-calculation of stat-data length on next call to
18188+ update_sd(). */
18189+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18190+}
18191+
18192+void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
18193+{
18194+ reiser4_inode *state;
18195+
18196+ assert("vpf-1926", inode != NULL);
18197+ assert("vpf-1927", ext < LAST_SD_EXTENSION);
18198+ assert("vpf-1928", spin_inode_is_locked(inode));
18199+
18200+ state = reiser4_inode_data(inode);
18201+ state->extmask &= ~(1 << ext);
18202+ /* force re-calculation of stat-data length on next call to
18203+ update_sd(). */
18204+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18205+}
18206+
18207+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
18208+{
18209+ assert("edward-1287", inode != NULL);
18210+ if (!dscale_fit(old, new))
18211+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18212+ return;
18213+}
18214+
18215+void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
18216+{
18217+ assert("nikita-2875", inode != NULL);
18218+ spin_lock_inode(inode);
18219+ inode_check_scale_nolock(inode, old, new);
18220+ spin_unlock_inode(inode);
18221+}
18222+
18223+/*
18224+ * initialize ->ordering field of inode. This field defines how file stat-data
18225+ * and body is ordered within a tree with respect to other objects within the
18226+ * same parent directory.
18227+ */
18228+void
18229+init_inode_ordering(struct inode *inode,
18230+ reiser4_object_create_data * crd, int create)
18231+{
18232+ reiser4_key key;
18233+
18234+ if (create) {
18235+ struct inode *parent;
18236+
18237+ parent = crd->parent;
18238+ assert("nikita-3224", inode_dir_plugin(parent) != NULL);
18239+ inode_dir_plugin(parent)->build_entry_key(parent,
18240+ &crd->dentry->d_name,
18241+ &key);
18242+ } else {
18243+ coord_t *coord;
18244+
18245+ coord = &reiser4_inode_data(inode)->sd_coord;
18246+ coord_clear_iplug(coord);
18247+ /* safe to use ->sd_coord, because node is under long term
18248+ * lock */
18249+ WITH_DATA(coord->node, item_key_by_coord(coord, &key));
18250+ }
18251+
18252+ set_inode_ordering(inode, get_key_ordering(&key));
18253+}
18254+
18255+znode *inode_get_vroot(struct inode *inode)
18256+{
18257+ reiser4_block_nr blk;
18258+ znode *result;
18259+
18260+ spin_lock_inode(inode);
18261+ blk = reiser4_inode_data(inode)->vroot;
18262+ spin_unlock_inode(inode);
18263+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
18264+ result = zlook(reiser4_tree_by_inode(inode), &blk);
18265+ else
18266+ result = NULL;
18267+ return result;
18268+}
18269+
18270+void inode_set_vroot(struct inode *inode, znode *vroot)
18271+{
18272+ spin_lock_inode(inode);
18273+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
18274+ spin_unlock_inode(inode);
18275+}
18276+
18277+#if REISER4_DEBUG
18278+
18279+void reiser4_inode_invariant(const struct inode *inode)
18280+{
18281+ assert("nikita-3077", spin_inode_is_locked(inode));
18282+}
18283+
18284+int inode_has_no_jnodes(reiser4_inode * r4_inode)
18285+{
18286+ return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
18287+ r4_inode->nr_jnodes == 0;
18288+}
18289+
18290+#endif
18291+
18292+/* true if directory is empty (only contains dot and dotdot) */
18293+/* FIXME: shouldn't it be dir plugin method? */
18294+int is_dir_empty(const struct inode *dir)
18295+{
18296+ assert("nikita-1976", dir != NULL);
18297+
18298+ /* rely on our method to maintain directory i_size being equal to the
18299+ number of entries. */
18300+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
18301+}
18302+
18303+/* Make Linus happy.
18304+ Local variables:
18305+ c-indentation-style: "K&R"
18306+ mode-name: "LC"
18307+ c-basic-offset: 8
18308+ tab-width: 8
18309+ fill-column: 120
18310+ End:
18311+*/
18312diff --git a/fs/reiser4/inode.h b/fs/reiser4/inode.h
18313new file mode 100644
18314index 0000000..2cc1d82
18315--- /dev/null
18316+++ b/fs/reiser4/inode.h
18317@@ -0,0 +1,438 @@
18318+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
18319+
18320+/* Inode functions. */
18321+
18322+#if !defined( __REISER4_INODE_H__ )
18323+#define __REISER4_INODE_H__
18324+
18325+#include "forward.h"
18326+#include "debug.h"
18327+#include "key.h"
18328+#include "seal.h"
18329+#include "plugin/plugin.h"
18330+#include "plugin/file/cryptcompress.h"
18331+#include "plugin/file/file.h"
18332+#include "plugin/dir/dir.h"
18333+#include "plugin/plugin_set.h"
18334+#include "plugin/security/perm.h"
18335+#include "vfs_ops.h"
18336+#include "jnode.h"
18337+#include "fsdata.h"
18338+
18339+#include <linux/types.h> /* for __u?? , ino_t */
18340+#include <linux/fs.h> /* for struct super_block, struct
18341+ * rw_semaphore, etc */
18342+#include <linux/spinlock.h>
18343+#include <asm/types.h>
18344+
18345+/* reiser4-specific inode flags. They are "transient" and are not
18346+ supposed to be stored on disk. Used to trace "state" of
18347+ inode
18348+*/
18349+typedef enum {
18350+ /* this is light-weight inode, inheriting some state from its
18351+ parent */
18352+ REISER4_LIGHT_WEIGHT = 0,
18353+ /* stat data wasn't yet created */
18354+ REISER4_NO_SD = 1,
18355+ /* internal immutable flag. Currently is only used
18356+ to avoid race condition during file creation.
18357+ See comment in create_object(). */
18358+ REISER4_IMMUTABLE = 2,
18359+ /* inode was read from storage */
18360+ REISER4_LOADED = 3,
18361+ /* this bit is set for symlinks. inode->i_private points to target
18362+ name of symlink. */
18363+ REISER4_GENERIC_PTR_USED = 4,
18364+ /* set if size of stat-data item for this inode is known. If this is
18365+ * set we can avoid recalculating size of stat-data on each update. */
18366+ REISER4_SDLEN_KNOWN = 5,
18367+ /* reiser4_inode->crypt points to the crypto stat */
18368+ REISER4_CRYPTO_STAT_LOADED = 6,
18369+ /* cryptcompress_inode_data points to the secret key */
18370+ REISER4_SECRET_KEY_INSTALLED = 7,
18371+ /* File (possibly) has pages corresponding to the tail items, that
18372+ * were created by ->readpage. It is set by mmap_unix_file() and
18373+ * sendfile_unix_file(). This bit is inspected by write_unix_file and
18374+ * kill-hook of tail items. It is never cleared once set. This bit is
18375+ * modified and inspected under i_mutex. */
18376+ REISER4_HAS_MMAP = 8,
18377+ REISER4_PART_MIXED = 9,
18378+ REISER4_PART_IN_CONV = 10,
18379+ /* This flag indicates that file plugin conversion is in progress */
18380+ REISER4_FILE_CONV_IN_PROGRESS = 11
18381+} reiser4_file_plugin_flags;
18382+
18383+/* state associated with each inode.
18384+ reiser4 inode.
18385+
18386+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
18387+ be of the same size. File-system allocates inodes by itself through
18388+ s_op->allocate_inode() method. So, it is possible to adjust size of inode
18389+ at the time of its creation.
18390+
18391+ Invariants involving parts of this data-type:
18392+
18393+ [inode->eflushed]
18394+
18395+*/
18396+
18397+typedef struct reiser4_inode reiser4_inode;
18398+/* return pointer to reiser4-specific part of inode */
18399+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18400+ /* inode queried */ );
18401+
18402+#if BITS_PER_LONG == 64
18403+
18404+#define REISER4_INO_IS_OID (1)
18405+typedef struct {;
18406+} oid_hi_t;
18407+
18408+/* BITS_PER_LONG == 64 */
18409+#else
18410+
18411+#define REISER4_INO_IS_OID (0)
18412+typedef __u32 oid_hi_t;
18413+
18414+/* BITS_PER_LONG == 64 */
18415+#endif
18416+
18417+struct reiser4_inode {
18418+ /* spin lock protecting fields of this structure. */
18419+ spinlock_t guard;
18420+ /* main plugin set that control the file
18421+ (see comments in plugin/plugin_set.c) */
18422+ plugin_set *pset;
18423+ /* plugin set for inheritance
18424+ (see comments in plugin/plugin_set.c) */
18425+ plugin_set *hset;
18426+ /* high 32 bits of object id */
18427+ oid_hi_t oid_hi;
18428+ /* seal for stat-data */
18429+ seal_t sd_seal;
18430+ /* locality id for this file */
18431+ oid_t locality_id;
18432+#if REISER4_LARGE_KEY
18433+ __u64 ordering;
18434+#endif
18435+ /* coord of stat-data in sealed node */
18436+ coord_t sd_coord;
18437+ /* bit-mask of stat-data extentions used by this file */
18438+ __u64 extmask;
18439+ /* bitmask of non-default plugins for this inode */
18440+ __u16 plugin_mask;
18441+ /* bitmask of set heir plugins for this inode. */
18442+ __u16 heir_mask;
18443+ union {
18444+ struct list_head readdir_list;
18445+ struct list_head not_used;
18446+ } lists;
18447+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18448+ unsigned long flags;
18449+ union {
18450+ /* fields specific to unix_file plugin */
18451+ unix_file_info_t unix_file_info;
18452+ /* fields specific to cryptcompress plugin */
18453+ cryptcompress_info_t cryptcompress_info;
18454+ } file_plugin_data;
18455+
18456+ /* this semaphore is to serialize readers and writers of @pset->file
18457+ * when file plugin conversion is enabled
18458+ */
18459+ struct rw_semaphore conv_sem;
18460+
18461+ /* tree of jnodes. Phantom jnodes (ones not attched to any atom) are
18462+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18463+ struct radix_tree_root jnodes_tree;
18464+#if REISER4_DEBUG
18465+ /* number of unformatted node jnodes of this file in jnode hash table */
18466+ unsigned long nr_jnodes;
18467+#endif
18468+
18469+ /* block number of virtual root for this object. See comment above
18470+ * fs/reiser4/search.c:handle_vroot() */
18471+ reiser4_block_nr vroot;
18472+ struct mutex loading;
18473+};
18474+
18475+void loading_init_once(reiser4_inode *);
18476+void loading_alloc(reiser4_inode *);
18477+void loading_destroy(reiser4_inode *);
18478+
18479+typedef struct reiser4_inode_object {
18480+ /* private part */
18481+ reiser4_inode p;
18482+ /* generic fields not specific to reiser4, but used by VFS */
18483+ struct inode vfs_inode;
18484+} reiser4_inode_object;
18485+
18486+/* return pointer to the reiser4 specific portion of @inode */
18487+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18488+ /* inode queried */ )
18489+{
18490+ assert("nikita-254", inode != NULL);
18491+ return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
18492+}
18493+
18494+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18495+ r4_inode /* inode queried */
18496+ )
18497+{
18498+ return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
18499+}
18500+
18501+/*
18502+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18503+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18504+ * bits.
18505+ *
18506+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18507+ * of inode, otherwise whole oid is stored in i_ino.
18508+ *
18509+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18510+ */
18511+
18512+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18513+
18514+#if REISER4_INO_IS_OID
18515+
18516+static inline oid_t get_inode_oid(const struct inode *inode)
18517+{
18518+ return inode->i_ino;
18519+}
18520+
18521+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18522+{
18523+ inode->i_ino = oid;
18524+}
18525+
18526+/* REISER4_INO_IS_OID */
18527+#else
18528+
18529+static inline oid_t get_inode_oid(const struct inode *inode)
18530+{
18531+ return
18532+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18533+ inode->i_ino;
18534+}
18535+
18536+static inline void set_inode_oid(struct inode *inode, oid_t oid)
18537+{
18538+ assert("nikita-2519", inode != NULL);
18539+ inode->i_ino = (ino_t) (oid);
18540+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18541+ assert("nikita-2521", get_inode_oid(inode) == (oid));
18542+}
18543+
18544+/* REISER4_INO_IS_OID */
18545+#endif
18546+
18547+static inline oid_t get_inode_locality(const struct inode *inode)
18548+{
18549+ return reiser4_inode_data(inode)->locality_id;
18550+}
18551+
18552+#if REISER4_LARGE_KEY
18553+static inline __u64 get_inode_ordering(const struct inode *inode)
18554+{
18555+ return reiser4_inode_data(inode)->ordering;
18556+}
18557+
18558+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18559+{
18560+ reiser4_inode_data(inode)->ordering = ordering;
18561+}
18562+
18563+#else
18564+
18565+#define get_inode_ordering(inode) (0)
18566+#define set_inode_ordering(inode, val) noop
18567+
18568+#endif
18569+
18570+/* return inode in which @uf_info is embedded */
18571+static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *
18572+ uf_info)
18573+{
18574+ return &container_of(uf_info, reiser4_inode_object,
18575+ p.file_plugin_data.unix_file_info)->vfs_inode;
18576+}
18577+
18578+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18579+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18580+
18581+extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18582+
18583+#if REISER4_DEBUG
18584+extern void reiser4_inode_invariant(const struct inode *inode);
18585+extern int inode_has_no_jnodes(reiser4_inode *);
18586+#else
18587+#define reiser4_inode_invariant(inode) noop
18588+#endif
18589+
18590+static inline int spin_inode_is_locked(const struct inode *inode)
18591+{
18592+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18593+ return 1;
18594+}
18595+
18596+/**
18597+ * spin_lock_inode - lock reiser4_inode' embedded spinlock
18598+ * @inode: inode to lock
18599+ *
18600+ * In debug mode it checks that lower priority locks are not held and
18601+ * increments reiser4_context's lock counters on which lock ordering checking
18602+ * is based.
18603+ */
18604+static inline void spin_lock_inode(struct inode *inode)
18605+{
18606+ assert("", LOCK_CNT_NIL(spin_locked));
18607+ /* check lock ordering */
18608+ assert_spin_not_locked(&d_lock);
18609+
18610+ spin_lock(&reiser4_inode_data(inode)->guard);
18611+
18612+ LOCK_CNT_INC(spin_locked_inode);
18613+ LOCK_CNT_INC(spin_locked);
18614+
18615+ reiser4_inode_invariant(inode);
18616+}
18617+
18618+/**
18619+ * spin_unlock_inode - unlock reiser4_inode' embedded spinlock
18620+ * @inode: inode to unlock
18621+ *
18622+ * In debug mode it checks that spinlock is held and decrements
18623+ * reiser4_context's lock counters on which lock ordering checking is based.
18624+ */
18625+static inline void spin_unlock_inode(struct inode *inode)
18626+{
18627+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
18628+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18629+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18630+
18631+ reiser4_inode_invariant(inode);
18632+
18633+ LOCK_CNT_DEC(spin_locked_inode);
18634+ LOCK_CNT_DEC(spin_locked);
18635+
18636+ spin_unlock(&reiser4_inode_data(inode)->guard);
18637+}
18638+
18639+extern znode *inode_get_vroot(struct inode *inode);
18640+extern void inode_set_vroot(struct inode *inode, znode * vroot);
18641+
18642+extern int reiser4_max_filename_len(const struct inode *inode);
18643+extern int max_hash_collisions(const struct inode *dir);
18644+extern void reiser4_unlock_inode(struct inode *inode);
18645+extern int is_reiser4_inode(const struct inode *inode);
18646+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18647+extern struct inode *reiser4_iget(struct super_block *super,
18648+ const reiser4_key * key, int silent);
18649+extern void reiser4_iget_complete(struct inode *inode);
18650+extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18651+extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18652+extern int reiser4_inode_get_flag(const struct inode *inode,
18653+ reiser4_file_plugin_flags f);
18654+
18655+/* has inode been initialized? */
18656+static inline int
18657+is_inode_loaded(const struct inode *inode /* inode queried */ )
18658+{
18659+ assert("nikita-1120", inode != NULL);
18660+ return reiser4_inode_get_flag(inode, REISER4_LOADED);
18661+}
18662+
18663+extern file_plugin *inode_file_plugin(const struct inode *inode);
18664+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18665+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18666+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18667+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18668+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18669+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18670+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18671+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18672+ *inode);
18673+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18674+extern file_plugin *inode_create_plugin(const struct inode *inode);
18675+extern item_plugin *inode_sd_plugin(const struct inode *inode);
18676+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18677+extern file_plugin *child_create_plugin(const struct inode *inode);
18678+
18679+extern void reiser4_make_bad_inode(struct inode *inode);
18680+
18681+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18682+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18683+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18684+extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18685+
18686+/*
18687+ * update field @field in inode @i to contain value @value.
18688+ */
18689+#define INODE_SET_FIELD(i, field, value) \
18690+({ \
18691+ struct inode *__i; \
18692+ typeof(value) __v; \
18693+ \
18694+ __i = (i); \
18695+ __v = (value); \
18696+ inode_check_scale(__i, __i->field, __v); \
18697+ __i->field = __v; \
18698+})
18699+
18700+#define INODE_INC_FIELD(i, field) \
18701+({ \
18702+ struct inode *__i; \
18703+ \
18704+ __i = (i); \
18705+ inode_check_scale(__i, __i->field, __i->field + 1); \
18706+ ++ __i->field; \
18707+})
18708+
18709+#define INODE_DEC_FIELD(i, field) \
18710+({ \
18711+ struct inode *__i; \
18712+ \
18713+ __i = (i); \
18714+ inode_check_scale(__i, __i->field, __i->field - 1); \
18715+ -- __i->field; \
18716+})
18717+
18718+/* See comment before reiser4_readdir_common() for description. */
18719+static inline struct list_head *get_readdir_list(const struct inode *inode)
18720+{
18721+ return &reiser4_inode_data(inode)->lists.readdir_list;
18722+}
18723+
18724+extern void init_inode_ordering(struct inode *inode,
18725+ reiser4_object_create_data * crd, int create);
18726+
18727+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18728+{
18729+ return &reiser4_inode_data(inode)->jnodes_tree;
18730+}
18731+
18732+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18733+ * r4_inode)
18734+{
18735+ return &r4_inode->jnodes_tree;
18736+}
18737+
18738+#if REISER4_DEBUG
18739+extern void print_inode(const char *prefix, const struct inode *i);
18740+#endif
18741+
18742+int is_dir_empty(const struct inode *);
18743+
18744+/* __REISER4_INODE_H__ */
18745+#endif
18746+
18747+/* Make Linus happy.
18748+ Local variables:
18749+ c-indentation-style: "K&R"
18750+ mode-name: "LC"
18751+ c-basic-offset: 8
18752+ tab-width: 8
18753+ fill-column: 120
18754+ End:
18755+*/
18756diff --git a/fs/reiser4/ioctl.h b/fs/reiser4/ioctl.h
18757new file mode 100644
18758index 0000000..4d57737
18759--- /dev/null
18760+++ b/fs/reiser4/ioctl.h
18761@@ -0,0 +1,41 @@
18762+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18763+ * reiser4/README */
18764+
18765+#if !defined( __REISER4_IOCTL_H__ )
18766+#define __REISER4_IOCTL_H__
18767+
18768+#include <linux/fs.h>
18769+
18770+/*
18771+ * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
18772+ * extents and fix in this state. This is used by applications that rely on
18773+ *
18774+ * . files being block aligned, and
18775+ *
18776+ * . files never migrating on disk
18777+ *
18778+ * for example, boot loaders (LILO) need this.
18779+ *
18780+ * This ioctl should be used as
18781+ *
18782+ * result = ioctl(fd, REISER4_IOC_UNPACK);
18783+ *
18784+ * File behind fd descriptor will be converted to the extents (if necessary),
18785+ * and its stat-data will be updated so that it will never be converted back
18786+ * into tails again.
18787+ */
18788+#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18789+
18790+/* __REISER4_IOCTL_H__ */
18791+#endif
18792+
18793+/* Make Linus happy.
18794+ Local variables:
18795+ c-indentation-style: "K&R"
18796+ mode-name: "LC"
18797+ c-basic-offset: 8
18798+ tab-width: 8
18799+ fill-column: 120
18800+ scroll-step: 1
18801+ End:
18802+*/
18803diff --git a/fs/reiser4/jnode.c b/fs/reiser4/jnode.c
18804new file mode 100644
18805index 0000000..1d16d41
18806--- /dev/null
18807+++ b/fs/reiser4/jnode.c
18808@@ -0,0 +1,1925 @@
18809+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18810+ * reiser4/README */
18811+/* Jnode manipulation functions. */
18812+/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18813+
18814+ In particular, jnodes are used to track transactional information
18815+ associated with each block. Each znode contains jnode as ->zjnode field.
18816+
18817+ Jnode stands for either Josh or Journal node.
18818+*/
18819+
18820+/*
18821+ * Taxonomy.
18822+ *
18823+ * Jnode represents block containing data or meta-data. There are jnodes
18824+ * for:
18825+ *
18826+ * unformatted blocks (jnodes proper). There are plans, however to
18827+ * have a handle per extent unit rather than per each unformatted
18828+ * block, because there are so many of them.
18829+ *
18830+ * For bitmaps. Each bitmap is actually represented by two jnodes--one
18831+ * for working and another for "commit" data, together forming bnode.
18832+ *
18833+ * For io-heads. These are used by log writer.
18834+ *
18835+ * For formatted nodes (znode). See comment at the top of znode.c for
18836+ * details specific to the formatted nodes (znodes).
18837+ *
18838+ * Node data.
18839+ *
18840+ * Jnode provides access to the data of node it represents. Data are
18841+ * stored in a page. Page is kept in a page cache. This means, that jnodes
18842+ * are highly interconnected with page cache and VM internals.
18843+ *
18844+ * jnode has a pointer to page (->pg) containing its data. Pointer to data
18845+ * themselves is cached in ->data field to avoid frequent calls to
18846+ * page_address().
18847+ *
18848+ * jnode and page are attached to each other by jnode_attach_page(). This
18849+ * function places pointer to jnode in set_page_private(), sets PG_private
18850+ * flag and increments page counter.
18851+ *
18852+ * Opposite operation is performed by page_clear_jnode().
18853+ *
18854+ * jnode->pg is protected by jnode spin lock, and page->private is
18855+ * protected by page lock. See comment at the top of page_cache.c for
18856+ * more.
18857+ *
18858+ * page can be detached from jnode for two reasons:
18859+ *
18860+ * . jnode is removed from a tree (file is truncated, of formatted
18861+ * node is removed by balancing).
18862+ *
18863+ * . during memory pressure, VM calls ->releasepage() method
18864+ * (reiser4_releasepage()) to evict page from memory.
18865+ *
18866+ * (there, of course, is also umount, but this is special case we are not
18867+ * concerned with here).
18868+ *
18869+ * To protect jnode page from eviction, one calls jload() function that
18870+ * "pins" page in memory (loading it if necessary), increments
18871+ * jnode->d_count, and kmap()s page. Page is unpinned through call to
18872+ * jrelse().
18873+ *
18874+ * Jnode life cycle.
18875+ *
18876+ * jnode is created, placed in hash table, and, optionally, in per-inode
18877+ * radix tree. Page can be attached to jnode, pinned, released, etc.
18878+ *
18879+ * When jnode is captured into atom its reference counter is
18880+ * increased. While being part of an atom, jnode can be "early
18881+ * flushed". This means that as part of flush procedure, jnode is placed
18882+ * into "relocate set", and its page is submitted to the disk. After io
18883+ * completes, page can be detached, then loaded again, re-dirtied, etc.
18884+ *
18885+ * Thread acquired reference to jnode by calling jref() and releases it by
18886+ * jput(). When last reference is removed, jnode is still retained in
18887+ * memory (cached) if it has page attached, _unless_ it is scheduled for
18888+ * destruction (has JNODE_HEARD_BANSHEE bit set).
18889+ *
18890+ * Tree read-write lock was used as "existential" lock for jnodes. That is,
18891+ * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18892+ * that is, tree lock protected unreferenced jnodes stored in the hash
18893+ * table, from recycling.
18894+ *
18895+ * This resulted in high contention on tree lock, because jref()/jput() is
18896+ * frequent operation. To ameliorate this problem, RCU is used: when jput()
18897+ * is just about to release last reference on jnode it sets JNODE_RIP bit
18898+ * on it, and then proceed with jnode destruction (removing jnode from hash
18899+ * table, cbk_cache, detaching page, etc.). All places that change jnode
18900+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18901+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18902+ * jnode_rip_check() function), and pretend that nothing was found in hash
18903+ * table if bit is set.
18904+ *
18905+ * jput defers actual return of jnode into slab cache to some later time
18906+ * (by call_rcu()), this guarantees that other threads can safely continue
18907+ * working with JNODE_RIP-ped jnode.
18908+ *
18909+ */
18910+
18911+#include "reiser4.h"
18912+#include "debug.h"
18913+#include "dformat.h"
18914+#include "jnode.h"
18915+#include "plugin/plugin_header.h"
18916+#include "plugin/plugin.h"
18917+#include "txnmgr.h"
18918+/*#include "jnode.h"*/
18919+#include "znode.h"
18920+#include "tree.h"
18921+#include "tree_walk.h"
18922+#include "super.h"
18923+#include "inode.h"
18924+#include "page_cache.h"
18925+
18926+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18927+#include <linux/types.h>
18928+#include <linux/slab.h>
18929+#include <linux/pagemap.h>
18930+#include <linux/swap.h>
18931+#include <linux/fs.h> /* for struct address_space */
18932+#include <linux/writeback.h> /* for inode_lock */
18933+
18934+static struct kmem_cache *_jnode_slab = NULL;
18935+
18936+static void jnode_set_type(jnode * node, jnode_type type);
18937+static int jdelete(jnode * node);
18938+static int jnode_try_drop(jnode * node);
18939+
18940+#if REISER4_DEBUG
18941+static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18942+#endif
18943+
18944+/* true if valid page is attached to jnode */
18945+static inline int jnode_is_parsed(jnode * node)
18946+{
18947+ return JF_ISSET(node, JNODE_PARSED);
18948+}
18949+
18950+/* hash table support */
18951+
18952+/* compare two jnode keys for equality. Used by hash-table macros */
18953+static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
18954+{
18955+ assert("nikita-2350", k1 != NULL);
18956+ assert("nikita-2351", k2 != NULL);
18957+
18958+ return (k1->index == k2->index && k1->objectid == k2->objectid);
18959+}
18960+
18961+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18962+static inline __u32
18963+jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key)
18964+{
18965+ assert("nikita-2352", key != NULL);
18966+ assert("nikita-3346", IS_POW(table->_buckets));
18967+
18968+ /* yes, this is remarkable simply (where not stupid) hash function. */
18969+ return (key->objectid + key->index) & (table->_buckets - 1);
18970+}
18971+
18972+/* The hash table definition */
18973+#define KMALLOC(size) reiser4_vmalloc(size)
18974+#define KFREE(ptr, size) vfree(ptr)
18975+TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn,
18976+ jnode_key_eq);
18977+#undef KFREE
18978+#undef KMALLOC
18979+
18980+/* call this to initialise jnode hash table */
18981+int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18982+{
18983+ assert("nikita-2359", tree != NULL);
18984+ return j_hash_init(&tree->jhash_table, 16384);
18985+}
18986+
18987+/* call this to destroy jnode hash table. This is called during umount. */
18988+int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18989+{
18990+ j_hash_table *jtable;
18991+ jnode *node;
18992+ jnode *next;
18993+
18994+ assert("nikita-2360", tree != NULL);
18995+
18996+ /*
18997+ * Scan hash table and free all jnodes.
18998+ */
18999+ jtable = &tree->jhash_table;
19000+ if (jtable->_table) {
19001+ for_all_in_htable(jtable, j, node, next) {
19002+ assert("nikita-2361", !atomic_read(&node->x_count));
19003+ jdrop(node);
19004+ }
19005+
19006+ j_hash_done(&tree->jhash_table);
19007+ }
19008+ return 0;
19009+}
19010+
19011+/**
19012+ * init_jnodes - create jnode cache
19013+ *
19014+ * Initializes slab cache jnodes. It is part of reiser4 module initialization.
19015+ */
19016+int init_jnodes(void)
19017+{
19018+ assert("umka-168", _jnode_slab == NULL);
19019+
19020+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
19021+ SLAB_HWCACHE_ALIGN |
19022+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
19023+ if (_jnode_slab == NULL)
19024+ return RETERR(-ENOMEM);
19025+
19026+ return 0;
19027+}
19028+
19029+/**
19030+ * done_znodes - delete znode cache
19031+ *
19032+ * This is called on reiser4 module unloading or system shutdown.
19033+ */
19034+void done_jnodes(void)
19035+{
19036+ destroy_reiser4_cache(&_jnode_slab);
19037+}
19038+
19039+/* Initialize a jnode. */
19040+void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
19041+{
19042+ assert("umka-175", node != NULL);
19043+
19044+ memset(node, 0, sizeof(jnode));
19045+ ON_DEBUG(node->magic = JMAGIC);
19046+ jnode_set_type(node, type);
19047+ atomic_set(&node->d_count, 0);
19048+ atomic_set(&node->x_count, 0);
19049+ spin_lock_init(&node->guard);
19050+ spin_lock_init(&node->load);
19051+ node->atom = NULL;
19052+ node->tree = tree;
19053+ INIT_LIST_HEAD(&node->capture_link);
19054+
19055+ ASSIGN_NODE_LIST(node, NOT_CAPTURED);
19056+
19057+ INIT_RCU_HEAD(&node->rcu);
19058+
19059+#if REISER4_DEBUG
19060+ {
19061+ reiser4_super_info_data *sbinfo;
19062+
19063+ sbinfo = get_super_private(tree->super);
19064+ spin_lock_irq(&sbinfo->all_guard);
19065+ list_add(&node->jnodes, &sbinfo->all_jnodes);
19066+ spin_unlock_irq(&sbinfo->all_guard);
19067+ }
19068+#endif
19069+}
19070+
19071+#if REISER4_DEBUG
19072+/*
19073+ * Remove jnode from ->all_jnodes list.
19074+ */
19075+static void jnode_done(jnode * node, reiser4_tree * tree)
19076+{
19077+ reiser4_super_info_data *sbinfo;
19078+
19079+ sbinfo = get_super_private(tree->super);
19080+
19081+ spin_lock_irq(&sbinfo->all_guard);
19082+ assert("nikita-2422", !list_empty(&node->jnodes));
19083+ list_del_init(&node->jnodes);
19084+ spin_unlock_irq(&sbinfo->all_guard);
19085+}
19086+#endif
19087+
19088+/* return already existing jnode of page */
19089+jnode *jnode_by_page(struct page *pg)
19090+{
19091+ assert("nikita-2066", pg != NULL);
19092+ assert("nikita-2400", PageLocked(pg));
19093+ assert("nikita-2068", PagePrivate(pg));
19094+ assert("nikita-2067", jprivate(pg) != NULL);
19095+ return jprivate(pg);
19096+}
19097+
19098+/* exported functions to allocate/free jnode objects outside this file */
19099+jnode *jalloc(void)
19100+{
19101+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
19102+ return jal;
19103+}
19104+
19105+/* return jnode back to the slab allocator */
19106+inline void jfree(jnode * node)
19107+{
19108+ assert("zam-449", node != NULL);
19109+
19110+ assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
19111+ NODE_LIST(node) == NOT_CAPTURED));
19112+ assert("nikita-3222", list_empty(&node->jnodes));
19113+ assert("nikita-3221", jnode_page(node) == NULL);
19114+
19115+ /* not yet phash_jnode_destroy(node); */
19116+
19117+ kmem_cache_free(_jnode_slab, node);
19118+}
19119+
19120+/*
19121+ * This function is supplied as RCU callback. It actually frees jnode when
19122+ * last reference to it is gone.
19123+ */
19124+static void jnode_free_actor(struct rcu_head *head)
19125+{
19126+ jnode *node;
19127+ jnode_type jtype;
19128+
19129+ node = container_of(head, jnode, rcu);
19130+ jtype = jnode_get_type(node);
19131+
19132+ ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
19133+
19134+ switch (jtype) {
19135+ case JNODE_IO_HEAD:
19136+ case JNODE_BITMAP:
19137+ case JNODE_UNFORMATTED_BLOCK:
19138+ jfree(node);
19139+ break;
19140+ case JNODE_FORMATTED_BLOCK:
19141+ zfree(JZNODE(node));
19142+ break;
19143+ case JNODE_INODE:
19144+ default:
19145+ wrong_return_value("nikita-3197", "Wrong jnode type");
19146+ }
19147+}
19148+
19149+/*
19150+ * Free a jnode. Post a callback to be executed later through RCU when all
19151+ * references to @node are released.
19152+ */
19153+static inline void jnode_free(jnode * node, jnode_type jtype)
19154+{
19155+ if (jtype != JNODE_INODE) {
19156+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */
19157+ call_rcu(&node->rcu, jnode_free_actor);
19158+ } else
19159+ jnode_list_remove(node);
19160+}
19161+
19162+/* allocate new unformatted jnode */
19163+static jnode *jnew_unformatted(void)
19164+{
19165+ jnode *jal;
19166+
19167+ jal = jalloc();
19168+ if (jal == NULL)
19169+ return NULL;
19170+
19171+ jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
19172+ jal->key.j.mapping = NULL;
19173+ jal->key.j.index = (unsigned long)-1;
19174+ jal->key.j.objectid = 0;
19175+ return jal;
19176+}
19177+
19178+/* look for jnode with given mapping and offset within hash table */
19179+jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
19180+{
19181+ jnode_key_t jkey;
19182+ jnode *node;
19183+
19184+ assert("nikita-2353", tree != NULL);
19185+
19186+ jkey.objectid = objectid;
19187+ jkey.index = index;
19188+
19189+ /*
19190+ * hash table is _not_ protected by any lock during lookups. All we
19191+ * have to do is to disable preemption to keep RCU happy.
19192+ */
19193+
19194+ rcu_read_lock();
19195+ node = j_hash_find(&tree->jhash_table, &jkey);
19196+ if (node != NULL) {
19197+ /* protect @node from recycling */
19198+ jref(node);
19199+ assert("nikita-2955", jnode_invariant(node, 0, 0));
19200+ node = jnode_rip_check(tree, node);
19201+ }
19202+ rcu_read_unlock();
19203+ return node;
19204+}
19205+
19206+/* per inode radix tree of jnodes is protected by tree's read write spin lock */
19207+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
19208+{
19209+ assert("vs-1694", mapping->host != NULL);
19210+
19211+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
19212+}
19213+
19214+jnode *jfind(struct address_space * mapping, unsigned long index)
19215+{
19216+ reiser4_tree *tree;
19217+ jnode *node;
19218+
19219+ assert("vs-1694", mapping->host != NULL);
19220+ tree = reiser4_tree_by_inode(mapping->host);
19221+
19222+ read_lock_tree(tree);
19223+ node = jfind_nolock(mapping, index);
19224+ if (node != NULL)
19225+ jref(node);
19226+ read_unlock_tree(tree);
19227+ return node;
19228+}
19229+
19230+static void inode_attach_jnode(jnode * node)
19231+{
19232+ struct inode *inode;
19233+ reiser4_inode *info;
19234+ struct radix_tree_root *rtree;
19235+
19236+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19237+ assert("zam-1043", node->key.j.mapping != NULL);
19238+ inode = node->key.j.mapping->host;
19239+ info = reiser4_inode_data(inode);
19240+ rtree = jnode_tree_by_reiser4_inode(info);
19241+ if (rtree->rnode == NULL) {
19242+ /* prevent inode from being pruned when it has jnodes attached
19243+ to it */
19244+ write_lock_irq(&inode->i_data.tree_lock);
19245+ inode->i_data.nrpages++;
19246+ write_unlock_irq(&inode->i_data.tree_lock);
19247+ }
19248+ assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
19249+ check_me("zam-1045",
19250+ !radix_tree_insert(rtree, node->key.j.index, node));
19251+ ON_DEBUG(info->nr_jnodes++);
19252+}
19253+
19254+static void inode_detach_jnode(jnode * node)
19255+{
19256+ struct inode *inode;
19257+ reiser4_inode *info;
19258+ struct radix_tree_root *rtree;
19259+
19260+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19261+ assert("zam-1044", node->key.j.mapping != NULL);
19262+ inode = node->key.j.mapping->host;
19263+ info = reiser4_inode_data(inode);
19264+ rtree = jnode_tree_by_reiser4_inode(info);
19265+
19266+ assert("zam-1051", info->nr_jnodes != 0);
19267+ assert("zam-1052", rtree->rnode != NULL);
19268+ ON_DEBUG(info->nr_jnodes--);
19269+
19270+ /* delete jnode from inode's radix tree of jnodes */
19271+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
19272+ if (rtree->rnode == NULL) {
19273+ /* inode can be pruned now */
19274+ write_lock_irq(&inode->i_data.tree_lock);
19275+ inode->i_data.nrpages--;
19276+ write_unlock_irq(&inode->i_data.tree_lock);
19277+ }
19278+}
19279+
19280+/* put jnode into hash table (where they can be found by flush who does not know
19281+ mapping) and to inode's tree of jnodes (where they can be found (hopefully
19282+ faster) in places where mapping is known). Currently it is used by
19283+ fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
19284+ created */
19285+static void
19286+hash_unformatted_jnode(jnode * node, struct address_space *mapping,
19287+ unsigned long index)
19288+{
19289+ j_hash_table *jtable;
19290+
19291+ assert("vs-1446", jnode_is_unformatted(node));
19292+ assert("vs-1442", node->key.j.mapping == 0);
19293+ assert("vs-1443", node->key.j.objectid == 0);
19294+ assert("vs-1444", node->key.j.index == (unsigned long)-1);
19295+ assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19296+
19297+ node->key.j.mapping = mapping;
19298+ node->key.j.objectid = get_inode_oid(mapping->host);
19299+ node->key.j.index = index;
19300+
19301+ jtable = &jnode_get_tree(node)->jhash_table;
19302+
19303+ /* race with some other thread inserting jnode into the hash table is
19304+ * impossible, because we keep the page lock. */
19305+ /*
19306+ * following assertion no longer holds because of RCU: it is possible
19307+ * jnode is in the hash table, but with JNODE_RIP bit set.
19308+ */
19309+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
19310+ j_hash_insert_rcu(jtable, node);
19311+ inode_attach_jnode(node);
19312+}
19313+
19314+static void unhash_unformatted_node_nolock(jnode * node)
19315+{
19316+ assert("vs-1683", node->key.j.mapping != NULL);
19317+ assert("vs-1684",
19318+ node->key.j.objectid ==
19319+ get_inode_oid(node->key.j.mapping->host));
19320+
19321+ /* remove jnode from hash-table */
19322+ j_hash_remove_rcu(&node->tree->jhash_table, node);
19323+ inode_detach_jnode(node);
19324+ node->key.j.mapping = NULL;
19325+ node->key.j.index = (unsigned long)-1;
19326+ node->key.j.objectid = 0;
19327+
19328+}
19329+
19330+/* remove jnode from hash table and from inode's tree of jnodes. This is used in
19331+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
19332+ reiser4_uncapture_jnode */
19333+void unhash_unformatted_jnode(jnode * node)
19334+{
19335+ assert("vs-1445", jnode_is_unformatted(node));
19336+
19337+ write_lock_tree(node->tree);
19338+ unhash_unformatted_node_nolock(node);
19339+ write_unlock_tree(node->tree);
19340+}
19341+
19342+/*
19343+ * search hash table for a jnode with given oid and index. If not found,
19344+ * allocate new jnode, insert it, and also insert into radix tree for the
19345+ * given inode/mapping.
19346+ */
19347+static jnode *find_get_jnode(reiser4_tree * tree,
19348+ struct address_space *mapping,
19349+ oid_t oid, unsigned long index)
19350+{
19351+ jnode *result;
19352+ jnode *shadow;
19353+ int preload;
19354+
19355+ result = jnew_unformatted();
19356+
19357+ if (unlikely(result == NULL))
19358+ return ERR_PTR(RETERR(-ENOMEM));
19359+
19360+ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
19361+ if (preload != 0)
19362+ return ERR_PTR(preload);
19363+
19364+ write_lock_tree(tree);
19365+ shadow = jfind_nolock(mapping, index);
19366+ if (likely(shadow == NULL)) {
19367+ /* add new jnode to hash table and inode's radix tree of jnodes */
19368+ jref(result);
19369+ hash_unformatted_jnode(result, mapping, index);
19370+ } else {
19371+ /* jnode is found in inode's radix tree of jnodes */
19372+ jref(shadow);
19373+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19374+ assert("vs-1498", shadow->key.j.mapping == mapping);
19375+ result = shadow;
19376+ }
19377+ write_unlock_tree(tree);
19378+
19379+ assert("nikita-2955",
19380+ ergo(result != NULL, jnode_invariant(result, 0, 0)));
19381+ radix_tree_preload_end();
19382+ return result;
19383+}
19384+
19385+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
19386+ creates) jnode corresponding to page @pg. jnode is attached to page and
19387+ inserted into jnode hash-table. */
19388+static jnode *do_jget(reiser4_tree * tree, struct page *pg)
19389+{
19390+ /*
19391+ * There are two ways to create jnode: starting with pre-existing page
19392+ * and without page.
19393+ *
19394+ * When page already exists, jnode is created
19395+ * (jnode_of_page()->do_jget()) under page lock. This is done in
19396+ * ->writepage(), or when capturing anonymous page dirtied through
19397+ * mmap.
19398+ *
19399+ * Jnode without page is created by index_extent_jnode().
19400+ *
19401+ */
19402+
19403+ jnode *result;
19404+ oid_t oid = get_inode_oid(pg->mapping->host);
19405+
19406+ assert("umka-176", pg != NULL);
19407+ assert("nikita-2394", PageLocked(pg));
19408+
19409+ result = jprivate(pg);
19410+ if (likely(result != NULL))
19411+ return jref(result);
19412+
19413+ tree = reiser4_tree_by_page(pg);
19414+
19415+ /* check hash-table first */
19416+ result = jfind(pg->mapping, pg->index);
19417+ if (unlikely(result != NULL)) {
19418+ spin_lock_jnode(result);
19419+ jnode_attach_page(result, pg);
19420+ spin_unlock_jnode(result);
19421+ result->key.j.mapping = pg->mapping;
19422+ return result;
19423+ }
19424+
19425+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */
19426+ reiser4_ctx_gfp_mask_force(GFP_NOFS);
19427+ result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19428+ if (unlikely(IS_ERR(result)))
19429+ return result;
19430+ /* attach jnode to page */
19431+ spin_lock_jnode(result);
19432+ jnode_attach_page(result, pg);
19433+ spin_unlock_jnode(result);
19434+ return result;
19435+}
19436+
19437+/*
19438+ * return jnode for @pg, creating it if necessary.
19439+ */
19440+jnode *jnode_of_page(struct page * pg)
19441+{
19442+ jnode *result;
19443+
19444+ assert("umka-176", pg != NULL);
19445+ assert("nikita-2394", PageLocked(pg));
19446+
19447+ result = do_jget(reiser4_tree_by_page(pg), pg);
19448+
19449+ if (REISER4_DEBUG && !IS_ERR(result)) {
19450+ assert("nikita-3210", result == jprivate(pg));
19451+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19452+ if (jnode_is_unformatted(jprivate(pg))) {
19453+ assert("nikita-2364",
19454+ jprivate(pg)->key.j.index == pg->index);
19455+ assert("nikita-2367",
19456+ jprivate(pg)->key.j.mapping == pg->mapping);
19457+ assert("nikita-2365",
19458+ jprivate(pg)->key.j.objectid ==
19459+ get_inode_oid(pg->mapping->host));
19460+ assert("vs-1200",
19461+ jprivate(pg)->key.j.objectid ==
19462+ pg->mapping->host->i_ino);
19463+ assert("nikita-2356",
19464+ jnode_is_unformatted(jnode_by_page(pg)));
19465+ }
19466+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19467+ }
19468+ return result;
19469+}
19470+
19471+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19472+ * page.*/
19473+void jnode_attach_page(jnode * node, struct page *pg)
19474+{
19475+ assert("nikita-2060", node != NULL);
19476+ assert("nikita-2061", pg != NULL);
19477+
19478+ assert("nikita-2050", jprivate(pg) == 0ul);
19479+ assert("nikita-2393", !PagePrivate(pg));
19480+ assert("vs-1741", node->pg == NULL);
19481+
19482+ assert("nikita-2396", PageLocked(pg));
19483+ assert_spin_locked(&(node->guard));
19484+
19485+ page_cache_get(pg);
19486+ set_page_private(pg, (unsigned long)node);
19487+ node->pg = pg;
19488+ SetPagePrivate(pg);
19489+}
19490+
19491+/* Dual to jnode_attach_page: break a binding between page and jnode */
19492+void page_clear_jnode(struct page *page, jnode * node)
19493+{
19494+ assert("nikita-2424", page != NULL);
19495+ assert("nikita-2425", PageLocked(page));
19496+ assert("nikita-2426", node != NULL);
19497+ assert_spin_locked(&(node->guard));
19498+ assert("nikita-2428", PagePrivate(page));
19499+
19500+ assert("nikita-3551", !PageWriteback(page));
19501+
19502+ JF_CLR(node, JNODE_PARSED);
19503+ set_page_private(page, 0ul);
19504+ ClearPagePrivate(page);
19505+ node->pg = NULL;
19506+ page_cache_release(page);
19507+}
19508+
19509+#if 0
19510+/* it is only used in one place to handle error */
19511+void
19512+page_detach_jnode(struct page *page, struct address_space *mapping,
19513+ unsigned long index)
19514+{
19515+ assert("nikita-2395", page != NULL);
19516+
19517+ lock_page(page);
19518+ if ((page->mapping == mapping) && (page->index == index)
19519+ && PagePrivate(page)) {
19520+ jnode *node;
19521+
19522+ node = jprivate(page);
19523+ spin_lock_jnode(node);
19524+ page_clear_jnode(page, node);
19525+ spin_unlock_jnode(node);
19526+ }
19527+ unlock_page(page);
19528+}
19529+#endif /* 0 */
19530+
19531+/* return @node page locked.
19532+
19533+ Locking ordering requires that one first takes page lock and afterwards
19534+ spin lock on node attached to this page. Sometimes it is necessary to go in
19535+ the opposite direction. This is done through standard trylock-and-release
19536+ loop.
19537+*/
19538+static struct page *jnode_lock_page(jnode * node)
19539+{
19540+ struct page *page;
19541+
19542+ assert("nikita-2052", node != NULL);
19543+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19544+
19545+ while (1) {
19546+
19547+ spin_lock_jnode(node);
19548+ page = jnode_page(node);
19549+ if (page == NULL) {
19550+ break;
19551+ }
19552+
19553+ /* no need to page_cache_get( page ) here, because page cannot
19554+ be evicted from memory without detaching it from jnode and
19555+ this requires spin lock on jnode that we already hold.
19556+ */
19557+ if (!TestSetPageLocked(page)) {
19558+ /* We won a lock on jnode page, proceed. */
19559+ break;
19560+ }
19561+
19562+ /* Page is locked by someone else. */
19563+ page_cache_get(page);
19564+ spin_unlock_jnode(node);
19565+ wait_on_page_locked(page);
19566+ /* it is possible that page was detached from jnode and
19567+ returned to the free pool, or re-assigned while we were
19568+ waiting on locked bit. This will be rechecked on the next
19569+ loop iteration.
19570+ */
19571+ page_cache_release(page);
19572+
19573+ /* try again */
19574+ }
19575+ return page;
19576+}
19577+
19578+/*
19579+ * is JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify
19580+ * validness of jnode content.
19581+ */
19582+static inline int jparse(jnode * node)
19583+{
19584+ int result;
19585+
19586+ assert("nikita-2466", node != NULL);
19587+
19588+ spin_lock_jnode(node);
19589+ if (likely(!jnode_is_parsed(node))) {
19590+ result = jnode_ops(node)->parse(node);
19591+ if (likely(result == 0))
19592+ JF_SET(node, JNODE_PARSED);
19593+ } else
19594+ result = 0;
19595+ spin_unlock_jnode(node);
19596+ return result;
19597+}
19598+
19599+/* Lock a page attached to jnode, create and attach page to jnode if it had no
19600+ * one. */
19601+static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19602+{
19603+ struct page *page;
19604+
19605+ spin_lock_jnode(node);
19606+ page = jnode_page(node);
19607+
19608+ if (page == NULL) {
19609+ spin_unlock_jnode(node);
19610+ page = find_or_create_page(jnode_get_mapping(node),
19611+ jnode_get_index(node), gfp_flags);
19612+ if (page == NULL)
19613+ return ERR_PTR(RETERR(-ENOMEM));
19614+ } else {
19615+ if (!TestSetPageLocked(page)) {
19616+ spin_unlock_jnode(node);
19617+ return page;
19618+ }
19619+ page_cache_get(page);
19620+ spin_unlock_jnode(node);
19621+ lock_page(page);
19622+ assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19623+ }
19624+
19625+ spin_lock_jnode(node);
19626+ if (!jnode_page(node))
19627+ jnode_attach_page(node, page);
19628+ spin_unlock_jnode(node);
19629+
19630+ page_cache_release(page);
19631+ assert("zam-894", jnode_page(node) == page);
19632+ return page;
19633+}
19634+
19635+/* Start read operation for jnode's page if page is not up-to-date. */
19636+static int jnode_start_read(jnode * node, struct page *page)
19637+{
19638+ assert("zam-893", PageLocked(page));
19639+
19640+ if (PageUptodate(page)) {
19641+ unlock_page(page);
19642+ return 0;
19643+ }
19644+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19645+}
19646+
19647+#if REISER4_DEBUG
19648+static void check_jload(jnode * node, struct page *page)
19649+{
19650+ if (jnode_is_znode(node)) {
19651+ node40_header *nh;
19652+ znode *z;
19653+
19654+ z = JZNODE(node);
19655+ if (znode_is_any_locked(z)) {
19656+ nh = (node40_header *) kmap(page);
19657+ /* this only works for node40-only file systems. For
19658+ * debugging. */
19659+ assert("nikita-3253",
19660+ z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19661+ kunmap(page);
19662+ }
19663+ assert("nikita-3565", znode_invariant(z));
19664+ }
19665+}
19666+#else
19667+#define check_jload(node, page) noop
19668+#endif
19669+
19670+/* prefetch jnode to speed up next call to jload. Call this when you are going
19671+ * to call jload() shortly. This will bring appropriate portion of jnode into
19672+ * CPU cache. */
19673+void jload_prefetch(jnode * node)
19674+{
19675+ prefetchw(&node->x_count);
19676+}
19677+
19678+/* load jnode's data into memory */
19679+int jload_gfp(jnode * node /* node to load */ ,
19680+ gfp_t gfp_flags /* allocation flags */ ,
19681+ int do_kmap /* true if page should be kmapped */ )
19682+{
19683+ struct page *page;
19684+ int result = 0;
19685+ int parsed;
19686+
19687+ assert("nikita-3010", reiser4_schedulable());
19688+
19689+ prefetchw(&node->pg);
19690+
19691+ /* taking d-reference implies taking x-reference. */
19692+ jref(node);
19693+
19694+ /*
19695+ * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19696+ * should be atomic, otherwise there is a race against
19697+ * reiser4_releasepage().
19698+ */
19699+ spin_lock(&(node->load));
19700+ add_d_ref(node);
19701+ parsed = jnode_is_parsed(node);
19702+ spin_unlock(&(node->load));
19703+
19704+ if (unlikely(!parsed)) {
19705+ page = jnode_get_page_locked(node, gfp_flags);
19706+ if (unlikely(IS_ERR(page))) {
19707+ result = PTR_ERR(page);
19708+ goto failed;
19709+ }
19710+
19711+ result = jnode_start_read(node, page);
19712+ if (unlikely(result != 0))
19713+ goto failed;
19714+
19715+ wait_on_page_locked(page);
19716+ if (unlikely(!PageUptodate(page))) {
19717+ result = RETERR(-EIO);
19718+ goto failed;
19719+ }
19720+
19721+ if (do_kmap)
19722+ node->data = kmap(page);
19723+
19724+ result = jparse(node);
19725+ if (unlikely(result != 0)) {
19726+ if (do_kmap)
19727+ kunmap(page);
19728+ goto failed;
19729+ }
19730+ check_jload(node, page);
19731+ } else {
19732+ page = jnode_page(node);
19733+ check_jload(node, page);
19734+ if (do_kmap)
19735+ node->data = kmap(page);
19736+ }
19737+
19738+ if (!is_writeout_mode())
19739+ /* We do not mark pages active if jload is called as a part of
19740+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19741+ * and write_logs() add no value to cached data, there is no
19742+ * sense to mark pages as active when they go to disk, it just
19743+ * confuses vm scanning routines because clean page could be
19744+ * moved out from inactive list as a result of this
19745+ * mark_page_accessed() call. */
19746+ mark_page_accessed(page);
19747+
19748+ return 0;
19749+
19750+ failed:
19751+ jrelse_tail(node);
19752+ return result;
19753+
19754+}
19755+
19756+/* start asynchronous reading for given jnode's page. */
19757+int jstartio(jnode * node)
19758+{
19759+ struct page *page;
19760+
19761+ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19762+ if (IS_ERR(page))
19763+ return PTR_ERR(page);
19764+
19765+ return jnode_start_read(node, page);
19766+}
19767+
19768+/* Initialize a node by calling appropriate plugin instead of reading
19769+ * node from disk as in jload(). */
19770+int jinit_new(jnode * node, gfp_t gfp_flags)
19771+{
19772+ struct page *page;
19773+ int result;
19774+
19775+ jref(node);
19776+ add_d_ref(node);
19777+
19778+ page = jnode_get_page_locked(node, gfp_flags);
19779+ if (IS_ERR(page)) {
19780+ result = PTR_ERR(page);
19781+ goto failed;
19782+ }
19783+
19784+ SetPageUptodate(page);
19785+ unlock_page(page);
19786+
19787+ node->data = kmap(page);
19788+
19789+ if (!jnode_is_parsed(node)) {
19790+ jnode_plugin *jplug = jnode_ops(node);
19791+ spin_lock_jnode(node);
19792+ result = jplug->init(node);
19793+ spin_unlock_jnode(node);
19794+ if (result) {
19795+ kunmap(page);
19796+ goto failed;
19797+ }
19798+ JF_SET(node, JNODE_PARSED);
19799+ }
19800+
19801+ return 0;
19802+
19803+ failed:
19804+ jrelse(node);
19805+ return result;
19806+}
19807+
19808+/* release a reference to jnode acquired by jload(), decrement ->d_count */
19809+void jrelse_tail(jnode * node /* jnode to release references to */ )
19810+{
19811+ assert("nikita-489", atomic_read(&node->d_count) > 0);
19812+ atomic_dec(&node->d_count);
19813+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
19814+ LOCK_CNT_DEC(d_refs);
19815+ /* release reference acquired in jload_gfp() or jinit_new() */
19816+ jput(node);
19817+}
19818+
19819+/* drop reference to node data. When last reference is dropped, data are
19820+ unloaded. */
19821+void jrelse(jnode * node /* jnode to release references to */ )
19822+{
19823+ struct page *page;
19824+
19825+ assert("nikita-487", node != NULL);
19826+ assert_spin_not_locked(&(node->guard));
19827+
19828+ page = jnode_page(node);
19829+ if (likely(page != NULL)) {
19830+ /*
19831+ * it is safe not to lock jnode here, because at this point
19832+ * @node->d_count is greater than zero (if jrelse() is used
19833+ * correctly, that is). JNODE_PARSED may be not set yet, if,
19834+ * for example, we got here as a result of error handling path
19835+ * in jload(). Anyway, page cannot be detached by
19836+ * reiser4_releasepage(). truncate will invalidate page
19837+ * regardless, but this should not be a problem.
19838+ */
19839+ kunmap(page);
19840+ }
19841+ jrelse_tail(node);
19842+}
19843+
19844+/* called from jput() to wait for io completion */
19845+static void jnode_finish_io(jnode * node)
19846+{
19847+ struct page *page;
19848+
19849+ assert("nikita-2922", node != NULL);
19850+
19851+ spin_lock_jnode(node);
19852+ page = jnode_page(node);
19853+ if (page != NULL) {
19854+ page_cache_get(page);
19855+ spin_unlock_jnode(node);
19856+ wait_on_page_writeback(page);
19857+ page_cache_release(page);
19858+ } else
19859+ spin_unlock_jnode(node);
19860+}
19861+
19862+/*
19863+ * This is called by jput() when last reference to jnode is released. This is
19864+ * separate function, because we want fast path of jput() to be inline and,
19865+ * therefore, small.
19866+ */
19867+void jput_final(jnode * node)
19868+{
19869+ int r_i_p;
19870+
19871+ /* A fast check for keeping node in cache. We always keep node in cache
19872+ * if its page is present and node was not marked for deletion */
19873+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19874+ rcu_read_unlock();
19875+ return;
19876+ }
19877+ assert("edward-1432", node->page_count == 0);
19878+
19879+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19880+ /*
19881+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19882+ * this case it is safe to access node after unlock.
19883+ */
19884+ rcu_read_unlock();
19885+ if (r_i_p) {
19886+ jnode_finish_io(node);
19887+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19888+ /* node is removed from the tree. */
19889+ jdelete(node);
19890+ else
19891+ jnode_try_drop(node);
19892+ }
19893+ /* if !r_i_p some other thread is already killing it */
19894+}
19895+
19896+int jwait_io(jnode * node, int rw)
19897+{
19898+ struct page *page;
19899+ int result;
19900+
19901+ assert("zam-447", node != NULL);
19902+ assert("zam-448", jnode_page(node) != NULL);
19903+
19904+ page = jnode_page(node);
19905+
19906+ result = 0;
19907+ if (rw == READ) {
19908+ wait_on_page_locked(page);
19909+ } else {
19910+ assert("nikita-2227", rw == WRITE);
19911+ wait_on_page_writeback(page);
19912+ }
19913+ if (PageError(page))
19914+ result = RETERR(-EIO);
19915+
19916+ return result;
19917+}
19918+
19919+/*
19920+ * jnode types and plugins.
19921+ *
19922+ * jnode by itself is a "base type". There are several different jnode
19923+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19924+ * has to do different things based on jnode type. In the standard reiser4 way
19925+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19926+ *
19927+ * Functions below deal with jnode types and define methods of jnode plugin.
19928+ *
19929+ */
19930+
19931+/* set jnode type. This is done during jnode initialization. */
19932+static void jnode_set_type(jnode * node, jnode_type type)
19933+{
19934+ static unsigned long type_to_mask[] = {
19935+ [JNODE_UNFORMATTED_BLOCK] = 1,
19936+ [JNODE_FORMATTED_BLOCK] = 0,
19937+ [JNODE_BITMAP] = 2,
19938+ [JNODE_IO_HEAD] = 6,
19939+ [JNODE_INODE] = 4
19940+ };
19941+
19942+ assert("zam-647", type < LAST_JNODE_TYPE);
19943+ assert("nikita-2815", !jnode_is_loaded(node));
19944+ assert("nikita-3386", node->state == 0);
19945+
19946+ node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19947+}
19948+
19949+/* ->init() method of jnode plugin for jnodes that don't require plugin
19950+ * specific initialization. */
19951+static int init_noinit(jnode * node UNUSED_ARG)
19952+{
19953+ return 0;
19954+}
19955+
19956+/* ->parse() method of jnode plugin for jnodes that don't require plugin
19957+ * specific pasring. */
19958+static int parse_noparse(jnode * node UNUSED_ARG)
19959+{
19960+ return 0;
19961+}
19962+
19963+/* ->mapping() method for unformatted jnode */
19964+struct address_space *mapping_jnode(const jnode * node)
19965+{
19966+ struct address_space *map;
19967+
19968+ assert("nikita-2713", node != NULL);
19969+
19970+ /* mapping is stored in jnode */
19971+
19972+ map = node->key.j.mapping;
19973+ assert("nikita-2714", map != NULL);
19974+ assert("nikita-2897", is_reiser4_inode(map->host));
19975+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19976+ return map;
19977+}
19978+
19979+/* ->index() method for unformatted jnodes */
19980+unsigned long index_jnode(const jnode * node)
19981+{
19982+ /* index is stored in jnode */
19983+ return node->key.j.index;
19984+}
19985+
19986+/* ->remove() method for unformatted jnodes */
19987+static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19988+{
19989+ /* remove jnode from hash table and radix tree */
19990+ if (node->key.j.mapping)
19991+ unhash_unformatted_node_nolock(node);
19992+}
19993+
19994+/* ->mapping() method for znodes */
19995+static struct address_space *mapping_znode(const jnode * node)
19996+{
19997+ /* all znodes belong to fake inode */
19998+ return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19999+}
20000+
20001+/* ->index() method for znodes */
20002+static unsigned long index_znode(const jnode * node)
20003+{
20004+ unsigned long addr;
20005+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
20006+
20007+ /* index of znode is just its address (shifted) */
20008+ addr = (unsigned long)node;
20009+ return (addr - PAGE_OFFSET) >> znode_shift_order;
20010+}
20011+
20012+/* ->mapping() method for bitmap jnode */
20013+static struct address_space *mapping_bitmap(const jnode * node)
20014+{
20015+ /* all bitmap blocks belong to special bitmap inode */
20016+ return get_super_private(jnode_get_tree(node)->super)->bitmap->
20017+ i_mapping;
20018+}
20019+
20020+/* ->index() method for jnodes that are indexed by address */
20021+static unsigned long index_is_address(const jnode * node)
20022+{
20023+ unsigned long ind;
20024+
20025+ ind = (unsigned long)node;
20026+ return ind - PAGE_OFFSET;
20027+}
20028+
20029+/* resolve race with jput */
20030+jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
20031+{
20032+ /*
20033+ * This is used as part of RCU-based jnode handling.
20034+ *
20035+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
20036+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
20037+ * not protected during this, so concurrent thread may execute
20038+ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
20039+ * freed in jput_final(). To avoid such races, jput_final() sets
20040+ * JNODE_RIP on jnode (under tree lock). All places that work with
20041+ * unreferenced jnodes call this function. It checks for JNODE_RIP bit
20042+ * (first without taking tree lock), and if this bit is set, released
20043+ * reference acquired by the current thread and returns NULL.
20044+ *
20045+ * As a result, if jnode is being concurrently freed, NULL is returned
20046+ * and caller should pretend that jnode wasn't found in the first
20047+ * place.
20048+ *
20049+ * Otherwise it's safe to release "rcu-read-lock" and continue with
20050+ * jnode.
20051+ */
20052+ if (unlikely(JF_ISSET(node, JNODE_RIP))) {
20053+ read_lock_tree(tree);
20054+ if (JF_ISSET(node, JNODE_RIP)) {
20055+ dec_x_ref(node);
20056+ node = NULL;
20057+ }
20058+ read_unlock_tree(tree);
20059+ }
20060+ return node;
20061+}
20062+
20063+reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
20064+{
20065+ struct inode *inode;
20066+ item_plugin *iplug;
20067+ loff_t off;
20068+
20069+ assert("nikita-3092", node != NULL);
20070+ assert("nikita-3093", key != NULL);
20071+ assert("nikita-3094", jnode_is_unformatted(node));
20072+
20073+ off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
20074+ inode = mapping_jnode(node)->host;
20075+
20076+ if (node->parent_item_id != 0)
20077+ iplug = item_plugin_by_id(node->parent_item_id);
20078+ else
20079+ iplug = NULL;
20080+
20081+ if (iplug != NULL && iplug->f.key_by_offset)
20082+ iplug->f.key_by_offset(inode, off, key);
20083+ else {
20084+ file_plugin *fplug;
20085+
20086+ fplug = inode_file_plugin(inode);
20087+ assert("zam-1007", fplug != NULL);
20088+ assert("zam-1008", fplug->key_by_inode != NULL);
20089+
20090+ fplug->key_by_inode(inode, off, key);
20091+ }
20092+
20093+ return key;
20094+}
20095+
20096+/* ->parse() method for formatted nodes */
20097+static int parse_znode(jnode * node)
20098+{
20099+ return zparse(JZNODE(node));
20100+}
20101+
20102+/* ->delete() method for formatted nodes */
20103+static void delete_znode(jnode * node, reiser4_tree * tree)
20104+{
20105+ znode *z;
20106+
20107+ assert_rw_write_locked(&(tree->tree_lock));
20108+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20109+
20110+ z = JZNODE(node);
20111+ assert("vs-899", z->c_count == 0);
20112+
20113+ /* delete znode from sibling list. */
20114+ sibling_list_remove(z);
20115+
20116+ znode_remove(z, tree);
20117+}
20118+
20119+/* ->remove() method for formatted nodes */
20120+static int remove_znode(jnode * node, reiser4_tree * tree)
20121+{
20122+ znode *z;
20123+
20124+ assert_rw_write_locked(&(tree->tree_lock));
20125+ z = JZNODE(node);
20126+
20127+ if (z->c_count == 0) {
20128+ /* detach znode from sibling list. */
20129+ sibling_list_drop(z);
20130+ /* this is called with tree spin-lock held, so call
20131+ znode_remove() directly (rather than znode_lock_remove()). */
20132+ znode_remove(z, tree);
20133+ return 0;
20134+ }
20135+ return RETERR(-EBUSY);
20136+}
20137+
20138+/* ->init() method for formatted nodes */
20139+static int init_znode(jnode * node)
20140+{
20141+ znode *z;
20142+
20143+ z = JZNODE(node);
20144+ /* call node plugin to do actual initialization */
20145+ return z->nplug->init(z);
20146+}
20147+
20148+/* ->clone() method for formatted nodes */
20149+static jnode *clone_formatted(jnode * node)
20150+{
20151+ znode *clone;
20152+
20153+ assert("vs-1430", jnode_is_znode(node));
20154+ clone = zalloc(reiser4_ctx_gfp_mask_get());
20155+ if (clone == NULL)
20156+ return ERR_PTR(RETERR(-ENOMEM));
20157+ zinit(clone, NULL, current_tree);
20158+ jnode_set_block(ZJNODE(clone), jnode_get_block(node));
20159+ /* ZJNODE(clone)->key.z is not initialized */
20160+ clone->level = JZNODE(node)->level;
20161+
20162+ return ZJNODE(clone);
20163+}
20164+
20165+/* jplug->clone for unformatted nodes */
20166+static jnode *clone_unformatted(jnode * node)
20167+{
20168+ jnode *clone;
20169+
20170+ assert("vs-1431", jnode_is_unformatted(node));
20171+ clone = jalloc();
20172+ if (clone == NULL)
20173+ return ERR_PTR(RETERR(-ENOMEM));
20174+
20175+ jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
20176+ jnode_set_block(clone, jnode_get_block(node));
20177+
20178+ return clone;
20179+
20180+}
20181+
20182+/*
20183+ * Setup jnode plugin methods for various jnode types.
20184+ */
20185+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
20186+ [JNODE_UNFORMATTED_BLOCK] = {
20187+ .h = {
20188+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20189+ .id = JNODE_UNFORMATTED_BLOCK,
20190+ .pops = NULL,
20191+ .label = "unformatted",
20192+ .desc = "unformatted node",
20193+ .linkage = {NULL, NULL}
20194+ },
20195+ .init = init_noinit,
20196+ .parse = parse_noparse,
20197+ .mapping = mapping_jnode,
20198+ .index = index_jnode,
20199+ .clone = clone_unformatted
20200+ },
20201+ [JNODE_FORMATTED_BLOCK] = {
20202+ .h = {
20203+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20204+ .id = JNODE_FORMATTED_BLOCK,
20205+ .pops = NULL,
20206+ .label = "formatted",
20207+ .desc = "formatted tree node",
20208+ .linkage = {NULL, NULL}
20209+ },
20210+ .init = init_znode,
20211+ .parse = parse_znode,
20212+ .mapping = mapping_znode,
20213+ .index = index_znode,
20214+ .clone = clone_formatted
20215+ },
20216+ [JNODE_BITMAP] = {
20217+ .h = {
20218+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20219+ .id = JNODE_BITMAP,
20220+ .pops = NULL,
20221+ .label = "bitmap",
20222+ .desc = "bitmap node",
20223+ .linkage = {NULL, NULL}
20224+ },
20225+ .init = init_noinit,
20226+ .parse = parse_noparse,
20227+ .mapping = mapping_bitmap,
20228+ .index = index_is_address,
20229+ .clone = NULL
20230+ },
20231+ [JNODE_IO_HEAD] = {
20232+ .h = {
20233+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20234+ .id = JNODE_IO_HEAD,
20235+ .pops = NULL,
20236+ .label = "io head",
20237+ .desc = "io head",
20238+ .linkage = {NULL, NULL}
20239+ },
20240+ .init = init_noinit,
20241+ .parse = parse_noparse,
20242+ .mapping = mapping_bitmap,
20243+ .index = index_is_address,
20244+ .clone = NULL
20245+ },
20246+ [JNODE_INODE] = {
20247+ .h = {
20248+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
20249+ .id = JNODE_INODE,
20250+ .pops = NULL,
20251+ .label = "inode",
20252+ .desc = "inode's builtin jnode",
20253+ .linkage = {NULL, NULL}
20254+ },
20255+ .init = NULL,
20256+ .parse = NULL,
20257+ .mapping = NULL,
20258+ .index = NULL,
20259+ .clone = NULL
20260+ }
20261+};
20262+
20263+/*
20264+ * jnode destruction.
20265+ *
20266+ * Thread may use a jnode after it acquired a reference to it. References are
20267+ * counted in ->x_count field. Reference protects jnode from being
20268+ * recycled. This is different from protecting jnode data (that are stored in
20269+ * jnode page) from being evicted from memory. Data are protected by jload()
20270+ * and released by jrelse().
20271+ *
20272+ * If thread already possesses a reference to the jnode it can acquire another
20273+ * one through jref(). Initial reference is obtained (usually) by locating
20274+ * jnode in some indexing structure that depends on jnode type: formatted
20275+ * nodes are kept in global hash table, where they are indexed by block
20276+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
20277+ * table, which is indexed by oid and offset within file, and in per-inode
20278+ * radix tree.
20279+ *
20280+ * Reference to jnode is released by jput(). If last reference is released,
20281+ * jput_final() is called. This function determines whether jnode has to be
20282+ * deleted (this happens when corresponding node is removed from the file
20283+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
20284+ * should be just "removed" (deleted from memory).
20285+ *
20286+ * Jnode destruction is signally delicate dance because of locking and RCU.
20287+ */
20288+
20289+/*
20290+ * Returns true if jnode cannot be removed right now. This check is called
20291+ * under tree lock. If it returns true, jnode is irrevocably committed to be
20292+ * deleted/removed.
20293+ */
20294+static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
20295+{
20296+ /* if other thread managed to acquire a reference to this jnode, don't
20297+ * free it. */
20298+ if (atomic_read(&node->x_count) > 0)
20299+ return 1;
20300+ /* also, don't free znode that has children in memory */
20301+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
20302+ return 1;
20303+ return 0;
20304+}
20305+
20306+/*
20307+ * this is called as part of removing jnode. Based on jnode type, call
20308+ * corresponding function that removes jnode from indices and returns it back
20309+ * to the appropriate slab (through RCU).
20310+ */
20311+static inline void
20312+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
20313+{
20314+ switch (jtype) {
20315+ case JNODE_UNFORMATTED_BLOCK:
20316+ remove_jnode(node, tree);
20317+ break;
20318+ case JNODE_IO_HEAD:
20319+ case JNODE_BITMAP:
20320+ break;
20321+ case JNODE_INODE:
20322+ break;
20323+ case JNODE_FORMATTED_BLOCK:
20324+ remove_znode(node, tree);
20325+ break;
20326+ default:
20327+ wrong_return_value("nikita-3196", "Wrong jnode type");
20328+ }
20329+}
20330+
20331+/*
20332+ * this is called as part of deleting jnode. Based on jnode type, call
20333+ * corresponding function that removes jnode from indices and returns it back
20334+ * to the appropriate slab (through RCU).
20335+ *
20336+ * This differs from jnode_remove() only for formatted nodes---for them
20337+ * sibling list handling is different for removal and deletion.
20338+ */
20339+static inline void
20340+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
20341+{
20342+ switch (jtype) {
20343+ case JNODE_UNFORMATTED_BLOCK:
20344+ remove_jnode(node, tree);
20345+ break;
20346+ case JNODE_IO_HEAD:
20347+ case JNODE_BITMAP:
20348+ break;
20349+ case JNODE_FORMATTED_BLOCK:
20350+ delete_znode(node, tree);
20351+ break;
20352+ case JNODE_INODE:
20353+ default:
20354+ wrong_return_value("nikita-3195", "Wrong jnode type");
20355+ }
20356+}
20357+
20358+#if REISER4_DEBUG
20359+/*
20360+ * remove jnode from the debugging list of all jnodes hanging off super-block.
20361+ */
20362+void jnode_list_remove(jnode * node)
20363+{
20364+ reiser4_super_info_data *sbinfo;
20365+
20366+ sbinfo = get_super_private(jnode_get_tree(node)->super);
20367+
20368+ spin_lock_irq(&sbinfo->all_guard);
20369+ assert("nikita-2422", !list_empty(&node->jnodes));
20370+ list_del_init(&node->jnodes);
20371+ spin_unlock_irq(&sbinfo->all_guard);
20372+}
20373+#endif
20374+
20375+/*
20376+ * this is called by jput_final() to remove jnode when last reference to it is
20377+ * released.
20378+ */
20379+static int jnode_try_drop(jnode * node)
20380+{
20381+ int result;
20382+ reiser4_tree *tree;
20383+ jnode_type jtype;
20384+
20385+ assert("nikita-2491", node != NULL);
20386+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
20387+
20388+ tree = jnode_get_tree(node);
20389+ jtype = jnode_get_type(node);
20390+
20391+ spin_lock_jnode(node);
20392+ write_lock_tree(tree);
20393+ /*
20394+ * if jnode has a page---leave it alone. Memory pressure will
20395+ * eventually kill page and jnode.
20396+ */
20397+ if (jnode_page(node) != NULL) {
20398+ write_unlock_tree(tree);
20399+ spin_unlock_jnode(node);
20400+ JF_CLR(node, JNODE_RIP);
20401+ return RETERR(-EBUSY);
20402+ }
20403+
20404+ /* re-check ->x_count under tree lock. */
20405+ result = jnode_is_busy(node, jtype);
20406+ if (result == 0) {
20407+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20408+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
20409+
20410+ spin_unlock_jnode(node);
20411+ /* no page and no references---despatch him. */
20412+ jnode_remove(node, jtype, tree);
20413+ write_unlock_tree(tree);
20414+ jnode_free(node, jtype);
20415+ } else {
20416+ /* busy check failed: reference was acquired by concurrent
20417+ * thread. */
20418+ write_unlock_tree(tree);
20419+ spin_unlock_jnode(node);
20420+ JF_CLR(node, JNODE_RIP);
20421+ }
20422+ return result;
20423+}
20424+
20425+/* jdelete() -- Delete jnode from the tree and file system */
20426+static int jdelete(jnode * node /* jnode to finish with */ )
20427+{
20428+ struct page *page;
20429+ int result;
20430+ reiser4_tree *tree;
20431+ jnode_type jtype;
20432+
20433+ assert("nikita-467", node != NULL);
20434+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20435+
20436+ jtype = jnode_get_type(node);
20437+
20438+ page = jnode_lock_page(node);
20439+ assert_spin_locked(&(node->guard));
20440+
20441+ tree = jnode_get_tree(node);
20442+
20443+ write_lock_tree(tree);
20444+ /* re-check ->x_count under tree lock. */
20445+ result = jnode_is_busy(node, jtype);
20446+ if (likely(!result)) {
20447+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20448+ assert("jmacd-511", atomic_read(&node->d_count) == 0);
20449+
20450+ /* detach page */
20451+ if (page != NULL) {
20452+ /*
20453+ * FIXME this is racy against jnode_extent_write().
20454+ */
20455+ page_clear_jnode(page, node);
20456+ }
20457+ spin_unlock_jnode(node);
20458+ /* goodbye */
20459+ jnode_delete(node, jtype, tree);
20460+ write_unlock_tree(tree);
20461+ jnode_free(node, jtype);
20462+ /* @node is no longer valid pointer */
20463+ if (page != NULL)
20464+ reiser4_drop_page(page);
20465+ } else {
20466+ /* busy check failed: reference was acquired by concurrent
20467+ * thread. */
20468+ JF_CLR(node, JNODE_RIP);
20469+ write_unlock_tree(tree);
20470+ spin_unlock_jnode(node);
20471+ if (page != NULL)
20472+ unlock_page(page);
20473+ }
20474+ return result;
20475+}
20476+
20477+/* drop jnode on the floor.
20478+
20479+ Return value:
20480+
20481+ -EBUSY: failed to drop jnode, because there are still references to it
20482+
20483+ 0: successfully dropped jnode
20484+
20485+*/
20486+static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20487+{
20488+ struct page *page;
20489+ jnode_type jtype;
20490+ int result;
20491+
20492+ assert("zam-602", node != NULL);
20493+ assert_rw_not_read_locked(&(tree->tree_lock));
20494+ assert_rw_not_write_locked(&(tree->tree_lock));
20495+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20496+
20497+ jtype = jnode_get_type(node);
20498+
20499+ page = jnode_lock_page(node);
20500+ assert_spin_locked(&(node->guard));
20501+
20502+ write_lock_tree(tree);
20503+
20504+ /* re-check ->x_count under tree lock. */
20505+ result = jnode_is_busy(node, jtype);
20506+ if (!result) {
20507+ assert("nikita-2488", page == jnode_page(node));
20508+ assert("nikita-2533", atomic_read(&node->d_count) == 0);
20509+ if (page != NULL) {
20510+ assert("nikita-2126", !PageDirty(page));
20511+ assert("nikita-2127", PageUptodate(page));
20512+ assert("nikita-2181", PageLocked(page));
20513+ page_clear_jnode(page, node);
20514+ }
20515+ spin_unlock_jnode(node);
20516+ jnode_remove(node, jtype, tree);
20517+ write_unlock_tree(tree);
20518+ jnode_free(node, jtype);
20519+ if (page != NULL) {
20520+ reiser4_drop_page(page);
20521+ }
20522+ } else {
20523+ /* busy check failed: reference was acquired by concurrent
20524+ * thread. */
20525+ JF_CLR(node, JNODE_RIP);
20526+ write_unlock_tree(tree);
20527+ spin_unlock_jnode(node);
20528+ if (page != NULL)
20529+ unlock_page(page);
20530+ }
20531+ return result;
20532+}
20533+
20534+/* This function frees jnode "if possible". In particular, [dcx]_count has to
20535+ be 0 (where applicable). */
20536+void jdrop(jnode * node)
20537+{
20538+ jdrop_in_tree(node, jnode_get_tree(node));
20539+}
20540+
20541+/* IO head jnode implementation; The io heads are simple j-nodes with limited
20542+ functionality (these j-nodes are not in any hash table) just for reading
20543+ from and writing to disk. */
20544+
20545+jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20546+{
20547+ jnode *jal = jalloc();
20548+
20549+ if (jal != NULL) {
20550+ jnode_init(jal, current_tree, JNODE_IO_HEAD);
20551+ jnode_set_block(jal, block);
20552+ }
20553+
20554+ jref(jal);
20555+
20556+ return jal;
20557+}
20558+
20559+void reiser4_drop_io_head(jnode * node)
20560+{
20561+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20562+
20563+ jput(node);
20564+ jdrop(node);
20565+}
20566+
20567+/* protect keep jnode data from reiser4_releasepage() */
20568+void pin_jnode_data(jnode * node)
20569+{
20570+ assert("zam-671", jnode_page(node) != NULL);
20571+ page_cache_get(jnode_page(node));
20572+}
20573+
20574+/* make jnode data free-able again */
20575+void unpin_jnode_data(jnode * node)
20576+{
20577+ assert("zam-672", jnode_page(node) != NULL);
20578+ page_cache_release(jnode_page(node));
20579+}
20580+
20581+struct address_space *jnode_get_mapping(const jnode * node)
20582+{
20583+ assert("nikita-3162", node != NULL);
20584+ return jnode_ops(node)->mapping(node);
20585+}
20586+
20587+#if REISER4_DEBUG
20588+/* debugging aid: jnode invariant */
20589+int jnode_invariant_f(const jnode * node, char const **msg)
20590+{
20591+#define _ergo(ant, con) \
20592+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20593+#define _check(exp) ((*msg) = #exp, (exp))
20594+
20595+ return _check(node != NULL) &&
20596+ /* [jnode-queued] */
20597+ /* only relocated node can be queued, except that when znode
20598+ * is being deleted, its JNODE_RELOC bit is cleared */
20599+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20600+ JF_ISSET(node, JNODE_RELOC) ||
20601+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20602+ _check(node->jnodes.prev != NULL) &&
20603+ _check(node->jnodes.next != NULL) &&
20604+ /* [jnode-dirty] invariant */
20605+ /* dirty inode is part of atom */
20606+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20607+ /* [jnode-oid] invariant */
20608+ /* for unformatted node ->objectid and ->mapping fields are
20609+ * consistent */
20610+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20611+ node->key.j.objectid ==
20612+ get_inode_oid(node->key.j.mapping->host)) &&
20613+ /* [jnode-atom-valid] invariant */
20614+ /* node atom has valid state */
20615+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20616+ /* [jnode-page-binding] invariant */
20617+ /* if node points to page, it points back to node */
20618+ _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20619+ /* [jnode-refs] invariant */
20620+ /* only referenced jnode can be loaded */
20621+ _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20622+
20623+}
20624+
20625+static const char *jnode_type_name(jnode_type type)
20626+{
20627+ switch (type) {
20628+ case JNODE_UNFORMATTED_BLOCK:
20629+ return "unformatted";
20630+ case JNODE_FORMATTED_BLOCK:
20631+ return "formatted";
20632+ case JNODE_BITMAP:
20633+ return "bitmap";
20634+ case JNODE_IO_HEAD:
20635+ return "io head";
20636+ case JNODE_INODE:
20637+ return "inode";
20638+ case LAST_JNODE_TYPE:
20639+ return "last";
20640+ default:{
20641+ static char unknown[30];
20642+
20643+ sprintf(unknown, "unknown %i", type);
20644+ return unknown;
20645+ }
20646+ }
20647+}
20648+
20649+#define jnode_state_name( node, flag ) \
20650+ ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20651+
20652+/* debugging aid: output human readable information about @node */
20653+static void info_jnode(const char *prefix /* prefix to print */ ,
20654+ const jnode * node /* node to print */ )
20655+{
20656+ assert("umka-068", prefix != NULL);
20657+
20658+ if (node == NULL) {
20659+ printk("%s: null\n", prefix);
20660+ return;
20661+ }
20662+
20663+ printk
20664+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20665+ " block: %s, d_count: %d, x_count: %d, "
20666+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20667+ node->state,
20668+ jnode_state_name(node, JNODE_PARSED),
20669+ jnode_state_name(node, JNODE_HEARD_BANSHEE),
20670+ jnode_state_name(node, JNODE_LEFT_CONNECTED),
20671+ jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20672+ jnode_state_name(node, JNODE_ORPHAN),
20673+ jnode_state_name(node, JNODE_CREATED),
20674+ jnode_state_name(node, JNODE_RELOC),
20675+ jnode_state_name(node, JNODE_OVRWR),
20676+ jnode_state_name(node, JNODE_DIRTY),
20677+ jnode_state_name(node, JNODE_IS_DYING),
20678+ jnode_state_name(node, JNODE_RIP),
20679+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20680+ jnode_state_name(node, JNODE_WRITEBACK),
20681+ jnode_state_name(node, JNODE_NEW),
20682+ jnode_state_name(node, JNODE_DKSET),
20683+ jnode_state_name(node, JNODE_REPACK),
20684+ jnode_state_name(node, JNODE_CLUSTER_PAGE),
20685+ jnode_get_level(node), sprint_address(jnode_get_block(node)),
20686+ atomic_read(&node->d_count), atomic_read(&node->x_count),
20687+ jnode_page(node), node->atom, 0, 0,
20688+ jnode_type_name(jnode_get_type(node)));
20689+ if (jnode_is_unformatted(node)) {
20690+ printk("inode: %llu, index: %lu, ",
20691+ node->key.j.objectid, node->key.j.index);
20692+ }
20693+}
20694+
20695+/* debugging aid: check znode invariant and panic if it doesn't hold */
20696+static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20697+{
20698+ char const *failed_msg;
20699+ int result;
20700+ reiser4_tree *tree;
20701+
20702+ tree = jnode_get_tree(node);
20703+
20704+ assert("umka-063312", node != NULL);
20705+ assert("umka-064321", tree != NULL);
20706+
20707+ if (!jlocked && !tlocked)
20708+ spin_lock_jnode((jnode *) node);
20709+ if (!tlocked)
20710+ read_lock_tree(jnode_get_tree(node));
20711+ result = jnode_invariant_f(node, &failed_msg);
20712+ if (!result) {
20713+ info_jnode("corrupted node", node);
20714+ warning("jmacd-555", "Condition %s failed", failed_msg);
20715+ }
20716+ if (!tlocked)
20717+ read_unlock_tree(jnode_get_tree(node));
20718+ if (!jlocked && !tlocked)
20719+ spin_unlock_jnode((jnode *) node);
20720+ return result;
20721+}
20722+
20723+#endif /* REISER4_DEBUG */
20724+
20725+/* Make Linus happy.
20726+ Local variables:
20727+ c-indentation-style: "K&R"
20728+ mode-name: "LC"
20729+ c-basic-offset: 8
20730+ tab-width: 8
20731+ fill-column: 80
20732+ End:
20733+*/
20734diff --git a/fs/reiser4/jnode.h b/fs/reiser4/jnode.h
20735new file mode 100644
20736index 0000000..c05d88e
20737--- /dev/null
20738+++ b/fs/reiser4/jnode.h
20739@@ -0,0 +1,705 @@
20740+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20741+ * reiser4/README */
20742+
20743+/* Declaration of jnode. See jnode.c for details. */
20744+
20745+#ifndef __JNODE_H__
20746+#define __JNODE_H__
20747+
20748+#include "forward.h"
20749+#include "type_safe_hash.h"
20750+#include "txnmgr.h"
20751+#include "key.h"
20752+#include "debug.h"
20753+#include "dformat.h"
20754+#include "page_cache.h"
20755+#include "context.h"
20756+
20757+#include "plugin/plugin.h"
20758+
20759+#include <linux/fs.h>
20760+#include <linux/mm.h>
20761+#include <linux/spinlock.h>
20762+#include <asm/atomic.h>
20763+#include <asm/bitops.h>
20764+#include <linux/list.h>
20765+#include <linux/rcupdate.h>
20766+
20767+/* declare hash table of jnodes (jnodes proper, that is, unformatted
20768+ nodes) */
20769+TYPE_SAFE_HASH_DECLARE(j, jnode);
20770+
20771+/* declare hash table of znodes */
20772+TYPE_SAFE_HASH_DECLARE(z, znode);
20773+
20774+typedef struct {
20775+ __u64 objectid;
20776+ unsigned long index;
20777+ struct address_space *mapping;
20778+} jnode_key_t;
20779+
20780+/*
20781+ Jnode is the "base class" of other nodes in reiser4. It is also happens to
20782+ be exactly the node we use for unformatted tree nodes.
20783+
20784+ Jnode provides following basic functionality:
20785+
20786+ . reference counting and indexing.
20787+
20788+ . integration with page cache. Jnode has ->pg reference to which page can
20789+ be attached.
20790+
20791+ . interface to transaction manager. It is jnode that is kept in transaction
20792+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20793+ means, there should be special type of jnode for inode.)
20794+
20795+ Locking:
20796+
20797+ Spin lock: the following fields are protected by the per-jnode spin lock:
20798+
20799+ ->state
20800+ ->atom
20801+ ->capture_link
20802+
20803+ Following fields are protected by the global tree lock:
20804+
20805+ ->link
20806+ ->key.z (content of ->key.z is only changed in znode_rehash())
20807+ ->key.j
20808+
20809+ Atomic counters
20810+
20811+ ->x_count
20812+ ->d_count
20813+
20814+ ->pg, and ->data are protected by spin lock for unused jnode and are
20815+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20816+ is false).
20817+
20818+ ->tree is immutable after creation
20819+
20820+ Unclear
20821+
20822+ ->blocknr: should be under jnode spin-lock, but current interface is based
20823+ on passing of block address.
20824+
20825+ If you ever need to spin lock two nodes at once, do this in "natural"
20826+ memory order: lock znode with lower address first. (See lock_two_nodes().)
20827+
20828+ Invariants involving this data-type:
20829+
20830+ [jnode-dirty]
20831+ [jnode-refs]
20832+ [jnode-oid]
20833+ [jnode-queued]
20834+ [jnode-atom-valid]
20835+ [jnode-page-binding]
20836+*/
20837+
20838+struct jnode {
20839+#if REISER4_DEBUG
20840+#define JMAGIC 0x52654973 /* "ReIs" */
20841+ int magic;
20842+#endif
20843+ /* FIRST CACHE LINE (16 bytes): data used by jload */
20844+
20845+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20846+ /* 0 */ unsigned long state;
20847+
20848+ /* lock, protecting jnode's fields. */
20849+ /* 4 */ spinlock_t load;
20850+
20851+ /* counter of references to jnode itself. Increased on jref().
20852+ Decreased on jput().
20853+ */
20854+ /* 8 */ atomic_t x_count;
20855+
20856+ /* counter of references to jnode's data. Pin data page(s) in
20857+ memory while this is greater than 0. Increased on jload().
20858+ Decreased on jrelse().
20859+ */
20860+ /* 12 */ atomic_t d_count;
20861+
20862+ /* SECOND CACHE LINE: data used by hash table lookups */
20863+
20864+ /* 16 */ union {
20865+ /* znodes are hashed by block number */
20866+ reiser4_block_nr z;
20867+ /* unformatted nodes are hashed by mapping plus offset */
20868+ jnode_key_t j;
20869+ } key;
20870+
20871+ /* THIRD CACHE LINE */
20872+
20873+ /* 32 */ union {
20874+ /* pointers to maintain hash-table */
20875+ z_hash_link z;
20876+ j_hash_link j;
20877+ } link;
20878+
20879+ /* pointer to jnode page. */
20880+ /* 36 */ struct page *pg;
20881+ /* pointer to node itself. This is page_address(node->pg) when page is
20882+ attached to the jnode
20883+ */
20884+ /* 40 */ void *data;
20885+
20886+ /* 44 */ reiser4_tree *tree;
20887+
20888+ /* FOURTH CACHE LINE: atom related fields */
20889+
20890+ /* 48 */ spinlock_t guard;
20891+
20892+ /* atom the block is in, if any */
20893+ /* 52 */ txn_atom *atom;
20894+
20895+ /* capture list */
20896+ /* 56 */ struct list_head capture_link;
20897+
20898+ /* FIFTH CACHE LINE */
20899+
20900+ /* 64 */ struct rcu_head rcu;
20901+ /* crosses cache line */
20902+
20903+ /* SIXTH CACHE LINE */
20904+
20905+ /* the real blocknr (where io is going to/from) */
20906+ /* 80 */ reiser4_block_nr blocknr;
20907+ /* Parent item type, unformatted and CRC need it for offset => key conversion. */
20908+ /* NOTE: this parent_item_id looks like jnode type. */
20909+ /* 88 */ reiser4_plugin_id parent_item_id;
20910+ /* 92 */
20911+#if REISER4_DEBUG
20912+ /* number of pages referenced by the jnode (meaningful while capturing of
20913+ page clusters) */
20914+ int page_count;
20915+ /* list of all jnodes for debugging purposes. */
20916+ struct list_head jnodes;
20917+ /* how many times this jnode was written in one transaction */
20918+ int written;
20919+ /* this indicates which atom's list the jnode is on */
20920+ atom_list list;
20921+#endif
20922+} __attribute__ ((aligned(16)));
20923+
20924+/*
20925+ * jnode types. Enumeration of existing jnode types.
20926+ */
20927+typedef enum {
20928+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20929+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20930+ JNODE_BITMAP, /* bitmap */
20931+ JNODE_IO_HEAD, /* jnode representing a block in the
20932+ * wandering log */
20933+ JNODE_INODE, /* jnode embedded into inode */
20934+ LAST_JNODE_TYPE
20935+} jnode_type;
20936+
20937+/* jnode states */
20938+typedef enum {
20939+ /* jnode's page is loaded and data checked */
20940+ JNODE_PARSED = 0,
20941+ /* node was deleted, not all locks on it were released. This
20942+ node is empty and is going to be removed from the tree
20943+ shortly. */
20944+ JNODE_HEARD_BANSHEE = 1,
20945+ /* left sibling pointer is valid */
20946+ JNODE_LEFT_CONNECTED = 2,
20947+ /* right sibling pointer is valid */
20948+ JNODE_RIGHT_CONNECTED = 3,
20949+
20950+ /* znode was just created and doesn't yet have a pointer from
20951+ its parent */
20952+ JNODE_ORPHAN = 4,
20953+
20954+ /* this node was created by its transaction and has not been assigned
20955+ a block address. */
20956+ JNODE_CREATED = 5,
20957+
20958+ /* this node is currently relocated */
20959+ JNODE_RELOC = 6,
20960+ /* this node is currently wandered */
20961+ JNODE_OVRWR = 7,
20962+
20963+ /* this znode has been modified */
20964+ JNODE_DIRTY = 8,
20965+
20966+ /* znode lock is being invalidated */
20967+ JNODE_IS_DYING = 9,
20968+
20969+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20970+
20971+ /* jnode is queued for flushing. */
20972+ JNODE_FLUSH_QUEUED = 12,
20973+
20974+ /* In the following bits jnode type is encoded. */
20975+ JNODE_TYPE_1 = 13,
20976+ JNODE_TYPE_2 = 14,
20977+ JNODE_TYPE_3 = 15,
20978+
20979+ /* jnode is being destroyed */
20980+ JNODE_RIP = 16,
20981+
20982+ /* znode was not captured during locking (it might so be because
20983+ ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20984+ JNODE_MISSED_IN_CAPTURE = 17,
20985+
20986+ /* write is in progress */
20987+ JNODE_WRITEBACK = 18,
20988+
20989+ /* FIXME: now it is used by crypto-compress plugin only */
20990+ JNODE_NEW = 19,
20991+
20992+ /* delimiting keys are already set for this znode. */
20993+ JNODE_DKSET = 20,
20994+
20995+ /* when this bit is set page and jnode can not be disconnected */
20996+ JNODE_WRITE_PREPARED = 21,
20997+
20998+ JNODE_CLUSTER_PAGE = 22,
20999+ /* Jnode is marked for repacking, that means the reiser4 flush and the
21000+ * block allocator should process this node special way */
21001+ JNODE_REPACK = 23,
21002+ /* node should be converted by flush in squalloc phase */
21003+ JNODE_CONVERTIBLE = 24,
21004+ /*
21005+ * When jnode is dirtied for the first time in given transaction,
21006+ * do_jnode_make_dirty() checks whether this jnode can possible became
21007+ * member of overwrite set. If so, this bit is set, and one block is
21008+ * reserved in the ->flush_reserved space of atom.
21009+ *
21010+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
21011+ *
21012+ * (1) flush decides that we want this block to go into relocate
21013+ * set after all.
21014+ *
21015+ * (2) wandering log is allocated (by log writer)
21016+ *
21017+ * (3) extent is allocated
21018+ *
21019+ */
21020+ JNODE_FLUSH_RESERVED = 29
21021+} reiser4_jnode_state;
21022+
21023+/* Macros for accessing the jnode state. */
21024+
21025+static inline void JF_CLR(jnode * j, int f)
21026+{
21027+ assert("unknown-1", j->magic == JMAGIC);
21028+ clear_bit(f, &j->state);
21029+}
21030+static inline int JF_ISSET(const jnode * j, int f)
21031+{
21032+ assert("unknown-2", j->magic == JMAGIC);
21033+ return test_bit(f, &((jnode *) j)->state);
21034+}
21035+static inline void JF_SET(jnode * j, int f)
21036+{
21037+ assert("unknown-3", j->magic == JMAGIC);
21038+ set_bit(f, &j->state);
21039+}
21040+
21041+static inline int JF_TEST_AND_SET(jnode * j, int f)
21042+{
21043+ assert("unknown-4", j->magic == JMAGIC);
21044+ return test_and_set_bit(f, &j->state);
21045+}
21046+
21047+static inline void spin_lock_jnode(jnode *node)
21048+{
21049+ /* check that spinlocks of lower priorities are not held */
21050+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
21051+ LOCK_CNT_NIL(spin_locked_txnh) &&
21052+ LOCK_CNT_NIL(spin_locked_zlock) &&
21053+ LOCK_CNT_NIL(rw_locked_dk) &&
21054+ LOCK_CNT_LT(spin_locked_jnode, 2)));
21055+
21056+ spin_lock(&(node->guard));
21057+
21058+ LOCK_CNT_INC(spin_locked_jnode);
21059+ LOCK_CNT_INC(spin_locked);
21060+}
21061+
21062+static inline void spin_unlock_jnode(jnode *node)
21063+{
21064+ assert_spin_locked(&(node->guard));
21065+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
21066+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
21067+
21068+ LOCK_CNT_DEC(spin_locked_jnode);
21069+ LOCK_CNT_DEC(spin_locked);
21070+
21071+ spin_unlock(&(node->guard));
21072+}
21073+
21074+static inline int jnode_is_in_deleteset(const jnode * node)
21075+{
21076+ return JF_ISSET(node, JNODE_RELOC);
21077+}
21078+
21079+extern int init_jnodes(void);
21080+extern void done_jnodes(void);
21081+
21082+/* Jnode routines */
21083+extern jnode *jalloc(void);
21084+extern void jfree(jnode * node) NONNULL;
21085+extern jnode *jclone(jnode *);
21086+extern jnode *jlookup(reiser4_tree * tree,
21087+ oid_t objectid, unsigned long ind) NONNULL;
21088+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
21089+extern jnode *jnode_by_page(struct page *pg) NONNULL;
21090+extern jnode *jnode_of_page(struct page *pg) NONNULL;
21091+void jnode_attach_page(jnode * node, struct page *pg);
21092+
21093+void unhash_unformatted_jnode(jnode *);
21094+extern jnode *page_next_jnode(jnode * node) NONNULL;
21095+extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
21096+extern void jnode_make_dirty(jnode * node) NONNULL;
21097+extern void jnode_make_clean(jnode * node) NONNULL;
21098+extern void jnode_make_wander_nolock(jnode * node) NONNULL;
21099+extern void jnode_make_wander(jnode *) NONNULL;
21100+extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
21101+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
21102+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
21103+
21104+/**
21105+ * jnode_get_block
21106+ * @node: jnode to query
21107+ *
21108+ */
21109+static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
21110+{
21111+ assert("nikita-528", node != NULL);
21112+
21113+ return &node->blocknr;
21114+}
21115+
21116+/**
21117+ * jnode_set_block
21118+ * @node: jnode to update
21119+ * @blocknr: new block nr
21120+ */
21121+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
21122+{
21123+ assert("nikita-2020", node != NULL);
21124+ assert("umka-055", blocknr != NULL);
21125+ node->blocknr = *blocknr;
21126+}
21127+
21128+
21129+/* block number for IO. Usually this is the same as jnode_get_block(), unless
21130+ * jnode was emergency flushed---then block number chosen by eflush is
21131+ * used. */
21132+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
21133+{
21134+ assert("nikita-2768", node != NULL);
21135+ assert_spin_locked(&(node->guard));
21136+
21137+ return jnode_get_block(node);
21138+}
21139+
21140+/* Jnode flush interface. */
21141+extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
21142+extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
21143+
21144+/* FIXME-VS: these are used in plugin/item/extent.c */
21145+
21146+/* does extent_get_block have to be called */
21147+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
21148+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
21149+
21150+/* the node should be converted during flush squalloc phase */
21151+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
21152+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
21153+
21154+/* Macros to convert from jnode to znode, znode to jnode. These are macros
21155+ because C doesn't allow overloading of const prototypes. */
21156+#define ZJNODE(x) (& (x) -> zjnode)
21157+#define JZNODE(x) \
21158+({ \
21159+ typeof (x) __tmp_x; \
21160+ \
21161+ __tmp_x = (x); \
21162+ assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
21163+ (znode*) __tmp_x; \
21164+})
21165+
21166+extern int jnodes_tree_init(reiser4_tree * tree);
21167+extern int jnodes_tree_done(reiser4_tree * tree);
21168+
21169+#if REISER4_DEBUG
21170+
21171+extern int znode_is_any_locked(const znode * node);
21172+extern void jnode_list_remove(jnode * node);
21173+
21174+#else
21175+
21176+#define jnode_list_remove(node) noop
21177+
21178+#endif
21179+
21180+int znode_is_root(const znode * node) NONNULL;
21181+
21182+/* bump reference counter on @node */
21183+static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
21184+{
21185+ assert("nikita-1911", node != NULL);
21186+
21187+ atomic_inc(&node->x_count);
21188+ LOCK_CNT_INC(x_refs);
21189+}
21190+
21191+static inline void dec_x_ref(jnode * node)
21192+{
21193+ assert("nikita-3215", node != NULL);
21194+ assert("nikita-3216", atomic_read(&node->x_count) > 0);
21195+
21196+ atomic_dec(&node->x_count);
21197+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
21198+ LOCK_CNT_DEC(x_refs);
21199+}
21200+
21201+/* jref() - increase counter of references to jnode/znode (x_count) */
21202+static inline jnode *jref(jnode * node)
21203+{
21204+ assert("jmacd-508", (node != NULL) && !IS_ERR(node));
21205+ add_x_ref(node);
21206+ return node;
21207+}
21208+
21209+/* get the page of jnode */
21210+static inline struct page *jnode_page(const jnode * node)
21211+{
21212+ return node->pg;
21213+}
21214+
21215+/* return pointer to jnode data */
21216+static inline char *jdata(const jnode * node)
21217+{
21218+ assert("nikita-1415", node != NULL);
21219+ assert("nikita-3198", jnode_page(node) != NULL);
21220+ return node->data;
21221+}
21222+
21223+static inline int jnode_is_loaded(const jnode * node)
21224+{
21225+ assert("zam-506", node != NULL);
21226+ return atomic_read(&node->d_count) > 0;
21227+}
21228+
21229+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
21230+
21231+static inline void jnode_set_reloc(jnode * node)
21232+{
21233+ assert("nikita-2431", node != NULL);
21234+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
21235+ JF_SET(node, JNODE_RELOC);
21236+}
21237+
21238+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
21239+
21240+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
21241+
21242+static inline int jload(jnode *node)
21243+{
21244+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
21245+}
21246+
21247+extern int jinit_new(jnode *, gfp_t) NONNULL;
21248+extern int jstartio(jnode *) NONNULL;
21249+
21250+extern void jdrop(jnode *) NONNULL;
21251+extern int jwait_io(jnode *, int rw) NONNULL;
21252+
21253+void jload_prefetch(jnode *);
21254+
21255+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
21256+extern void reiser4_drop_io_head(jnode * node) NONNULL;
21257+
21258+static inline reiser4_tree *jnode_get_tree(const jnode * node)
21259+{
21260+ assert("nikita-2691", node != NULL);
21261+ return node->tree;
21262+}
21263+
21264+extern void pin_jnode_data(jnode *);
21265+extern void unpin_jnode_data(jnode *);
21266+
21267+static inline jnode_type jnode_get_type(const jnode * node)
21268+{
21269+ static const unsigned long state_mask =
21270+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
21271+
21272+ static jnode_type mask_to_type[] = {
21273+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
21274+
21275+ /* 000 */
21276+ [0] = JNODE_FORMATTED_BLOCK,
21277+ /* 001 */
21278+ [1] = JNODE_UNFORMATTED_BLOCK,
21279+ /* 010 */
21280+ [2] = JNODE_BITMAP,
21281+ /* 011 */
21282+ [3] = LAST_JNODE_TYPE, /*invalid */
21283+ /* 100 */
21284+ [4] = JNODE_INODE,
21285+ /* 101 */
21286+ [5] = LAST_JNODE_TYPE,
21287+ /* 110 */
21288+ [6] = JNODE_IO_HEAD,
21289+ /* 111 */
21290+ [7] = LAST_JNODE_TYPE, /* invalid */
21291+ };
21292+
21293+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
21294+}
21295+
21296+/* returns true if node is a znode */
21297+static inline int jnode_is_znode(const jnode * node)
21298+{
21299+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
21300+}
21301+
21302+static inline int jnode_is_flushprepped(jnode * node)
21303+{
21304+ assert("jmacd-78212", node != NULL);
21305+ assert_spin_locked(&(node->guard));
21306+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
21307+ JF_ISSET(node, JNODE_OVRWR);
21308+}
21309+
21310+/* Return true if @node has already been processed by the squeeze and allocate
21311+ process. This implies the block address has been finalized for the
21312+ duration of this atom (or it is clean and will remain in place). If this
21313+ returns true you may use the block number as a hint. */
21314+static inline int jnode_check_flushprepped(jnode * node)
21315+{
21316+ int result;
21317+
21318+ /* It must be clean or relocated or wandered. New allocations are set to relocate. */
21319+ spin_lock_jnode(node);
21320+ result = jnode_is_flushprepped(node);
21321+ spin_unlock_jnode(node);
21322+ return result;
21323+}
21324+
21325+/* returns true if node is unformatted */
21326+static inline int jnode_is_unformatted(const jnode * node)
21327+{
21328+ assert("jmacd-0123", node != NULL);
21329+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
21330+}
21331+
21332+/* returns true if node represents a cluster cache page */
21333+static inline int jnode_is_cluster_page(const jnode * node)
21334+{
21335+ assert("edward-50", node != NULL);
21336+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
21337+}
21338+
21339+/* returns true is node is builtin inode's jnode */
21340+static inline int jnode_is_inode(const jnode * node)
21341+{
21342+ assert("vs-1240", node != NULL);
21343+ return jnode_get_type(node) == JNODE_INODE;
21344+}
21345+
21346+static inline jnode_plugin *jnode_ops_of(const jnode_type type)
21347+{
21348+ assert("nikita-2367", type < LAST_JNODE_TYPE);
21349+ return jnode_plugin_by_id((reiser4_plugin_id) type);
21350+}
21351+
21352+static inline jnode_plugin *jnode_ops(const jnode * node)
21353+{
21354+ assert("nikita-2366", node != NULL);
21355+
21356+ return jnode_ops_of(jnode_get_type(node));
21357+}
21358+
21359+/* Get the index of a block. */
21360+static inline unsigned long jnode_get_index(jnode * node)
21361+{
21362+ return jnode_ops(node)->index(node);
21363+}
21364+
21365+/* return true if "node" is the root */
21366+static inline int jnode_is_root(const jnode * node)
21367+{
21368+ return jnode_is_znode(node) && znode_is_root(JZNODE(node));
21369+}
21370+
21371+extern struct address_space *mapping_jnode(const jnode * node);
21372+extern unsigned long index_jnode(const jnode * node);
21373+
21374+static inline void jput(jnode * node);
21375+extern void jput_final(jnode * node);
21376+
21377+/* bump data counter on @node */
21378+static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
21379+{
21380+ assert("nikita-1962", node != NULL);
21381+
21382+ atomic_inc(&node->d_count);
21383+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
21384+ LOCK_CNT_INC(d_refs);
21385+}
21386+
21387+/* jput() - decrement x_count reference counter on znode.
21388+
21389+ Count may drop to 0, jnode stays in cache until memory pressure causes the
21390+ eviction of its page. The c_count variable also ensures that children are
21391+ pressured out of memory before the parent. The jnode remains hashed as
21392+ long as the VM allows its page to stay in memory.
21393+*/
21394+static inline void jput(jnode * node)
21395+{
21396+ assert("jmacd-509", node != NULL);
21397+ assert("jmacd-510", atomic_read(&node->x_count) > 0);
21398+ assert("zam-926", reiser4_schedulable());
21399+ LOCK_CNT_DEC(x_refs);
21400+
21401+ rcu_read_lock();
21402+ /*
21403+ * we don't need any kind of lock here--jput_final() uses RCU.
21404+ */
21405+ if (unlikely(atomic_dec_and_test(&node->x_count))) {
21406+ jput_final(node);
21407+ } else
21408+ rcu_read_unlock();
21409+ assert("nikita-3473", reiser4_schedulable());
21410+}
21411+
21412+extern void jrelse(jnode * node);
21413+extern void jrelse_tail(jnode * node);
21414+
21415+extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
21416+
21417+/* resolve race with jput */
21418+static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
21419+{
21420+ if (unlikely(JF_ISSET(node, JNODE_RIP)))
21421+ node = jnode_rip_sync(tree, node);
21422+ return node;
21423+}
21424+
21425+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
21426+
21427+#if REISER4_DEBUG
21428+extern int jnode_invariant_f(const jnode *node, char const **msg);
21429+#endif
21430+
21431+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21432+
21433+/* __JNODE_H__ */
21434+#endif
21435+
21436+/* Make Linus happy.
21437+ Local variables:
21438+ c-indentation-style: "K&R"
21439+ mode-name: "LC"
21440+ c-basic-offset: 8
21441+ tab-width: 8
21442+ fill-column: 120
21443+ End:
21444+*/
21445diff --git a/fs/reiser4/kassign.c b/fs/reiser4/kassign.c
21446new file mode 100644
21447index 0000000..3c8f9f5
21448--- /dev/null
21449+++ b/fs/reiser4/kassign.c
21450@@ -0,0 +1,661 @@
21451+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21452+ * reiser4/README */
21453+
21454+/* Key assignment policy implementation */
21455+
21456+/*
21457+ * In reiser4 every piece of file system data and meta-data has a key. Keys
21458+ * are used to store information in and retrieve it from reiser4 internal
21459+ * tree. In addition to this, keys define _ordering_ of all file system
21460+ * information: things having close keys are placed into the same or
21461+ * neighboring (in the tree order) nodes of the tree. As our block allocator
21462+ * tries to respect tree order (see flush.c), keys also define order in which
21463+ * things are laid out on the disk, and hence, affect performance directly.
21464+ *
21465+ * Obviously, assignment of keys to data and meta-data should be consistent
21466+ * across whole file system. Algorithm that calculates a key for a given piece
21467+ * of data or meta-data is referred to as "key assignment".
21468+ *
21469+ * Key assignment is too expensive to be implemented as a plugin (that is,
21470+ * with an ability to support different key assignment schemas in the same
21471+ * compiled kernel image). As a compromise, all key-assignment functions and
21472+ * data-structures are collected in this single file, so that modifications to
21473+ * key assignment algorithm can be localized. Additional changes may be
21474+ * required in key.[ch].
21475+ *
21476+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21477+ * may guess, there is "Plan B" too.
21478+ *
21479+ */
21480+
21481+/*
21482+ * Additional complication with key assignment implementation is a requirement
21483+ * to support different key length.
21484+ */
21485+
21486+/*
21487+ * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21488+ *
21489+ * DIRECTORY ITEMS
21490+ *
21491+ * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21492+ * +--------------+---+---+-+-------------+------------------+-----------------+
21493+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21494+ * +--------------+---+---+-+-------------+------------------+-----------------+
21495+ * | | | | |
21496+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21497+ *
21498+ * dirid objectid of directory this item is for
21499+ *
21500+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21501+ *
21502+ * H 1 if last 8 bytes of the key contain hash,
21503+ * 0 if last 8 bytes of the key contain prefix-3
21504+ *
21505+ * prefix-1 first 7 characters of file name.
21506+ * Padded by zeroes if name is not long enough.
21507+ *
21508+ * prefix-2 next 8 characters of the file name.
21509+ *
21510+ * prefix-3 next 8 characters of the file name.
21511+ *
21512+ * hash hash of the rest of file name (i.e., portion of file
21513+ * name not included into prefix-1 and prefix-2).
21514+ *
21515+ * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
21516+ * in the key. Such file names are called "short". They are distinguished by the
21517+ * H bit being set to 0 in the key.
21518+ *
21519+ * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21520+ * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21521+ * key. Last 8 bytes of the key are occupied by hash of the remaining
21522+ * characters of the name.
21523+ *
21524+ * This key assignment reaches following important goals:
21525+ *
21526+ * (1) directory entries are sorted in approximately lexicographical
21527+ * order.
21528+ *
21529+ * (2) collisions (when multiple directory items have the same key), while
21530+ * principally unavoidable in a tree with fixed length keys, are rare.
21531+ *
21532+ * STAT DATA
21533+ *
21534+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21535+ * +--------------+---+-----------------+---+--------------+-----------------+
21536+ * | locality id | 1 | ordering | 0 | objectid | 0 |
21537+ * +--------------+---+-----------------+---+--------------+-----------------+
21538+ * | | | | |
21539+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21540+ *
21541+ * locality id object id of a directory where first name was created for
21542+ * the object
21543+ *
21544+ * ordering copy of second 8-byte portion of the key of directory
21545+ * entry for the first name of this object. Ordering has a form
21546+ * {
21547+ * fibration :7;
21548+ * h :1;
21549+ * prefix1 :56;
21550+ * }
21551+ * see description of key for directory entry above.
21552+ *
21553+ * objectid object id for this object
21554+ *
21555+ * This key assignment policy is designed to keep stat-data in the same order
21556+ * as corresponding directory items, thus speeding up readdir/stat types of
21557+ * workload.
21558+ *
21559+ * FILE BODY
21560+ *
21561+ * | 60 | 4 | 64 | 4 | 60 | 64 |
21562+ * +--------------+---+-----------------+---+--------------+-----------------+
21563+ * | locality id | 4 | ordering | 0 | objectid | offset |
21564+ * +--------------+---+-----------------+---+--------------+-----------------+
21565+ * | | | | |
21566+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21567+ *
21568+ * locality id object id of a directory where first name was created for
21569+ * the object
21570+ *
21571+ * ordering the same as in the key of stat-data for this object
21572+ *
21573+ * objectid object id for this object
21574+ *
21575+ * offset logical offset from the beginning of this file.
21576+ * Measured in bytes.
21577+ *
21578+ *
21579+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21580+ *
21581+ * DIRECTORY ITEMS
21582+ *
21583+ * | 60 | 4 | 7 |1| 56 | 64 |
21584+ * +--------------+---+---+-+-------------+-----------------+
21585+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21586+ * +--------------+---+---+-+-------------+-----------------+
21587+ * | | | |
21588+ * | 8 bytes | 8 bytes | 8 bytes |
21589+ *
21590+ * dirid objectid of directory this item is for
21591+ *
21592+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
21593+ *
21594+ * H 1 if last 8 bytes of the key contain hash,
21595+ * 0 if last 8 bytes of the key contain prefix-2
21596+ *
21597+ * prefix-1 first 7 characters of file name.
21598+ * Padded by zeroes if name is not long enough.
21599+ *
21600+ * prefix-2 next 8 characters of the file name.
21601+ *
21602+ * hash hash of the rest of file name (i.e., portion of file
21603+ * name not included into prefix-1).
21604+ *
21605+ * File names shorter than 15 (== 7 + 8) characters are completely encoded in
21606+ * the key. Such file names are called "short". They are distinguished by the H
21607+ * bit being 0 in the key.
21608+ *
21609+ * Other file names are "long". For a long name, H bit is 1, and first 7
21610+ * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21611+ * key are occupied by hash of the remaining characters of the name.
21612+ *
21613+ * STAT DATA
21614+ *
21615+ * | 60 | 4 | 4 | 60 | 64 |
21616+ * +--------------+---+---+--------------+-----------------+
21617+ * | locality id | 1 | 0 | objectid | 0 |
21618+ * +--------------+---+---+--------------+-----------------+
21619+ * | | | |
21620+ * | 8 bytes | 8 bytes | 8 bytes |
21621+ *
21622+ * locality id object id of a directory where first name was created for
21623+ * the object
21624+ *
21625+ * objectid object id for this object
21626+ *
21627+ * FILE BODY
21628+ *
21629+ * | 60 | 4 | 4 | 60 | 64 |
21630+ * +--------------+---+---+--------------+-----------------+
21631+ * | locality id | 4 | 0 | objectid | offset |
21632+ * +--------------+---+---+--------------+-----------------+
21633+ * | | | |
21634+ * | 8 bytes | 8 bytes | 8 bytes |
21635+ *
21636+ * locality id object id of a directory where first name was created for
21637+ * the object
21638+ *
21639+ * objectid object id for this object
21640+ *
21641+ * offset logical offset from the beginning of this file.
21642+ * Measured in bytes.
21643+ *
21644+ *
21645+ */
21646+
21647+#include "debug.h"
21648+#include "key.h"
21649+#include "kassign.h"
21650+#include "vfs_ops.h"
21651+#include "inode.h"
21652+#include "super.h"
21653+#include "dscale.h"
21654+
21655+#include <linux/types.h> /* for __u?? */
21656+#include <linux/fs.h> /* for struct super_block, etc */
21657+
21658+/* bitmask for H bit (see comment at the beginning of this file */
21659+static const __u64 longname_mark = 0x0100000000000000ull;
21660+/* bitmask for F and H portions of the key. */
21661+static const __u64 fibration_mask = 0xff00000000000000ull;
21662+
21663+/* return true if name is not completely encoded in @key */
21664+int is_longname_key(const reiser4_key * key)
21665+{
21666+ __u64 highpart;
21667+
21668+ assert("nikita-2863", key != NULL);
21669+ if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21670+ reiser4_print_key("oops", key);
21671+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21672+
21673+ if (REISER4_LARGE_KEY)
21674+ highpart = get_key_ordering(key);
21675+ else
21676+ highpart = get_key_objectid(key);
21677+
21678+ return (highpart & longname_mark) ? 1 : 0;
21679+}
21680+
21681+/* return true if @name is too long to be completely encoded in the key */
21682+int is_longname(const char *name UNUSED_ARG, int len)
21683+{
21684+ if (REISER4_LARGE_KEY)
21685+ return len > 23;
21686+ else
21687+ return len > 15;
21688+}
21689+
21690+/* code ascii string into __u64.
21691+
21692+ Put characters of @name into result (@str) one after another starting
21693+ from @start_idx-th highest (arithmetically) byte. This produces
21694+ endian-safe encoding. memcpy(2) will not do.
21695+
21696+*/
21697+static __u64 pack_string(const char *name /* string to encode */ ,
21698+ int start_idx /* highest byte in result from
21699+ * which to start encoding */ )
21700+{
21701+ unsigned i;
21702+ __u64 str;
21703+
21704+ str = 0;
21705+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21706+ str <<= 8;
21707+ str |= (unsigned char)name[i];
21708+ }
21709+ str <<= (sizeof str - i - start_idx) << 3;
21710+ return str;
21711+}
21712+
21713+/* opposite to pack_string(). Takes value produced by pack_string(), restores
21714+ * string encoded in it and stores result in @buf */
21715+char * reiser4_unpack_string(__u64 value, char *buf)
21716+{
21717+ do {
21718+ *buf = value >> (64 - 8);
21719+ if (*buf)
21720+ ++buf;
21721+ value <<= 8;
21722+ } while (value != 0);
21723+ *buf = 0;
21724+ return buf;
21725+}
21726+
21727+/* obtain name encoded in @key and store it in @buf */
21728+char *extract_name_from_key(const reiser4_key * key, char *buf)
21729+{
21730+ char *c;
21731+
21732+ assert("nikita-2868", !is_longname_key(key));
21733+
21734+ c = buf;
21735+ if (REISER4_LARGE_KEY) {
21736+ c = reiser4_unpack_string(get_key_ordering(key) &
21737+ ~fibration_mask, c);
21738+ c = reiser4_unpack_string(get_key_fulloid(key), c);
21739+ } else
21740+ c = reiser4_unpack_string(get_key_fulloid(key) &
21741+ ~fibration_mask, c);
21742+ reiser4_unpack_string(get_key_offset(key), c);
21743+ return buf;
21744+}
21745+
21746+/**
21747+ * complete_entry_key - calculate entry key by name
21748+ * @dir: directory where entry is (or will be) in
21749+ * @name: name to calculate key of
21750+ * @len: length of name
21751+ * @result: place to store result in
21752+ *
21753+ * Sets fields of entry key @result which depend on file name.
21754+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21755+ * objectid and offset. Otherwise, objectid and offset are set.
21756+ */
21757+void complete_entry_key(const struct inode *dir, const char *name,
21758+ int len, reiser4_key *result)
21759+{
21760+#if REISER4_LARGE_KEY
21761+ __u64 ordering;
21762+ __u64 objectid;
21763+ __u64 offset;
21764+
21765+ assert("nikita-1139", dir != NULL);
21766+ assert("nikita-1142", result != NULL);
21767+ assert("nikita-2867", strlen(name) == len);
21768+
21769+ /*
21770+ * key allocation algorithm for directory entries in case of large
21771+ * keys:
21772+ *
21773+ * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21774+ * characters into ordering field of key, next 8 characters (if any)
21775+ * into objectid field of key and next 8 ones (if any) into offset
21776+ * field of key
21777+ *
21778+ * If file name is longer than 23 characters, put first 7 characters
21779+ * into key's ordering, next 8 to objectid and hash of remaining
21780+ * characters into offset field.
21781+ *
21782+ * To distinguish the above cases, the unused high bit in the
21783+ * ordering field is set in the latter case.
21784+ */
21785+
21786+ /* [0-6] characters to ordering */
21787+ ordering = pack_string(name, 1);
21788+ if (len > 7) {
21789+ /* [7-14] characters to objectid */
21790+ objectid = pack_string(name + 7, 0);
21791+ if (len > 15) {
21792+ if (len <= 23) {
21793+ /* [15-23] characters to offset */
21794+ offset = pack_string(name + 15, 0);
21795+ } else {
21796+ /* note in a key the fact that offset contains hash. */
21797+ ordering |= longname_mark;
21798+
21799+ /* offset is the hash of the file name's tail. */
21800+ offset = inode_hash_plugin(dir)->hash(name + 15,
21801+ len - 15);
21802+ }
21803+ } else {
21804+ offset = 0ull;
21805+ }
21806+ } else {
21807+ objectid = 0ull;
21808+ offset = 0ull;
21809+ }
21810+
21811+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21812+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21813+
21814+ set_key_ordering(result, ordering);
21815+ set_key_fulloid(result, objectid);
21816+ set_key_offset(result, offset);
21817+ return;
21818+
21819+#else
21820+ __u64 objectid;
21821+ __u64 offset;
21822+
21823+ assert("nikita-1139", dir != NULL);
21824+ assert("nikita-1142", result != NULL);
21825+ assert("nikita-2867", strlen(name) == len);
21826+
21827+ /*
21828+ * key allocation algorithm for directory entries in case of not large
21829+ * keys:
21830+ *
21831+ * If name is not longer than 7 + 8 = 15 characters, put first 7
21832+ * characters into objectid field of key, next 8 characters (if any)
21833+ * into offset field of key
21834+ *
21835+ * If file name is longer than 15 characters, put first 7 characters
21836+ * into key's objectid, and hash of remaining characters into offset
21837+ * field.
21838+ *
21839+ * To distinguish the above cases, the unused high bit in the
21840+ * objectid field is set in the latter case.
21841+ */
21842+
21843+ /* [0-6] characters to objectid */
21844+ objectid = pack_string(name, 1);
21845+ if (len > 7) {
21846+ if (len <= 15) {
21847+ /* [7-14] characters to offset */
21848+ offset = pack_string(name + 7, 0);
21849+ } else {
21850+ /* note in a key the fact that offset contains hash. */
21851+ objectid |= longname_mark;
21852+
21853+ /* offset is the hash of the file name. */
21854+ offset = inode_hash_plugin(dir)->hash(name + 7,
21855+ len - 7);
21856+ }
21857+ } else
21858+ offset = 0ull;
21859+
21860+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21861+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21862+
21863+ set_key_fulloid(result, objectid);
21864+ set_key_offset(result, offset);
21865+ return;
21866+#endif /* ! REISER4_LARGE_KEY */
21867+}
21868+
21869+/* true, if @key is the key of "." */
21870+int is_dot_key(const reiser4_key * key /* key to check */ )
21871+{
21872+ assert("nikita-1717", key != NULL);
21873+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21874+ return
21875+ (get_key_ordering(key) == 0ull) &&
21876+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21877+}
21878+
21879+/* build key for stat-data.
21880+
21881+ return key of stat-data of this object. This should become an sd plugin
21882+ method in the future. For now, let it be here.
21883+
21884+*/
21885+reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21886+ reiser4_key * result /* resulting key of @target
21887+ stat-data */ )
21888+{
21889+ assert("nikita-261", result != NULL);
21890+
21891+ reiser4_key_init(result);
21892+ set_key_locality(result, reiser4_inode_data(target)->locality_id);
21893+ set_key_ordering(result, get_inode_ordering(target));
21894+ set_key_objectid(result, get_inode_oid(target));
21895+ set_key_type(result, KEY_SD_MINOR);
21896+ set_key_offset(result, (__u64) 0);
21897+ return result;
21898+}
21899+
21900+/* encode part of key into &obj_key_id
21901+
21902+ This encodes into @id part of @key sufficient to restore @key later,
21903+ given that latter is key of object (key of stat-data).
21904+
21905+ See &obj_key_id
21906+*/
21907+int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21908+ obj_key_id * id /* id where key is encoded in */ )
21909+{
21910+ assert("nikita-1151", key != NULL);
21911+ assert("nikita-1152", id != NULL);
21912+
21913+ memcpy(id, key, sizeof *id);
21914+ return 0;
21915+}
21916+
21917+/* encode reference to @obj in @id.
21918+
21919+ This is like build_obj_key_id() above, but takes inode as parameter. */
21920+int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21921+ obj_key_id * id /* result */ )
21922+{
21923+ reiser4_key sdkey;
21924+
21925+ assert("nikita-1166", obj != NULL);
21926+ assert("nikita-1167", id != NULL);
21927+
21928+ build_sd_key(obj, &sdkey);
21929+ build_obj_key_id(&sdkey, id);
21930+ return 0;
21931+}
21932+
21933+/* decode @id back into @key
21934+
21935+ Restore key of object stat-data from @id. This is dual to
21936+ build_obj_key_id() above.
21937+*/
21938+int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21939+ * from */ ,
21940+ reiser4_key * key /* result */ )
21941+{
21942+ assert("nikita-1153", id != NULL);
21943+ assert("nikita-1154", key != NULL);
21944+
21945+ reiser4_key_init(key);
21946+ memcpy(key, id, sizeof *id);
21947+ return 0;
21948+}
21949+
21950+/* extract objectid of directory from key of directory entry within said
21951+ directory.
21952+ */
21953+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21954+ * directory
21955+ * entry */ )
21956+{
21957+ assert("nikita-1314", de_key != NULL);
21958+ return get_key_locality(de_key);
21959+}
21960+
21961+/* encode into @id key of directory entry.
21962+
21963+ Encode into @id information sufficient to later distinguish directory
21964+ entries within the same directory. This is not whole key, because all
21965+ directory entries within directory item share locality which is equal
21966+ to objectid of their directory.
21967+
21968+*/
21969+int build_de_id(const struct inode *dir /* inode of directory */ ,
21970+ const struct qstr *name /* name to be given to @obj by
21971+ * directory entry being
21972+ * constructed */ ,
21973+ de_id * id /* short key of directory entry */ )
21974+{
21975+ reiser4_key key;
21976+
21977+ assert("nikita-1290", dir != NULL);
21978+ assert("nikita-1292", id != NULL);
21979+
21980+ /* NOTE-NIKITA this is suboptimal. */
21981+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21982+ return build_de_id_by_key(&key, id);
21983+}
21984+
21985+/* encode into @id key of directory entry.
21986+
21987+ Encode into @id information sufficient to later distinguish directory
21988+ entries within the same directory. This is not whole key, because all
21989+ directory entries within directory item share locality which is equal
21990+ to objectid of their directory.
21991+
21992+*/
21993+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21994+ * entry */ ,
21995+ de_id * id /* short key of directory entry */ )
21996+{
21997+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21998+ return 0;
21999+}
22000+
22001+/* restore from @id key of directory entry.
22002+
22003+ Function dual to build_de_id(): given @id and locality, build full
22004+ key of directory entry within directory item.
22005+
22006+*/
22007+int extract_key_from_de_id(const oid_t locality /* locality of directory
22008+ * entry */ ,
22009+ const de_id * id /* directory entry id */ ,
22010+ reiser4_key * key /* result */ )
22011+{
22012+ /* no need to initialise key here: all fields are overwritten */
22013+ memcpy(((__u64 *) key) + 1, id, sizeof *id);
22014+ set_key_locality(key, locality);
22015+ set_key_type(key, KEY_FILE_NAME_MINOR);
22016+ return 0;
22017+}
22018+
22019+/* compare two &de_id's */
22020+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
22021+ const de_id * id2 /* second &de_id to compare */ )
22022+{
22023+ /* NOTE-NIKITA ugly implementation */
22024+ reiser4_key k1;
22025+ reiser4_key k2;
22026+
22027+ extract_key_from_de_id((oid_t) 0, id1, &k1);
22028+ extract_key_from_de_id((oid_t) 0, id2, &k2);
22029+ return keycmp(&k1, &k2);
22030+}
22031+
22032+/* compare &de_id with key */
22033+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
22034+ const reiser4_key * key /* key to compare */ )
22035+{
22036+ cmp_t result;
22037+ reiser4_key *k1;
22038+
22039+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
22040+ result = KEY_DIFF_EL(k1, key, 1);
22041+ if (result == EQUAL_TO) {
22042+ result = KEY_DIFF_EL(k1, key, 2);
22043+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22044+ result = KEY_DIFF_EL(k1, key, 3);
22045+ }
22046+ }
22047+ return result;
22048+}
22049+
22050+/*
22051+ * return number of bytes necessary to encode @inode identity.
22052+ */
22053+int inode_onwire_size(const struct inode *inode)
22054+{
22055+ int result;
22056+
22057+ result = dscale_bytes(get_inode_oid(inode));
22058+ result += dscale_bytes(get_inode_locality(inode));
22059+
22060+ /*
22061+ * ordering is large (it usually has highest bits set), so it makes
22062+ * little sense to dscale it.
22063+ */
22064+ if (REISER4_LARGE_KEY)
22065+ result += sizeof(get_inode_ordering(inode));
22066+ return result;
22067+}
22068+
22069+/*
22070+ * encode @inode identity at @start
22071+ */
22072+char *build_inode_onwire(const struct inode *inode, char *start)
22073+{
22074+ start += dscale_write(start, get_inode_locality(inode));
22075+ start += dscale_write(start, get_inode_oid(inode));
22076+
22077+ if (REISER4_LARGE_KEY) {
22078+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
22079+ start += sizeof(get_inode_ordering(inode));
22080+ }
22081+ return start;
22082+}
22083+
22084+/*
22085+ * extract key that was previously encoded by build_inode_onwire() at @addr
22086+ */
22087+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
22088+{
22089+ __u64 val;
22090+
22091+ addr += dscale_read(addr, &val);
22092+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
22093+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
22094+ addr += dscale_read(addr, &val);
22095+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
22096+#if REISER4_LARGE_KEY
22097+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
22098+ addr += sizeof key_id->ordering;
22099+#endif
22100+ return addr;
22101+}
22102+
22103+/* Make Linus happy.
22104+ Local variables:
22105+ c-indentation-style: "K&R"
22106+ mode-name: "LC"
22107+ c-basic-offset: 8
22108+ tab-width: 8
22109+ fill-column: 120
22110+ End:
22111+*/
22112diff --git a/fs/reiser4/kassign.h b/fs/reiser4/kassign.h
22113new file mode 100644
22114index 0000000..ee818d5
22115--- /dev/null
22116+++ b/fs/reiser4/kassign.h
22117@@ -0,0 +1,110 @@
22118+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
22119+ * reiser4/README */
22120+
22121+/* Key assignment policy interface. See kassign.c for details. */
22122+
22123+#if !defined( __KASSIGN_H__ )
22124+#define __KASSIGN_H__
22125+
22126+#include "forward.h"
22127+#include "key.h"
22128+#include "dformat.h"
22129+
22130+#include <linux/types.h> /* for __u?? */
22131+#include <linux/fs.h> /* for struct super_block, etc */
22132+#include <linux/dcache.h> /* for struct qstr */
22133+
22134+/* key assignment functions */
22135+
22136+/* Information from which key of file stat-data can be uniquely
22137+ restored. This depends on key assignment policy for
22138+ stat-data. Currently it's enough to store object id and locality id
22139+ (60+60==120) bits, because minor packing locality and offset of
22140+ stat-data key are always known constants: KEY_SD_MINOR and 0
22141+ respectively. For simplicity 4 bits are wasted in each id, and just
22142+ two 64 bit integers are stored.
22143+
22144+ This field has to be byte-aligned, because we don't want to waste
22145+ space in directory entries. There is another side of a coin of
22146+ course: we waste CPU and bus bandwidth instead, by copying data back
22147+ and forth.
22148+
22149+ Next optimization: &obj_key_id is mainly used to address stat data from
22150+ directory entries. Under the assumption that the majority of files have
22151+ only one name (one hard link) from *the* parent directory it seems reasonable
22152+ to only store objectid of stat data and take its locality from key of
22153+ directory item.
22154+
22155+ This requires some flag to be added to the &obj_key_id to distinguish
22156+ between these two cases. Remaining bits in flag byte are then asking to be
22157+ used to store file type.
22158+
22159+ This optimization requires changes in directory item handling code.
22160+
22161+*/
22162+typedef struct obj_key_id {
22163+ d8 locality[sizeof(__u64)];
22164+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
22165+ )
22166+ d8 objectid[sizeof(__u64)];
22167+}
22168+obj_key_id;
22169+
22170+/* Information sufficient to uniquely identify directory entry within
22171+ compressed directory item.
22172+
22173+ For alignment issues see &obj_key_id above.
22174+*/
22175+typedef struct de_id {
22176+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
22177+ d8 objectid[sizeof(__u64)];
22178+ d8 offset[sizeof(__u64)];
22179+}
22180+de_id;
22181+
22182+extern int inode_onwire_size(const struct inode *obj);
22183+extern char *build_inode_onwire(const struct inode *obj, char *area);
22184+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
22185+
22186+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
22187+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
22188+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
22189+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
22190+extern int build_de_id(const struct inode *dir, const struct qstr *name,
22191+ de_id * id);
22192+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
22193+extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
22194+ reiser4_key * key);
22195+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
22196+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
22197+
22198+extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
22199+extern void build_entry_key_common(const struct inode *dir,
22200+ const struct qstr *name,
22201+ reiser4_key * result);
22202+extern void build_entry_key_stable_entry(const struct inode *dir,
22203+ const struct qstr *name,
22204+ reiser4_key * result);
22205+extern int is_dot_key(const reiser4_key * key);
22206+extern reiser4_key *build_sd_key(const struct inode *target,
22207+ reiser4_key * result);
22208+
22209+extern int is_longname_key(const reiser4_key * key);
22210+extern int is_longname(const char *name, int len);
22211+extern char *extract_name_from_key(const reiser4_key * key, char *buf);
22212+extern char *reiser4_unpack_string(__u64 value, char *buf);
22213+extern void complete_entry_key(const struct inode *dir, const char *name,
22214+ int len, reiser4_key *result);
22215+
22216+/* __KASSIGN_H__ */
22217+#endif
22218+
22219+/* Make Linus happy.
22220+ Local variables:
22221+ c-indentation-style: "K&R"
22222+ mode-name: "LC"
22223+ c-basic-offset: 8
22224+ tab-width: 8
22225+ fill-column: 120
22226+ End:
22227+*/
22228diff --git a/fs/reiser4/key.c b/fs/reiser4/key.c
22229new file mode 100644
22230index 0000000..384c318
22231--- /dev/null
22232+++ b/fs/reiser4/key.c
22233@@ -0,0 +1,137 @@
22234+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22235+
22236+/* Key manipulations. */
22237+
22238+#include "debug.h"
22239+#include "key.h"
22240+#include "super.h"
22241+#include "reiser4.h"
22242+
22243+#include <linux/types.h> /* for __u?? */
22244+
22245+/* Minimal possible key: all components are zero. It is presumed that this is
22246+ independent of key scheme. */
22247+static const reiser4_key MINIMAL_KEY = {
22248+ .el = {
22249+ 0ull,
22250+ ON_LARGE_KEY(0ull,)
22251+ 0ull,
22252+ 0ull
22253+ }
22254+};
22255+
22256+/* Maximal possible key: all components are ~0. It is presumed that this is
22257+ independent of key scheme. */
22258+static const reiser4_key MAXIMAL_KEY = {
22259+ .el = {
22260+ __constant_cpu_to_le64(~0ull),
22261+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
22262+ __constant_cpu_to_le64(~0ull),
22263+ __constant_cpu_to_le64(~0ull)
22264+ }
22265+};
22266+
22267+/* Initialize key. */
22268+void reiser4_key_init(reiser4_key * key /* key to init */ )
22269+{
22270+ assert("nikita-1169", key != NULL);
22271+ memset(key, 0, sizeof *key);
22272+}
22273+
22274+/* minimal possible key in the tree. Return pointer to the static storage. */
22275+const reiser4_key *reiser4_min_key(void)
22276+{
22277+ return &MINIMAL_KEY;
22278+}
22279+
22280+/* maximum possible key in the tree. Return pointer to the static storage. */
22281+const reiser4_key *reiser4_max_key(void)
22282+{
22283+ return &MAXIMAL_KEY;
22284+}
22285+
22286+#if REISER4_DEBUG
22287+/* debugging aid: print symbolic name of key type */
22288+static const char *type_name(unsigned int key_type /* key type */ )
22289+{
22290+ switch (key_type) {
22291+ case KEY_FILE_NAME_MINOR:
22292+ return "file name";
22293+ case KEY_SD_MINOR:
22294+ return "stat data";
22295+ case KEY_ATTR_NAME_MINOR:
22296+ return "attr name";
22297+ case KEY_ATTR_BODY_MINOR:
22298+ return "attr body";
22299+ case KEY_BODY_MINOR:
22300+ return "file body";
22301+ default:
22302+ return "unknown";
22303+ }
22304+}
22305+
22306+/* debugging aid: print human readable information about key */
22307+void reiser4_print_key(const char *prefix /* prefix to print */ ,
22308+ const reiser4_key * key /* key to print */ )
22309+{
22310+ /* turn bold on */
22311+ /* printf ("\033[1m"); */
22312+ if (key == NULL)
22313+ printk("%s: null key\n", prefix);
22314+ else {
22315+ if (REISER4_LARGE_KEY)
22316+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
22317+ get_key_locality(key),
22318+ get_key_type(key),
22319+ get_key_ordering(key),
22320+ get_key_band(key),
22321+ get_key_objectid(key), get_key_offset(key));
22322+ else
22323+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
22324+ get_key_locality(key),
22325+ get_key_type(key),
22326+ get_key_band(key),
22327+ get_key_objectid(key), get_key_offset(key));
22328+ /*
22329+ * if this is a key of directory entry, try to decode part of
22330+ * a name stored in the key, and output it.
22331+ */
22332+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
22333+ char buf[DE_NAME_BUF_LEN];
22334+ char *c;
22335+
22336+ c = buf;
22337+ c = reiser4_unpack_string(get_key_ordering(key), c);
22338+ reiser4_unpack_string(get_key_fulloid(key), c);
22339+ printk("[%s", buf);
22340+ if (is_longname_key(key))
22341+ /*
22342+ * only part of the name is stored in the key.
22343+ */
22344+ printk("...]\n");
22345+ else {
22346+ /*
22347+ * whole name is stored in the key.
22348+ */
22349+ reiser4_unpack_string(get_key_offset(key), buf);
22350+ printk("%s]\n", buf);
22351+ }
22352+ } else {
22353+ printk("[%s]\n", type_name(get_key_type(key)));
22354+ }
22355+ }
22356+ /* turn bold off */
22357+ /* printf ("\033[m\017"); */
22358+}
22359+
22360+#endif
22361+
22362+/* Make Linus happy.
22363+ Local variables:
22364+ c-indentation-style: "K&R"
22365+ mode-name: "LC"
22366+ c-basic-offset: 8
22367+ tab-width: 8
22368+ fill-column: 120
22369+ End:
22370+*/
22371diff --git a/fs/reiser4/key.h b/fs/reiser4/key.h
22372new file mode 100644
22373index 0000000..3f6b47e
22374--- /dev/null
22375+++ b/fs/reiser4/key.h
22376@@ -0,0 +1,384 @@
22377+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22378+
22379+/* Declarations of key-related data-structures and operations on keys. */
22380+
22381+#if !defined( __REISER4_KEY_H__ )
22382+#define __REISER4_KEY_H__
22383+
22384+#include "dformat.h"
22385+#include "forward.h"
22386+#include "debug.h"
22387+
22388+#include <linux/types.h> /* for __u?? */
22389+
22390+/* Operations on keys in reiser4 tree */
22391+
22392+/* No access to any of these fields shall be done except via a
22393+ wrapping macro/function, and that wrapping macro/function shall
22394+ convert to little endian order. Compare keys will consider cpu byte order. */
22395+
22396+/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
22397+ which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
22398+ within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
22399+ approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
22400+ right one. */
22401+
22402+/* possible values for minor packing locality (4 bits required) */
22403+typedef enum {
22404+ /* file name */
22405+ KEY_FILE_NAME_MINOR = 0,
22406+ /* stat-data */
22407+ KEY_SD_MINOR = 1,
22408+ /* file attribute name */
22409+ KEY_ATTR_NAME_MINOR = 2,
22410+ /* file attribute value */
22411+ KEY_ATTR_BODY_MINOR = 3,
22412+ /* file body (tail or extent) */
22413+ KEY_BODY_MINOR = 4,
22414+} key_minor_locality;
22415+
22416+/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
22417+ Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
22418+ and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
22419+ segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
22420+ block_alloc.c to check the node type when deciding where to allocate the node.
22421+
22422+ The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
22423+ should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
22424+ current implementation tails have a different minor packing locality from extents, and no files have both extents and
22425+ tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
22426+*/
22427+
22428+/* Arbitrary major packing localities can be assigned to objects using
22429+ the reiser4(filenameA/..packing<=some_number) system call.
22430+
22431+ In reiser4, the creat() syscall creates a directory
22432+
22433+ whose default flow (that which is referred to if the directory is
22434+ read as a file) is the traditional unix file body.
22435+
22436+ whose directory plugin is the 'filedir'
22437+
22438+ whose major packing locality is that of the parent of the object created.
22439+
22440+ The static_stat item is a particular commonly used directory
22441+ compression (the one for normal unix files).
22442+
22443+ The filedir plugin checks to see if the static_stat item exists.
22444+ There is a unique key for static_stat. If yes, then it uses the
22445+ static_stat item for all of the values that it contains. The
22446+ static_stat item contains a flag for each stat it contains which
22447+ indicates whether one should look outside the static_stat item for its
22448+ contents.
22449+*/
22450+
22451+/* offset of fields in reiser4_key. Value of each element of this enum
22452+ is index within key (thought as array of __u64's) where this field
22453+ is. */
22454+typedef enum {
22455+ /* major "locale", aka dirid. Sits in 1st element */
22456+ KEY_LOCALITY_INDEX = 0,
22457+ /* minor "locale", aka item type. Sits in 1st element */
22458+ KEY_TYPE_INDEX = 0,
22459+ ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22460+ /* "object band". Sits in 2nd element */
22461+ KEY_BAND_INDEX,
22462+ /* objectid. Sits in 2nd element */
22463+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22464+ /* full objectid. Sits in 2nd element */
22465+ KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22466+ /* Offset. Sits in 3rd element */
22467+ KEY_OFFSET_INDEX,
22468+ /* Name hash. Sits in 3rd element */
22469+ KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22470+ KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22471+ KEY_LAST_INDEX
22472+} reiser4_key_field_index;
22473+
22474+/* key in reiser4 internal "balanced" tree. It is just array of three
22475+ 64bit integers in disk byte order (little-endian by default). This
22476+ array is actually indexed by reiser4_key_field. Each __u64 within
22477+ this array is called "element". Logical key component encoded within
22478+ elements are called "fields".
22479+
22480+ We declare this as union with second component dummy to suppress
22481+ inconvenient array<->pointer casts implied in C. */
22482+union reiser4_key {
22483+ __le64 el[KEY_LAST_INDEX];
22484+ int pad;
22485+};
22486+
22487+/* bitmasks showing where within reiser4_key particular key is stored. */
22488+/* major locality occupies higher 60 bits of the first element */
22489+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22490+
22491+/* minor locality occupies lower 4 bits of the first element */
22492+#define KEY_TYPE_MASK 0xfull
22493+
22494+/* controversial band occupies higher 4 bits of the 2nd element */
22495+#define KEY_BAND_MASK 0xf000000000000000ull
22496+
22497+/* objectid occupies lower 60 bits of the 2nd element */
22498+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22499+
22500+/* full 64bit objectid*/
22501+#define KEY_FULLOID_MASK 0xffffffffffffffffull
22502+
22503+/* offset is just the 3rd element itself */
22504+#define KEY_OFFSET_MASK 0xffffffffffffffffull
22505+
22506+/* ordering is whole second element */
22507+#define KEY_ORDERING_MASK 0xffffffffffffffffull
22508+
22509+/* how many bits key element should be shifted to left to get particular field */
22510+typedef enum {
22511+ KEY_LOCALITY_SHIFT = 4,
22512+ KEY_TYPE_SHIFT = 0,
22513+ KEY_BAND_SHIFT = 60,
22514+ KEY_OBJECTID_SHIFT = 0,
22515+ KEY_FULLOID_SHIFT = 0,
22516+ KEY_OFFSET_SHIFT = 0,
22517+ KEY_ORDERING_SHIFT = 0,
22518+} reiser4_key_field_shift;
22519+
22520+static inline __u64
22521+get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22522+{
22523+ assert("nikita-753", key != NULL);
22524+ assert("nikita-754", off < KEY_LAST_INDEX);
22525+ return le64_to_cpu(get_unaligned(&key->el[off]));
22526+}
22527+
22528+static inline void
22529+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22530+{
22531+ assert("nikita-755", key != NULL);
22532+ assert("nikita-756", off < KEY_LAST_INDEX);
22533+ put_unaligned(cpu_to_le64(value), &key->el[off]);
22534+}
22535+
22536+/* macro to define getter and setter functions for field F with type T */
22537+#define DEFINE_KEY_FIELD( L, U, T ) \
22538+static inline T get_key_ ## L ( const reiser4_key *key ) \
22539+{ \
22540+ assert( "nikita-750", key != NULL ); \
22541+ return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22542+ KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22543+} \
22544+ \
22545+static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22546+{ \
22547+ __u64 el; \
22548+ \
22549+ assert( "nikita-752", key != NULL ); \
22550+ \
22551+ el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22552+ /* clear field bits in the key */ \
22553+ el &= ~KEY_ ## U ## _MASK; \
22554+ /* actually it should be \
22555+ \
22556+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22557+ \
22558+ but we trust user to never pass values that wouldn't fit \
22559+ into field. Clearing extra bits is one operation, but this \
22560+ function is time-critical. \
22561+ But check this in assertion. */ \
22562+ assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22563+ ~KEY_ ## U ## _MASK ) == 0 ); \
22564+ el |= ( loc << KEY_ ## U ## _SHIFT ); \
22565+ set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22566+}
22567+
22568+typedef __u64 oid_t;
22569+
22570+/* define get_key_locality(), set_key_locality() */
22571+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22572+/* define get_key_type(), set_key_type() */
22573+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22574+/* define get_key_band(), set_key_band() */
22575+DEFINE_KEY_FIELD(band, BAND, __u64);
22576+/* define get_key_objectid(), set_key_objectid() */
22577+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22578+/* define get_key_fulloid(), set_key_fulloid() */
22579+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22580+/* define get_key_offset(), set_key_offset() */
22581+DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22582+#if (REISER4_LARGE_KEY)
22583+/* define get_key_ordering(), set_key_ordering() */
22584+DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22585+#else
22586+static inline __u64 get_key_ordering(const reiser4_key * key)
22587+{
22588+ return 0;
22589+}
22590+
22591+static inline void set_key_ordering(reiser4_key * key, __u64 val)
22592+{
22593+}
22594+#endif
22595+
22596+/* key comparison result */
22597+typedef enum { LESS_THAN = -1, /* if first key is less than second */
22598+ EQUAL_TO = 0, /* if keys are equal */
22599+ GREATER_THAN = +1 /* if first key is greater than second */
22600+} cmp_t;
22601+
22602+void reiser4_key_init(reiser4_key * key);
22603+
22604+/* minimal possible key in the tree. Return pointer to the static storage. */
22605+extern const reiser4_key *reiser4_min_key(void);
22606+extern const reiser4_key *reiser4_max_key(void);
22607+
22608+/* helper macro for keycmp() */
22609+#define KEY_DIFF(k1, k2, field) \
22610+({ \
22611+ typeof (get_key_ ## field (k1)) f1; \
22612+ typeof (get_key_ ## field (k2)) f2; \
22613+ \
22614+ f1 = get_key_ ## field (k1); \
22615+ f2 = get_key_ ## field (k2); \
22616+ \
22617+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22618+})
22619+
22620+/* helper macro for keycmp() */
22621+#define KEY_DIFF_EL(k1, k2, off) \
22622+({ \
22623+ __u64 e1; \
22624+ __u64 e2; \
22625+ \
22626+ e1 = get_key_el(k1, off); \
22627+ e2 = get_key_el(k2, off); \
22628+ \
22629+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22630+})
22631+
22632+/* compare `k1' and `k2'. This function is a heart of "key allocation
22633+ policy". All you need to implement new policy is to add yet another
22634+ clause here. */
22635+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22636+ const reiser4_key * k2 /* second key to compare */ )
22637+{
22638+ cmp_t result;
22639+
22640+ /*
22641+ * This function is the heart of reiser4 tree-routines. Key comparison
22642+ * is among most heavily used operations in the file system.
22643+ */
22644+
22645+ assert("nikita-439", k1 != NULL);
22646+ assert("nikita-440", k2 != NULL);
22647+
22648+ /* there is no actual branch here: condition is compile time constant
22649+ * and constant folding and propagation ensures that only one branch
22650+ * is actually compiled in. */
22651+
22652+ if (REISER4_PLANA_KEY_ALLOCATION) {
22653+ /* if physical order of fields in a key is identical
22654+ with logical order, we can implement key comparison
22655+ as three 64bit comparisons. */
22656+ /* logical order of fields in plan-a:
22657+ locality->type->objectid->offset. */
22658+ /* compare locality and type at once */
22659+ result = KEY_DIFF_EL(k1, k2, 0);
22660+ if (result == EQUAL_TO) {
22661+ /* compare objectid (and band if it's there) */
22662+ result = KEY_DIFF_EL(k1, k2, 1);
22663+ /* compare offset */
22664+ if (result == EQUAL_TO) {
22665+ result = KEY_DIFF_EL(k1, k2, 2);
22666+ if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22667+ result = KEY_DIFF_EL(k1, k2, 3);
22668+ }
22669+ }
22670+ }
22671+ } else if (REISER4_3_5_KEY_ALLOCATION) {
22672+ result = KEY_DIFF(k1, k2, locality);
22673+ if (result == EQUAL_TO) {
22674+ result = KEY_DIFF(k1, k2, objectid);
22675+ if (result == EQUAL_TO) {
22676+ result = KEY_DIFF(k1, k2, type);
22677+ if (result == EQUAL_TO)
22678+ result = KEY_DIFF(k1, k2, offset);
22679+ }
22680+ }
22681+ } else
22682+ impossible("nikita-441", "Unknown key allocation scheme!");
22683+ return result;
22684+}
22685+
22686+/* true if @k1 equals @k2 */
22687+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22688+ const reiser4_key * k2 /* second key to compare */ )
22689+{
22690+ assert("nikita-1879", k1 != NULL);
22691+ assert("nikita-1880", k2 != NULL);
22692+ return !memcmp(k1, k2, sizeof *k1);
22693+}
22694+
22695+/* true if @k1 is less than @k2 */
22696+static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22697+ const reiser4_key * k2 /* second key to compare */ )
22698+{
22699+ assert("nikita-1952", k1 != NULL);
22700+ assert("nikita-1953", k2 != NULL);
22701+ return keycmp(k1, k2) == LESS_THAN;
22702+}
22703+
22704+/* true if @k1 is less than or equal to @k2 */
22705+static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22706+ const reiser4_key * k2 /* second key to compare */ )
22707+{
22708+ assert("nikita-1954", k1 != NULL);
22709+ assert("nikita-1955", k2 != NULL);
22710+ return keycmp(k1, k2) != GREATER_THAN;
22711+}
22712+
22713+/* true if @k1 is greater than @k2 */
22714+static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22715+ const reiser4_key * k2 /* second key to compare */ )
22716+{
22717+ assert("nikita-1959", k1 != NULL);
22718+ assert("nikita-1960", k2 != NULL);
22719+ return keycmp(k1, k2) == GREATER_THAN;
22720+}
22721+
22722+/* true if @k1 is greater than or equal to @k2 */
22723+static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22724+ const reiser4_key * k2 /* second key to compare */ )
22725+{
22726+ assert("nikita-1956", k1 != NULL);
22727+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22728+ * November 3: Laika */
22729+ return keycmp(k1, k2) != LESS_THAN;
22730+}
22731+
22732+static inline void prefetchkey(reiser4_key * key)
22733+{
22734+ prefetch(key);
22735+ prefetch(&key->el[KEY_CACHELINE_END]);
22736+}
22737+
22738+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22739+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22740+/* size of a buffer suitable to hold human readable key representation */
22741+#define KEY_BUF_LEN (80)
22742+
22743+#if REISER4_DEBUG
22744+extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22745+#else
22746+#define reiser4_print_key(p,k) noop
22747+#endif
22748+
22749+/* __FS_REISERFS_KEY_H__ */
22750+#endif
22751+
22752+/* Make Linus happy.
22753+ Local variables:
22754+ c-indentation-style: "K&R"
22755+ mode-name: "LC"
22756+ c-basic-offset: 8
22757+ tab-width: 8
22758+ fill-column: 120
22759+ End:
22760+*/
22761diff --git a/fs/reiser4/ktxnmgrd.c b/fs/reiser4/ktxnmgrd.c
22762new file mode 100644
22763index 0000000..15bb6d6
22764--- /dev/null
22765+++ b/fs/reiser4/ktxnmgrd.c
22766@@ -0,0 +1,215 @@
22767+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22768+/* Transaction manager daemon. */
22769+
22770+/*
22771+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22772+ * needed/important for the following reasons:
22773+ *
22774+ * 1. in reiser4 atom is not committed immediately when last transaction
22775+ * handle closes, unless atom is either too old or too large (see
22776+ * atom_should_commit()). This is done to avoid committing too frequently.
22777+ * because:
22778+ *
22779+ * 2. sometimes we don't want to commit atom when closing last transaction
22780+ * handle even if it is old and fat enough. For example, because we are at
22781+ * this point under directory semaphore, and committing would stall all
22782+ * accesses to this directory.
22783+ *
22784+ * ktxnmgrd spends its time sleeping on a condition variable. When it awakes
22785+ * either due to (tunable) timeout or because it was explicitly woken up by
22786+ * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones
22787+ * eligible.
22788+ *
22789+ */
22790+
22791+#include "debug.h"
22792+#include "txnmgr.h"
22793+#include "tree.h"
22794+#include "ktxnmgrd.h"
22795+#include "super.h"
22796+#include "reiser4.h"
22797+
22798+#include <linux/sched.h> /* for struct task_struct */
22799+#include <linux/wait.h>
22800+#include <linux/suspend.h>
22801+#include <linux/kernel.h>
22802+#include <linux/writeback.h>
22803+#include <linux/kthread.h>
22804+#include <linux/freezer.h>
22805+
22806+static int scan_mgr(struct super_block *);
22807+
22808+/*
22809+ * change current->comm so that ps, top, and friends will see changed
22810+ * state. This serves no useful purpose whatsoever, but also costs nothing.
22811+ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22812+ */
22813+#define set_comm( state ) \
22814+ snprintf( current -> comm, sizeof( current -> comm ), \
22815+ "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22816+
22817+/**
22818+ * ktxnmgrd - kernel txnmgr daemon
22819+ * @arg: pointer to super block
22820+ *
22821+ * The background transaction manager daemon, started as a kernel thread during
22822+ * reiser4 initialization.
22823+ */
22824+static int ktxnmgrd(void *arg)
22825+{
22826+ struct super_block *super;
22827+ ktxnmgrd_context *ctx;
22828+ txn_mgr *mgr;
22829+ int done = 0;
22830+
22831+ super = arg;
22832+ mgr = &get_super_private(super)->tmgr;
22833+
22834+ /*
22835+ * do_fork() just copies task_struct into the new thread. ->fs_context
22836+ * shouldn't be copied of course. This shouldn't be a problem for the
22837+ * rest of the code though.
22838+ */
22839+ current->journal_info = NULL;
22840+ ctx = mgr->daemon;
22841+ while (1) {
22842+ try_to_freeze();
22843+ set_comm("wait");
22844+ {
22845+ DEFINE_WAIT(__wait);
22846+
22847+ prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22848+ if (kthread_should_stop()) {
22849+ done = 1;
22850+ } else
22851+ schedule_timeout(ctx->timeout);
22852+ finish_wait(&ctx->wait, &__wait);
22853+ }
22854+ if (done)
22855+ break;
22856+ set_comm("run");
22857+ spin_lock(&ctx->guard);
22858+ /*
22859+ * wait timed out or ktxnmgrd was woken up by explicit request
22860+ * to commit something. Scan list of atoms in txnmgr and look
22861+ * for too old atoms.
22862+ */
22863+ do {
22864+ ctx->rescan = 0;
22865+ scan_mgr(super);
22866+ spin_lock(&ctx->guard);
22867+ if (ctx->rescan) {
22868+ /*
22869+ * the list could be modified while ctx
22870+ * spinlock was released, we have to repeat
22871+ * scanning from the beginning
22872+ */
22873+ break;
22874+ }
22875+ } while (ctx->rescan);
22876+ spin_unlock(&ctx->guard);
22877+ }
22878+ return 0;
22879+}
22880+
22881+#undef set_comm
22882+
22883+/**
22884+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22885+ * @super: pointer to super block
22886+ *
22887+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22888+ * manager. Starts kernel txnmgr daemon. This is called on mount.
22889+ */
22890+int reiser4_init_ktxnmgrd(struct super_block *super)
22891+{
22892+ txn_mgr *mgr;
22893+ ktxnmgrd_context *ctx;
22894+
22895+ mgr = &get_super_private(super)->tmgr;
22896+
22897+ assert("zam-1014", mgr->daemon == NULL);
22898+
22899+ ctx = kmalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22900+ if (ctx == NULL)
22901+ return RETERR(-ENOMEM);
22902+
22903+ assert("nikita-2442", ctx != NULL);
22904+
22905+ memset(ctx, 0, sizeof *ctx);
22906+ init_waitqueue_head(&ctx->wait);
22907+
22908+ /*kcond_init(&ctx->startup);*/
22909+ spin_lock_init(&ctx->guard);
22910+ ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22911+ ctx->rescan = 1;
22912+ mgr->daemon = ctx;
22913+
22914+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22915+ if (IS_ERR(ctx->tsk)) {
22916+ int ret = PTR_ERR(ctx->tsk);
22917+ mgr->daemon = NULL;
22918+ kfree(ctx);
22919+ return RETERR(ret);
22920+ }
22921+ return 0;
22922+}
22923+
22924+void ktxnmgrd_kick(txn_mgr *mgr)
22925+{
22926+ assert("nikita-3234", mgr != NULL);
22927+ assert("nikita-3235", mgr->daemon != NULL);
22928+ wake_up(&mgr->daemon->wait);
22929+}
22930+
22931+int is_current_ktxnmgrd(void)
22932+{
22933+ return (get_current_super_private()->tmgr.daemon->tsk == current);
22934+}
22935+
22936+/**
22937+ * scan_mgr - commit atoms which are to be committed
22938+ * @super: super block to commit atoms of
22939+ *
22940+ * Commits old atoms.
22941+ */
22942+static int scan_mgr(struct super_block *super)
22943+{
22944+ int ret;
22945+ reiser4_context ctx;
22946+
22947+ init_stack_context(&ctx, super);
22948+
22949+ ret = commit_some_atoms(&get_super_private(super)->tmgr);
22950+
22951+ reiser4_exit_context(&ctx);
22952+ return ret;
22953+}
22954+
22955+/**
22956+ * reiser4_done_ktxnmgrd - stop kernel thread and frees ktxnmgrd context
22957+ * @mgr:
22958+ *
22959+ * This is called on umount. Stops ktxnmgrd and frees its context.
22960+ */
22961+void reiser4_done_ktxnmgrd(struct super_block *super)
22962+{
22963+ txn_mgr *mgr;
22964+
22965+ mgr = &get_super_private(super)->tmgr;
22966+ assert("zam-1012", mgr->daemon != NULL);
22967+
22968+ kthread_stop(mgr->daemon->tsk);
22969+ kfree(mgr->daemon);
22970+ mgr->daemon = NULL;
22971+}
22972+
22973+/*
22974+ * Local variables:
22975+ * c-indentation-style: "K&R"
22976+ * mode-name: "LC"
22977+ * c-basic-offset: 8
22978+ * tab-width: 8
22979+ * fill-column: 120
22980+ * End:
22981+ */
22982diff --git a/fs/reiser4/ktxnmgrd.h b/fs/reiser4/ktxnmgrd.h
22983new file mode 100644
22984index 0000000..d00f1d9
22985--- /dev/null
22986+++ b/fs/reiser4/ktxnmgrd.h
22987@@ -0,0 +1,52 @@
22988+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22989+ * reiser4/README */
22990+
22991+/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22992+
22993+#ifndef __KTXNMGRD_H__
22994+#define __KTXNMGRD_H__
22995+
22996+#include "txnmgr.h"
22997+
22998+#include <linux/fs.h>
22999+#include <linux/wait.h>
23000+#include <linux/completion.h>
23001+#include <linux/spinlock.h>
23002+#include <asm/atomic.h>
23003+#include <linux/sched.h> /* for struct task_struct */
23004+
23005+/* in this structure all data necessary to start up, shut down and communicate
23006+ * with ktxnmgrd are kept. */
23007+struct ktxnmgrd_context {
23008+ /* wait queue head on which ktxnmgrd sleeps */
23009+ wait_queue_head_t wait;
23010+ /* spin lock protecting all fields of this structure */
23011+ spinlock_t guard;
23012+ /* timeout of sleeping on ->wait */
23013+ signed long timeout;
23014+ /* kernel thread running ktxnmgrd */
23015+ struct task_struct *tsk;
23016+ /* list of all file systems served by this ktxnmgrd */
23017+ struct list_head queue;
23018+ /* should ktxnmgrd repeat scanning of atoms? */
23019+ unsigned int rescan:1;
23020+};
23021+
23022+extern int reiser4_init_ktxnmgrd(struct super_block *);
23023+extern void reiser4_done_ktxnmgrd(struct super_block *);
23024+
23025+extern void ktxnmgrd_kick(txn_mgr * mgr);
23026+extern int is_current_ktxnmgrd(void);
23027+
23028+/* __KTXNMGRD_H__ */
23029+#endif
23030+
23031+/* Make Linus happy.
23032+ Local variables:
23033+ c-indentation-style: "K&R"
23034+ mode-name: "LC"
23035+ c-basic-offset: 8
23036+ tab-width: 8
23037+ fill-column: 120
23038+ End:
23039+*/
23040diff --git a/fs/reiser4/lock.c b/fs/reiser4/lock.c
23041new file mode 100644
23042index 0000000..cdca928
23043--- /dev/null
23044+++ b/fs/reiser4/lock.c
23045@@ -0,0 +1,1232 @@
23046+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
23047+ * reiser4/README */
23048+
23049+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
23050+ order. V4 balances the tree from the bottom up, and searches the tree from
23051+ the top down, and that is really the way we want it, so tradition won't work
23052+ for us.
23053+
23054+ Instead we have two lock orderings, a high priority lock ordering, and a low
23055+ priority lock ordering. Each node in the tree has a lock in its znode.
23056+
23057+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process
23058+ has a set (maybe empty) of already locked nodes ("process locked set"). Each
23059+ process may have a pending lock request to a node locked by another process.
23060+ Note: we lock and unlock, but do not transfer locks: it is possible
23061+ transferring locks instead would save some bus locking....
23062+
23063+ Deadlock occurs when we have a loop constructed from process locked sets and
23064+ lock request vectors.
23065+
23066+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
23067+ memory is extended with "znodes" with which we connect nodes with their left
23068+ and right neighbors using sibling pointers stored in the znodes. When we
23069+ perform balancing operations we often go from left to right and from right to
23070+ left.
23071+
23072+ +-P1-+ +-P3-+
23073+ |+--+| V1 |+--+|
23074+ ||N1|| -------> ||N3||
23075+ |+--+| |+--+|
23076+ +----+ +----+
23077+ ^ |
23078+ |V2 |V3
23079+ | v
23080+ +---------P2---------+
23081+ |+--+ +--+|
23082+ ||N2| -------- |N4||
23083+ |+--+ +--+|
23084+ +--------------------+
23085+
23086+ We solve this by ensuring that only low priority processes lock in top to
23087+ bottom order and from right to left, and high priority processes lock from
23088+ bottom to top and left to right.
23089+
23090+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
23091+ kill those damn busy loops.
23092+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
23093+ stage) cannot be ordered that way. There are no rules what nodes can belong
23094+ to the atom and what nodes cannot. We cannot define what is right or left
23095+ direction, what is top or bottom. We can take immediate parent or side
23096+ neighbor of one node, but nobody guarantees that, say, left neighbor node is
23097+ not a far right neighbor for other nodes from the same atom. It breaks
23098+ deadlock avoidance rules and hi-low priority locking cannot be applied for
23099+ atom locks.
23100+
23101+ How does it help to avoid deadlocks ?
23102+
23103+ Suppose we have a deadlock with n processes. Processes from one priority
23104+ class never deadlock because they take locks in one consistent
23105+ order.
23106+
23107+ So, any possible deadlock loop must have low priority as well as high
23108+ priority processes. There are no other lock priority levels except low and
23109+ high. We know that any deadlock loop contains at least one node locked by a
23110+ low priority process and requested by a high priority process. If this
23111+ situation is caught and resolved it is sufficient to avoid deadlocks.
23112+
23113+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
23114+
23115+ The deadlock prevention algorithm is based on comparing
23116+ priorities of node owners (processes which keep znode locked) and
23117+ requesters (processes which want to acquire a lock on znode). We
23118+ implement a scheme where low-priority owners yield locks to
23119+ high-priority requesters. We created a signal passing system that
23120+ is used to ask low-priority processes to yield one or more locked
23121+ znodes.
23122+
23123+ The condition when a znode needs to change its owners is described by the
23124+ following formula:
23125+
23126+ #############################################
23127+ # #
23128+ # (number of high-priority requesters) > 0 #
23129+ # AND #
23130+ # (numbers of high-priority owners) == 0 #
23131+ # #
23132+ #############################################
23133+
23134+ Note that a low-priority process delays node releasing if another
23135+ high-priority process owns this node. So, slightly more strictly speaking,
23136+ to have a deadlock capable cycle you must have a loop in which a high
23137+ priority process is waiting on a low priority process to yield a node, which
23138+ is slightly different from saying a high priority process is waiting on a
23139+ node owned by a low priority process.
23140+
23141+ It is enough to avoid deadlocks if we prevent any low-priority process from
23142+ falling asleep if its locked set contains a node which satisfies the
23143+ deadlock condition.
23144+
23145+ That condition is implicitly or explicitly checked in all places where new
23146+ high-priority requests may be added or removed from node request queue or
23147+ high-priority process takes or releases a lock on node. The main
23148+ goal of these checks is to never lose the moment when node becomes "has
23149+ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
23150+ at that time.
23151+
23152+ The information about received signals is stored in the per-process
23153+ structure (lock stack) and analyzed before a low-priority process goes to
23154+ sleep but after a "fast" attempt to lock a node fails. Any signal wakes
23155+ sleeping process up and forces him to re-check lock status and received
23156+ signal info. If "must-yield-this-lock" signals were received the locking
23157+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
23158+
23159+ V4 LOCKING DRAWBACKS
23160+
23161+ If we have already balanced on one level, and we are propagating our changes
23162+ upward to a higher level, it could be very messy to surrender all locks on
23163+ the lower level because we put so much computational work into it, and
23164+ reverting them to their state before they were locked might be very complex.
23165+ We also don't want to acquire all locks before performing balancing because
23166+ that would either be almost as much work as the balancing, or it would be
23167+ too conservative and lock too much. We want balancing to be done only at
23168+ high priority. Yet, we might want to go to the left one node and use some
23169+ of its empty space... So we make one attempt at getting the node to the left
23170+ using try_lock, and if it fails we do without it, because we didn't really
23171+ need it, it was only a nice to have.
23172+
23173+ LOCK STRUCTURES DESCRIPTION
23174+
23175+ The following data structures are used in the reiser4 locking
23176+ implementation:
23177+
23178+ All fields related to long-term locking are stored in znode->lock.
23179+
23180+ The lock stack is a per thread object. It owns all znodes locked by the
23181+ thread. One znode may be locked by several threads in case of read lock or
23182+ one znode may be write locked by one thread several times. The special link
23183+ objects (lock handles) support n<->m relation between znodes and lock
23184+ owners.
23185+
23186+ <Thread 1> <Thread 2>
23187+
23188+ +---------+ +---------+
23189+ | LS1 | | LS2 |
23190+ +---------+ +---------+
23191+ ^ ^
23192+ |---------------+ +----------+
23193+ v v v v
23194+ +---------+ +---------+ +---------+ +---------+
23195+ | LH1 | | LH2 | | LH3 | | LH4 |
23196+ +---------+ +---------+ +---------+ +---------+
23197+ ^ ^ ^ ^
23198+ | +------------+ |
23199+ v v v
23200+ +---------+ +---------+ +---------+
23201+ | Z1 | | Z2 | | Z3 |
23202+ +---------+ +---------+ +---------+
23203+
23204+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
23205+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
23206+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
23207+ Z1 is locked by only one thread, znode has only one lock handle LH1 on its
23208+ list, similar situation is for Z3 which is locked by the thread 2 only. Z2
23209+ is locked (for read) twice by different threads and two lock handles are on
23210+ its list. Each lock handle represents a single relation of a locking of a
23211+ znode by a thread. Locking of a znode is an establishing of a locking
23212+ relation between the lock stack and the znode by adding of a new lock handle
23213+ to a list of lock handles, the lock stack. The lock stack links all lock
23214+ handles for all znodes locked by the lock stack. The znode list groups all
23215+ lock handles for all locks stacks which locked the znode.
23216+
23217+ Yet another relation may exist between znode and lock owners. If lock
23218+ procedure cannot immediately take a lock on an object, it adds the lock owner
23219+ to a special `requestors' list that belongs to the znode. That list represents
23220+ a queue of pending lock requests. Because one lock owner may request
23221+ only one lock object at a time, it is a 1->n relation between lock objects
23222+ and a lock owner implemented as it is described above. Full information
23223+ (priority, pointers to lock and link objects) about each lock request is
23224+ stored in lock owner structure in `request' field.
23225+
23226+ SHORT_TERM LOCKING
23227+
23228+ This is a list of primitive operations over lock stacks / lock handles /
23229+ znodes and locking descriptions for them.
23230+
23231+ 1. locking / unlocking which is done by two list insertion/deletion, one
23232+ to/from znode's list of lock handles, another one is to/from lock stack's
23233+ list of lock handles. The first insertion is protected by
23234+ znode->lock.guard spinlock. The list owned by the lock stack can be
23235+ modified only by thread who owns the lock stack and nobody else can
23236+ modify/read it. There is nothing to be protected by a spinlock or
23237+ something else.
23238+
23239+2. adding/removing a lock request to/from znode requestors list. The rule is
23240+ that znode->lock.guard spinlock should be taken for this.
23241+
23242+ 3. we can traverse list of lock handles and use references to lock stacks who
23243+ locked given znode if znode->lock.guard spinlock is taken.
23244+
23245+ 4. If a lock stack is associated with a znode as a lock requestor or lock
23246+ owner its existence is guaranteed by znode->lock.guard spinlock. Some its
23247+ (lock stack's) fields should be protected from being accessed in parallel
23248+ by two or more threads. Please look at lock_stack structure definition
23249+ for the info how those fields are protected. */
23250+
23251+/* Znode lock and capturing intertwining. */
23252+/* In current implementation we capture formatted nodes before locking
23253+ them. Take a look on longterm lock znode, reiser4_try_capture() request
23254+ precedes locking requests. The longterm_lock_znode function unconditionally
23255+ captures znode before even checking of locking conditions.
23256+
23257+ Another variant is to capture znode after locking it. It was not tested, but
23258+ at least one deadlock condition is supposed to be there. One thread has
23259+ locked a znode (Node-1) and calls reiser4_try_capture() for it.
23260+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
23261+ Second thread is a flushing thread, its current atom is the atom Node-1
23262+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
23263+ is locked by the first thread. The described situation is a deadlock. */
23264+
23265+#include "debug.h"
23266+#include "txnmgr.h"
23267+#include "znode.h"
23268+#include "jnode.h"
23269+#include "tree.h"
23270+#include "plugin/node/node.h"
23271+#include "super.h"
23272+
23273+#include <linux/spinlock.h>
23274+
23275+#if REISER4_DEBUG
23276+static int request_is_deadlock_safe(znode *, znode_lock_mode,
23277+ znode_lock_request);
23278+#endif
23279+
23280+/* Returns a lock owner associated with current thread */
23281+lock_stack *get_current_lock_stack(void)
23282+{
23283+ return &get_current_context()->stack;
23284+}
23285+
23286+/* Wakes up all low priority owners informing them about possible deadlock */
23287+static void wake_up_all_lopri_owners(znode * node)
23288+{
23289+ lock_handle *handle;
23290+
23291+ assert_spin_locked(&(node->lock.guard));
23292+ list_for_each_entry(handle, &node->lock.owners, owners_link) {
23293+ assert("nikita-1832", handle->node == node);
23294+ /* count this signal in owner->nr_signaled */
23295+ if (!handle->signaled) {
23296+ handle->signaled = 1;
23297+ atomic_inc(&handle->owner->nr_signaled);
23298+ /* Wake up a single process */
23299+ reiser4_wake_up(handle->owner);
23300+ }
23301+ }
23302+}
23303+
23304+/* Adds a lock to a lock owner, which means creating a link to the lock and
23305+ putting the link into the two lists all links are on (the doubly linked list
23306+ that forms the lock_stack, and the doubly linked list of links attached
23307+ to a lock.
23308+*/
23309+static inline void
23310+link_object(lock_handle * handle, lock_stack * owner, znode * node)
23311+{
23312+ assert("jmacd-810", handle->owner == NULL);
23313+ assert_spin_locked(&(node->lock.guard));
23314+
23315+ handle->owner = owner;
23316+ handle->node = node;
23317+
23318+ assert("reiser4-4",
23319+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
23320+
23321+ /* add lock handle to the end of lock_stack's list of locks */
23322+ list_add_tail(&handle->locks_link, &owner->locks);
23323+ ON_DEBUG(owner->nr_locks++);
23324+ reiser4_ctx_gfp_mask_set();
23325+
23326+ /* add lock handle to the head of znode's list of owners */
23327+ list_add(&handle->owners_link, &node->lock.owners);
23328+ handle->signaled = 0;
23329+}
23330+
23331+/* Breaks a relation between a lock and its owner */
23332+static inline void unlink_object(lock_handle * handle)
23333+{
23334+ assert("zam-354", handle->owner != NULL);
23335+ assert("nikita-1608", handle->node != NULL);
23336+ assert_spin_locked(&(handle->node->lock.guard));
23337+ assert("nikita-1829", handle->owner == get_current_lock_stack());
23338+ assert("reiser4-5", handle->owner->nr_locks > 0);
23339+
23340+ /* remove lock handle from lock_stack's list of locks */
23341+ list_del(&handle->locks_link);
23342+ ON_DEBUG(handle->owner->nr_locks--);
23343+ reiser4_ctx_gfp_mask_set();
23344+ assert("reiser4-6",
23345+ ergo(list_empty_careful(&handle->owner->locks),
23346+ handle->owner->nr_locks == 0));
23347+ /* remove lock handle from znode's list of owners */
23348+ list_del(&handle->owners_link);
23349+ /* indicates that lock handle is free now */
23350+ handle->node = NULL;
23351+#if REISER4_DEBUG
23352+ INIT_LIST_HEAD(&handle->locks_link);
23353+ INIT_LIST_HEAD(&handle->owners_link);
23354+ handle->owner = NULL;
23355+#endif
23356+}
23357+
23358+/* Actually locks an object knowing that we are able to do this */
23359+static void lock_object(lock_stack * owner)
23360+{
23361+ lock_request *request;
23362+ znode *node;
23363+
23364+ request = &owner->request;
23365+ node = request->node;
23366+ assert_spin_locked(&(node->lock.guard));
23367+ if (request->mode == ZNODE_READ_LOCK) {
23368+ node->lock.nr_readers++;
23369+ } else {
23370+ /* check that we didn't switch from read to write lock */
23371+ assert("nikita-1840", node->lock.nr_readers <= 0);
23372+ /* We allow recursive locking; a node can be locked several
23373+ times for write by same process */
23374+ node->lock.nr_readers--;
23375+ }
23376+
23377+ link_object(request->handle, owner, node);
23378+
23379+ if (owner->curpri) {
23380+ node->lock.nr_hipri_owners++;
23381+ }
23382+}
23383+
23384+/* Check for recursive write locking */
23385+static int recursive(lock_stack * owner)
23386+{
23387+ int ret;
23388+ znode *node;
23389+ lock_handle *lh;
23390+
23391+ node = owner->request.node;
23392+
23393+ /* Owners list is not empty for a locked node */
23394+ assert("zam-314", !list_empty_careful(&node->lock.owners));
23395+ assert("nikita-1841", owner == get_current_lock_stack());
23396+ assert_spin_locked(&(node->lock.guard));
23397+
23398+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23399+ ret = (lh->owner == owner);
23400+
23401+ /* Recursive read locking should be done the usual way */
23402+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23403+ /* mixing of read/write locks is not allowed */
23404+ assert("zam-341", !ret || znode_is_wlocked(node));
23405+
23406+ return ret;
23407+}
23408+
23409+#if REISER4_DEBUG
23410+/* Returns true if the lock is held by the calling thread. */
23411+int znode_is_any_locked(const znode * node)
23412+{
23413+ lock_handle *handle;
23414+ lock_stack *stack;
23415+ int ret;
23416+
23417+ if (!znode_is_locked(node)) {
23418+ return 0;
23419+ }
23420+
23421+ stack = get_current_lock_stack();
23422+
23423+ spin_lock_stack(stack);
23424+
23425+ ret = 0;
23426+
23427+ list_for_each_entry(handle, &stack->locks, locks_link) {
23428+ if (handle->node == node) {
23429+ ret = 1;
23430+ break;
23431+ }
23432+ }
23433+
23434+ spin_unlock_stack(stack);
23435+
23436+ return ret;
23437+}
23438+
23439+#endif
23440+
23441+/* Returns true if a write lock is held by the calling thread. */
23442+int znode_is_write_locked(const znode * node)
23443+{
23444+ lock_stack *stack;
23445+ lock_handle *handle;
23446+
23447+ assert("jmacd-8765", node != NULL);
23448+
23449+ if (!znode_is_wlocked(node)) {
23450+ return 0;
23451+ }
23452+
23453+ stack = get_current_lock_stack();
23454+
23455+ /*
23456+ * When znode is write locked, all owner handles point to the same lock
23457+ * stack. Get pointer to lock stack from the first lock handle from
23458+ * znode's owner list
23459+ */
23460+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23461+
23462+ return (handle->owner == stack);
23463+}
23464+
23465+/* This "deadlock" condition is the essential part of reiser4 locking
23466+ implementation. This condition is checked explicitly by calling
23467+ check_deadlock_condition() or implicitly in all places where znode lock
23468+ state (set of owners and request queue) is changed. Locking code is
23469+ designed to use this condition to trigger procedure of passing object from
23470+ low priority owner(s) to high priority one(s).
23471+
23472+ The procedure results in passing an event (setting lock_handle->signaled
23473+ flag) and counting this event in nr_signaled field of owner's lock stack
23474+ object and wakeup owner's process.
23475+*/
23476+static inline int check_deadlock_condition(znode * node)
23477+{
23478+ assert_spin_locked(&(node->lock.guard));
23479+ return node->lock.nr_hipri_requests > 0
23480+ && node->lock.nr_hipri_owners == 0;
23481+}
23482+
23483+static int check_livelock_condition(znode * node, znode_lock_mode mode)
23484+{
23485+ zlock * lock = &node->lock;
23486+
23487+ return mode == ZNODE_READ_LOCK &&
23488+ lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23489+}
23490+
23491+/* checks lock/request compatibility */
23492+static int can_lock_object(lock_stack * owner)
23493+{
23494+ znode *node = owner->request.node;
23495+
23496+ assert_spin_locked(&(node->lock.guard));
23497+
23498+ /* See if the node is disconnected. */
23499+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23500+ return RETERR(-EINVAL);
23501+
23502+ /* Do not ever try to take a lock if we are going in low priority
23503+ direction and a node has a high priority request without high
23504+ priority owners. */
23505+ if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23506+ return RETERR(-E_REPEAT);
23507+ if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23508+ return RETERR(-E_REPEAT);
23509+ if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23510+ return RETERR(-E_REPEAT);
23511+ return 0;
23512+}
23513+
23514+/* Setting of a high priority to the process. It clears "signaled" flags
23515+ because znode locked by high-priority process can't satisfy our "deadlock
23516+ condition". */
23517+static void set_high_priority(lock_stack * owner)
23518+{
23519+ assert("nikita-1846", owner == get_current_lock_stack());
23520+ /* Do nothing if current priority is already high */
23521+ if (!owner->curpri) {
23522+ /* We don't need locking for owner->locks list, because, this
23523+ * function is only called with the lock stack of the current
23524+ * thread, and no other thread can play with owner->locks list
23525+ * and/or change ->node pointers of lock handles in this list.
23526+ *
23527+ * (Interrupts also are not involved.)
23528+ */
23529+ lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23530+ while (&owner->locks != &item->locks_link) {
23531+ znode *node = item->node;
23532+
23533+ spin_lock_zlock(&node->lock);
23534+
23535+ node->lock.nr_hipri_owners++;
23536+
23537+ /* we can safely set signaled to zero, because
23538+ previous statement (nr_hipri_owners ++) guarantees
23539+ that signaled will never be set again. */
23540+ item->signaled = 0;
23541+ spin_unlock_zlock(&node->lock);
23542+
23543+ item = list_entry(item->locks_link.next, lock_handle, locks_link);
23544+ }
23545+ owner->curpri = 1;
23546+ atomic_set(&owner->nr_signaled, 0);
23547+ }
23548+}
23549+
23550+/* Sets a low priority to the process. */
23551+static void set_low_priority(lock_stack * owner)
23552+{
23553+ assert("nikita-3075", owner == get_current_lock_stack());
23554+ /* Do nothing if current priority is already low */
23555+ if (owner->curpri) {
23556+ /* scan all locks (lock handles) held by @owner, which is
23557+ actually current thread, and check whether we are reaching
23558+ deadlock possibility anywhere.
23559+ */
23560+ lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23561+ while (&owner->locks != &handle->locks_link) {
23562+ znode *node = handle->node;
23563+ spin_lock_zlock(&node->lock);
23564+ /* this thread just was hipri owner of @node, so
23565+ nr_hipri_owners has to be greater than zero. */
23566+ assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23567+ node->lock.nr_hipri_owners--;
23568+ /* If we have deadlock condition, adjust a nr_signaled
23569+ field. It is enough to set "signaled" flag only for
23570+ current process, other low-pri owners will be
23571+ signaled and woken up after the current process unlocks
23572+ this object and any high-priority requestor takes
23573+ control. */
23574+ if (check_deadlock_condition(node)
23575+ && !handle->signaled) {
23576+ handle->signaled = 1;
23577+ atomic_inc(&owner->nr_signaled);
23578+ }
23579+ spin_unlock_zlock(&node->lock);
23580+ handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23581+ }
23582+ owner->curpri = 0;
23583+ }
23584+}
23585+
23586+static void remove_lock_request(lock_stack * requestor)
23587+{
23588+ zlock * lock = &requestor->request.node->lock;
23589+
23590+ if (requestor->curpri) {
23591+ assert("nikita-1838", lock->nr_hipri_requests > 0);
23592+ lock->nr_hipri_requests--;
23593+ if (requestor->request.mode == ZNODE_WRITE_LOCK)
23594+ lock->nr_hipri_write_requests --;
23595+ }
23596+ list_del(&requestor->requestors_link);
23597+}
23598+
23599+static void invalidate_all_lock_requests(znode * node)
23600+{
23601+ lock_stack *requestor, *tmp;
23602+
23603+ assert_spin_locked(&(node->lock.guard));
23604+
23605+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23606+ remove_lock_request(requestor);
23607+ requestor->request.ret_code = -EINVAL;
23608+ reiser4_wake_up(requestor);
23609+ requestor->request.mode = ZNODE_NO_LOCK;
23610+ }
23611+}
23612+
23613+static void dispatch_lock_requests(znode * node)
23614+{
23615+ lock_stack *requestor, *tmp;
23616+
23617+ assert_spin_locked(&(node->lock.guard));
23618+
23619+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23620+ if (znode_is_write_locked(node))
23621+ break;
23622+ if (!can_lock_object(requestor)) {
23623+ lock_object(requestor);
23624+ remove_lock_request(requestor);
23625+ requestor->request.ret_code = 0;
23626+ reiser4_wake_up(requestor);
23627+ requestor->request.mode = ZNODE_NO_LOCK;
23628+ }
23629+ }
23630+}
23631+
23632+/* release long-term lock, acquired by longterm_lock_znode() */
23633+void longterm_unlock_znode(lock_handle * handle)
23634+{
23635+ znode *node = handle->node;
23636+ lock_stack *oldowner = handle->owner;
23637+ int hipri;
23638+ int readers;
23639+ int rdelta;
23640+ int youdie;
23641+
23642+ /*
23643+ * this is time-critical and highly optimized code. Modify carefully.
23644+ */
23645+
23646+ assert("jmacd-1021", handle != NULL);
23647+ assert("jmacd-1022", handle->owner != NULL);
23648+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23649+
23650+ assert("zam-130", oldowner == get_current_lock_stack());
23651+
23652+ LOCK_CNT_DEC(long_term_locked_znode);
23653+
23654+ /*
23655+ * to minimize amount of operations performed under lock, pre-compute
23656+ * all variables used within critical section. This makes code
23657+ * obscure.
23658+ */
23659+
23660+ /* was this lock of hi or lo priority */
23661+ hipri = oldowner->curpri ? 1 : 0;
23662+ /* number of readers */
23663+ readers = node->lock.nr_readers;
23664+ /* +1 if write lock, -1 if read lock */
23665+ rdelta = (readers > 0) ? -1 : +1;
23666+ /* true if node is to die and write lock is released */
23667+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23668+
23669+ spin_lock_zlock(&node->lock);
23670+
23671+ assert("zam-101", znode_is_locked(node));
23672+
23673+ /* Adjust a number of high priority owners of this lock */
23674+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23675+ node->lock.nr_hipri_owners -= hipri;
23676+
23677+ /* Handle znode deallocation on last write-lock release. */
23678+ if (znode_is_wlocked_once(node)) {
23679+ if (youdie) {
23680+ forget_znode(handle);
23681+ assert("nikita-2191", znode_invariant(node));
23682+ zput(node);
23683+ return;
23684+ }
23685+ }
23686+
23687+ if (handle->signaled)
23688+ atomic_dec(&oldowner->nr_signaled);
23689+
23690+ /* Unlocking means owner<->object link deletion */
23691+ unlink_object(handle);
23692+
23693+ /* This is enough to be sure whether an object is completely
23694+ unlocked. */
23695+ node->lock.nr_readers += rdelta;
23696+
23697+ /* If the node is locked it must have an owners list. Likewise, if
23698+ the node is unlocked it must have an empty owners list. */
23699+ assert("zam-319", equi(znode_is_locked(node),
23700+ !list_empty_careful(&node->lock.owners)));
23701+
23702+#if REISER4_DEBUG
23703+ if (!znode_is_locked(node))
23704+ ++node->times_locked;
23705+#endif
23706+
23707+ /* If there are pending lock requests we wake up a requestor */
23708+ if (!znode_is_wlocked(node))
23709+ dispatch_lock_requests(node);
23710+ if (check_deadlock_condition(node))
23711+ wake_up_all_lopri_owners(node);
23712+ spin_unlock_zlock(&node->lock);
23713+
23714+ /* minus one reference from handle->node */
23715+ assert("nikita-2190", znode_invariant(node));
23716+ ON_DEBUG(check_lock_data());
23717+ ON_DEBUG(check_lock_node_data(node));
23718+ zput(node);
23719+}
23720+
23721+/* final portion of longterm-lock */
23722+static int
23723+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23724+{
23725+ znode *node = owner->request.node;
23726+
23727+ assert_spin_locked(&(node->lock.guard));
23728+
23729+ /* If we broke with (ok == 0) it means we can_lock, now do it. */
23730+ if (ok == 0) {
23731+ lock_object(owner);
23732+ owner->request.mode = 0;
23733+ /* count a reference from lockhandle->node
23734+
23735+ znode was already referenced at the entry to this function,
23736+ hence taking spin-lock here is not necessary (see comment
23737+ in the zref()).
23738+ */
23739+ zref(node);
23740+
23741+ LOCK_CNT_INC(long_term_locked_znode);
23742+ }
23743+ spin_unlock_zlock(&node->lock);
23744+ ON_DEBUG(check_lock_data());
23745+ ON_DEBUG(check_lock_node_data(node));
23746+ return ok;
23747+}
23748+
23749+/*
23750+ * version of longterm_znode_lock() optimized for the most common case: read
23751+ * lock without any special flags. This is the kind of lock that any tree
23752+ * traversal takes on the root node of the tree, which is very frequent.
23753+ */
23754+static int longterm_lock_tryfast(lock_stack * owner)
23755+{
23756+ int result;
23757+ znode *node;
23758+ zlock *lock;
23759+
23760+ node = owner->request.node;
23761+ lock = &node->lock;
23762+
23763+ assert("nikita-3340", reiser4_schedulable());
23764+ assert("nikita-3341", request_is_deadlock_safe(node,
23765+ ZNODE_READ_LOCK,
23766+ ZNODE_LOCK_LOPRI));
23767+ spin_lock_zlock(lock);
23768+ result = can_lock_object(owner);
23769+ spin_unlock_zlock(lock);
23770+
23771+ if (likely(result != -EINVAL)) {
23772+ spin_lock_znode(node);
23773+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23774+ spin_unlock_znode(node);
23775+ spin_lock_zlock(lock);
23776+ if (unlikely(result != 0)) {
23777+ owner->request.mode = 0;
23778+ } else {
23779+ result = can_lock_object(owner);
23780+ if (unlikely(result == -E_REPEAT)) {
23781+ /* fall back to longterm_lock_znode() */
23782+ spin_unlock_zlock(lock);
23783+ return 1;
23784+ }
23785+ }
23786+ return lock_tail(owner, result, ZNODE_READ_LOCK);
23787+ } else
23788+ return 1;
23789+}
23790+
23791+/* locks given lock object */
23792+int longterm_lock_znode(
23793+ /* local link object (allocated by lock owner thread, usually on its own
23794+ * stack) */
23795+ lock_handle * handle,
23796+ /* znode we want to lock. */
23797+ znode * node,
23798+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23799+ znode_lock_mode mode,
23800+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23801+ znode_lock_request request) {
23802+ int ret;
23803+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23804+ int non_blocking = 0;
23805+ int has_atom;
23806+ txn_capture cap_flags;
23807+ zlock *lock;
23808+ txn_handle *txnh;
23809+ tree_level level;
23810+
23811+ /* Get current process context */
23812+ lock_stack *owner = get_current_lock_stack();
23813+
23814+ /* Check that the lock handle is initialized and isn't already being
23815+ * used. */
23816+ assert("jmacd-808", handle->owner == NULL);
23817+ assert("nikita-3026", reiser4_schedulable());
23818+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23819+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23820+ /* long term locks are not allowed in the VM contexts (->writepage(),
23821+ * prune_{d,i}cache()).
23822+ *
23823+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23824+ * bug caused by d_splice_alias() only working for directories.
23825+ */
23826+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23827+ assert ("zam-1055", mode != ZNODE_NO_LOCK);
23828+
23829+ cap_flags = 0;
23830+ if (request & ZNODE_LOCK_NONBLOCK) {
23831+ cap_flags |= TXN_CAPTURE_NONBLOCKING;
23832+ non_blocking = 1;
23833+ }
23834+
23835+ if (request & ZNODE_LOCK_DONT_FUSE)
23836+ cap_flags |= TXN_CAPTURE_DONT_FUSE;
23837+
23838+ /* If we are changing our process priority we must adjust a number
23839+ of high priority owners for each znode that we already lock */
23840+ if (hipri) {
23841+ set_high_priority(owner);
23842+ } else {
23843+ set_low_priority(owner);
23844+ }
23845+
23846+ level = znode_get_level(node);
23847+
23848+ /* Fill request structure with our values. */
23849+ owner->request.mode = mode;
23850+ owner->request.handle = handle;
23851+ owner->request.node = node;
23852+
23853+ txnh = get_current_context()->trans;
23854+ lock = &node->lock;
23855+
23856+ if (mode == ZNODE_READ_LOCK && request == 0) {
23857+ ret = longterm_lock_tryfast(owner);
23858+ if (ret <= 0)
23859+ return ret;
23860+ }
23861+
23862+ has_atom = (txnh->atom != NULL);
23863+
23864+ /* Synchronize on node's zlock guard lock. */
23865+ spin_lock_zlock(lock);
23866+
23867+ if (znode_is_locked(node) &&
23868+ mode == ZNODE_WRITE_LOCK && recursive(owner))
23869+ return lock_tail(owner, 0, mode);
23870+
23871+ for (;;) {
23872+ /* Check the lock's availability: if it is unavailable we get
23873+ E_REPEAT, 0 indicates "can_lock", otherwise the node is
23874+ invalid. */
23875+ ret = can_lock_object(owner);
23876+
23877+ if (unlikely(ret == -EINVAL)) {
23878+ /* @node is dying. Leave it alone. */
23879+ break;
23880+ }
23881+
23882+ if (unlikely(ret == -E_REPEAT && non_blocking)) {
23883+ /* either locking of @node by the current thread will
23884+ * lead to the deadlock, or lock modes are
23885+ * incompatible. */
23886+ break;
23887+ }
23888+
23889+ assert("nikita-1844", (ret == 0)
23890+ || ((ret == -E_REPEAT) && !non_blocking));
23891+ /* If we can get the lock... Try to capture first before
23892+ taking the lock. */
23893+
23894+ /* first handle commonest case where node and txnh are already
23895+ * in the same atom. */
23896+ /* safe to do without taking locks, because:
23897+ *
23898+ * 1. read of aligned word is atomic with respect to writes to
23899+ * this word
23900+ *
23901+ * 2. false negatives are handled in reiser4_try_capture().
23902+ *
23903+ * 3. false positives are impossible.
23904+ *
23905+ * PROOF: left as an exercise to the curious reader.
23906+ *
23907+ * Just kidding. Here is one:
23908+ *
23909+ * At the time T0 txnh->atom is stored in txnh_atom.
23910+ *
23911+ * At the time T1 node->atom is stored in node_atom.
23912+ *
23913+ * At the time T2 we observe that
23914+ *
23915+ * txnh_atom != NULL && node_atom == txnh_atom.
23916+ *
23917+ * Imagine that at this moment we acquire node and txnh spin
23918+ * lock in this order. Suppose that under spin lock we have
23919+ *
23920+ * node->atom != txnh->atom, (S1)
23921+ *
23922+ * at the time T3.
23923+ *
23924+ * txnh->atom != NULL still, because txnh is open by the
23925+ * current thread.
23926+ *
23927+ * Suppose node->atom == NULL, that is, node was un-captured
23928+ * between T1, and T3. But un-capturing of formatted node is
23929+ * always preceded by the call to reiser4_invalidate_lock(),
23930+ * which marks znode as JNODE_IS_DYING under zlock spin
23931+ * lock. Contradiction, because can_lock_object() above checks
23932+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23933+ *
23934+ * Suppose that node->atom != node_atom, that is, atom, node
23935+ * belongs to was fused into another atom: node_atom was fused
23936+ * into node->atom. Atom of txnh was equal to node_atom at T2,
23937+ * which means that under spin lock, txnh->atom == node->atom,
23938+ * because txnh->atom can only follow fusion
23939+ * chain. Contradicts S1.
23940+ *
23941+ * The same for hypothesis txnh->atom != txnh_atom. Hence,
23942+ * node->atom == node_atom == txnh_atom == txnh->atom. Again
23943+ * contradicts S1. Hence S1 is false. QED.
23944+ *
23945+ */
23946+
23947+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23948+ ;
23949+ } else {
23950+ /*
23951+ * unlock zlock spin lock here. It is possible for
23952+ * longterm_unlock_znode() to sneak in here, but there
23953+ * is no harm: reiser4_invalidate_lock() will mark znode
23954+ * as JNODE_IS_DYING and this will be noted by
23955+ * can_lock_object() below.
23956+ */
23957+ spin_unlock_zlock(lock);
23958+ spin_lock_znode(node);
23959+ ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23960+ spin_unlock_znode(node);
23961+ spin_lock_zlock(lock);
23962+ if (unlikely(ret != 0)) {
23963+ /* In the failure case, the txnmgr releases
23964+ the znode's lock (or in some cases, it was
23965+ released a while ago). There's no need to
23966+ reacquire it so we should return here,
23967+ and avoid releasing the lock. */
23968+ owner->request.mode = 0;
23969+ break;
23970+ }
23971+
23972+ /* Check the lock's availability again -- this is
23973+ because under some circumstances the capture code
23974+ has to release and reacquire the znode spinlock. */
23975+ ret = can_lock_object(owner);
23976+ }
23977+
23978+ /* This time, a return of (ret == 0) means we can lock, so we
23979+ should break out of the loop. */
23980+ if (likely(ret != -E_REPEAT || non_blocking))
23981+ break;
23982+
23983+ /* Lock is unavailable, we have to wait. */
23984+ ret = reiser4_prepare_to_sleep(owner);
23985+ if (unlikely(ret != 0))
23986+ break;
23987+
23988+ assert_spin_locked(&(node->lock.guard));
23989+ if (hipri) {
23990+ /* If we are going in high priority direction then
23991+ increase high priority requests counter for the
23992+ node */
23993+ lock->nr_hipri_requests++;
23994+ if (mode == ZNODE_WRITE_LOCK)
23995+ lock->nr_hipri_write_requests ++;
23996+ /* If there are no high priority owners for a node,
23997+ then immediately wake up low priority owners, so
23998+ they can detect possible deadlock */
23999+ if (lock->nr_hipri_owners == 0)
24000+ wake_up_all_lopri_owners(node);
24001+ }
24002+ list_add_tail(&owner->requestors_link, &lock->requestors);
24003+
24004+ /* Ok, here we have prepared a lock request, so unlock
24005+ a znode ... */
24006+ spin_unlock_zlock(lock);
24007+ /* ... and sleep */
24008+ reiser4_go_to_sleep(owner);
24009+ if (owner->request.mode == ZNODE_NO_LOCK)
24010+ goto request_is_done;
24011+ spin_lock_zlock(lock);
24012+ if (owner->request.mode == ZNODE_NO_LOCK) {
24013+ spin_unlock_zlock(lock);
24014+ request_is_done:
24015+ if (owner->request.ret_code == 0) {
24016+ LOCK_CNT_INC(long_term_locked_znode);
24017+ zref(node);
24018+ }
24019+ return owner->request.ret_code;
24020+ }
24021+ remove_lock_request(owner);
24022+ }
24023+
24024+ return lock_tail(owner, ret, mode);
24025+}
24026+
24027+/* lock object invalidation means changing of lock object state to `INVALID'
24028+ and waiting for all other processes to cancel their lock requests. */
24029+void reiser4_invalidate_lock(lock_handle * handle /* path to lock
24030+ * owner and lock
24031+ * object is being
24032+ * invalidated. */ )
24033+{
24034+ znode *node = handle->node;
24035+ lock_stack *owner = handle->owner;
24036+
24037+ assert("zam-325", owner == get_current_lock_stack());
24038+ assert("zam-103", znode_is_write_locked(node));
24039+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
24040+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
24041+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
24042+ assert("nikita-3097", znode_is_wlocked_once(node));
24043+ assert_spin_locked(&(node->lock.guard));
24044+
24045+ if (handle->signaled)
24046+ atomic_dec(&owner->nr_signaled);
24047+
24048+ ZF_SET(node, JNODE_IS_DYING);
24049+ unlink_object(handle);
24050+ node->lock.nr_readers = 0;
24051+
24052+ invalidate_all_lock_requests(node);
24053+ spin_unlock_zlock(&node->lock);
24054+}
24055+
24056+/* Initializes lock_stack. */
24057+void init_lock_stack(lock_stack * owner /* pointer to
24058+ * allocated
24059+ * structure. */ )
24060+{
24061+ INIT_LIST_HEAD(&owner->locks);
24062+ INIT_LIST_HEAD(&owner->requestors_link);
24063+ spin_lock_init(&owner->sguard);
24064+ owner->curpri = 1;
24065+ init_waitqueue_head(&owner->wait);
24066+}
24067+
24068+/* Initializes lock object. */
24069+void reiser4_init_lock(zlock * lock /* pointer on allocated
24070+ * uninitialized lock object
24071+ * structure. */ )
24072+{
24073+ memset(lock, 0, sizeof(zlock));
24074+ spin_lock_init(&lock->guard);
24075+ INIT_LIST_HEAD(&lock->requestors);
24076+ INIT_LIST_HEAD(&lock->owners);
24077+}
24078+
24079+/* Transfer a lock handle (presumably so that variables can be moved between stack and
24080+ heap locations). */
24081+static void
24082+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
24083+{
24084+ znode *node = old->node;
24085+ lock_stack *owner = old->owner;
24086+ int signaled;
24087+
24088+ /* locks_list, modified by link_object() is not protected by
24089+ anything. This is valid because only current thread ever modifies
24090+ locks_list of its lock_stack.
24091+ */
24092+ assert("nikita-1827", owner == get_current_lock_stack());
24093+ assert("nikita-1831", new->owner == NULL);
24094+
24095+ spin_lock_zlock(&node->lock);
24096+
24097+ signaled = old->signaled;
24098+ if (unlink_old) {
24099+ unlink_object(old);
24100+ } else {
24101+ if (node->lock.nr_readers > 0) {
24102+ node->lock.nr_readers += 1;
24103+ } else {
24104+ node->lock.nr_readers -= 1;
24105+ }
24106+ if (signaled) {
24107+ atomic_inc(&owner->nr_signaled);
24108+ }
24109+ if (owner->curpri) {
24110+ node->lock.nr_hipri_owners += 1;
24111+ }
24112+ LOCK_CNT_INC(long_term_locked_znode);
24113+
24114+ zref(node);
24115+ }
24116+ link_object(new, owner, node);
24117+ new->signaled = signaled;
24118+
24119+ spin_unlock_zlock(&node->lock);
24120+}
24121+
24122+void move_lh(lock_handle * new, lock_handle * old)
24123+{
24124+ move_lh_internal(new, old, /*unlink_old */ 1);
24125+}
24126+
24127+void copy_lh(lock_handle * new, lock_handle * old)
24128+{
24129+ move_lh_internal(new, old, /*unlink_old */ 0);
24130+}
24131+
24132+/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
24133+int reiser4_check_deadlock(void)
24134+{
24135+ lock_stack *owner = get_current_lock_stack();
24136+ return atomic_read(&owner->nr_signaled) != 0;
24137+}
24138+
24139+/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock
24140+ priorities. */
24141+int reiser4_prepare_to_sleep(lock_stack * owner)
24142+{
24143+ assert("nikita-1847", owner == get_current_lock_stack());
24144+
24145+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are
24146+ * counted in nr_signaled */
24147+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
24148+ assert("zam-959", !owner->curpri);
24149+ return RETERR(-E_DEADLOCK);
24150+ }
24151+ return 0;
24152+}
24153+
24154+/* Wakes up a single thread */
24155+void __reiser4_wake_up(lock_stack * owner)
24156+{
24157+ atomic_set(&owner->wakeup, 1);
24158+ wake_up(&owner->wait);
24159+}
24160+
24161+/* Puts a thread to sleep */
24162+void reiser4_go_to_sleep(lock_stack * owner)
24163+{
24164+ /* Well, we might sleep here, so holding of any spinlocks is no-no */
24165+ assert("nikita-3027", reiser4_schedulable());
24166+
24167+ wait_event(owner->wait, atomic_read(&owner->wakeup));
24168+ atomic_set(&owner->wakeup, 0);
24169+}
24170+
24171+int lock_stack_isclean(lock_stack * owner)
24172+{
24173+ if (list_empty_careful(&owner->locks)) {
24174+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
24175+ return 1;
24176+ }
24177+
24178+ return 0;
24179+}
24180+
24181+#if REISER4_DEBUG
24182+
24183+/*
24184+ * debugging functions
24185+ */
24186+
24187+static void list_check(struct list_head *head)
24188+{
24189+ struct list_head *pos;
24190+
24191+ list_for_each(pos, head)
24192+ assert("", (pos->prev != NULL && pos->next != NULL &&
24193+ pos->prev->next == pos && pos->next->prev == pos));
24194+}
24195+
24196+/* check consistency of locking data-structures hanging of the @stack */
24197+static void check_lock_stack(lock_stack * stack)
24198+{
24199+ spin_lock_stack(stack);
24200+ /* check that stack->locks is not corrupted */
24201+ list_check(&stack->locks);
24202+ spin_unlock_stack(stack);
24203+}
24204+
24205+/* check consistency of locking data structures */
24206+void check_lock_data(void)
24207+{
24208+ check_lock_stack(&get_current_context()->stack);
24209+}
24210+
24211+/* check consistency of locking data structures for @node */
24212+void check_lock_node_data(znode * node)
24213+{
24214+ spin_lock_zlock(&node->lock);
24215+ list_check(&node->lock.owners);
24216+ list_check(&node->lock.requestors);
24217+ spin_unlock_zlock(&node->lock);
24218+}
24219+
24220+/* check that given lock request is dead lock safe. This check is, of course,
24221+ * not exhaustive. */
24222+static int
24223+request_is_deadlock_safe(znode * node, znode_lock_mode mode,
24224+ znode_lock_request request)
24225+{
24226+ lock_stack *owner;
24227+
24228+ owner = get_current_lock_stack();
24229+ /*
24230+ * check that hipri lock request is not issued when there are locked
24231+ * nodes at the higher levels.
24232+ */
24233+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
24234+ znode_get_level(node) != 0) {
24235+ lock_handle *item;
24236+
24237+ list_for_each_entry(item, &owner->locks, locks_link) {
24238+ znode *other;
24239+
24240+ other = item->node;
24241+
24242+ if (znode_get_level(other) == 0)
24243+ continue;
24244+ if (znode_get_level(other) > znode_get_level(node))
24245+ return 0;
24246+ }
24247+ }
24248+ return 1;
24249+}
24250+
24251+#endif
24252+
24253+/* return pointer to static storage with name of lock_mode. For
24254+ debugging */
24255+const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
24256+{
24257+ if (lock == ZNODE_READ_LOCK)
24258+ return "read";
24259+ else if (lock == ZNODE_WRITE_LOCK)
24260+ return "write";
24261+ else {
24262+ static char buf[30];
24263+
24264+ sprintf(buf, "unknown: %i", lock);
24265+ return buf;
24266+ }
24267+}
24268+
24269+/* Make Linus happy.
24270+ Local variables:
24271+ c-indentation-style: "K&R"
24272+ mode-name: "LC"
24273+ c-basic-offset: 8
24274+ tab-width: 8
24275+ fill-column: 79
24276+ End:
24277+*/
24278diff --git a/fs/reiser4/lock.h b/fs/reiser4/lock.h
24279new file mode 100644
24280index 0000000..e130466
24281--- /dev/null
24282+++ b/fs/reiser4/lock.h
24283@@ -0,0 +1,249 @@
24284+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
24285+
24286+/* Long term locking data structures. See lock.c for details. */
24287+
24288+#ifndef __LOCK_H__
24289+#define __LOCK_H__
24290+
24291+#include "forward.h"
24292+#include "debug.h"
24293+#include "dformat.h"
24294+#include "key.h"
24295+#include "coord.h"
24296+#include "plugin/node/node.h"
24297+#include "txnmgr.h"
24298+#include "readahead.h"
24299+
24300+#include <linux/types.h>
24301+#include <linux/spinlock.h>
24302+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
24303+#include <asm/atomic.h>
24304+#include <linux/wait.h>
24305+
24306+/* Per-znode lock object */
24307+struct zlock {
24308+ spinlock_t guard;
24309+ /* The number of readers if positive; the number of recursively taken
24310+ write locks if negative. Protected by zlock spin lock. */
24311+ int nr_readers;
24312+ /* A number of processes (lock_stacks) that have this object
24313+ locked with high priority */
24314+ unsigned nr_hipri_owners;
24315+ /* A number of attempts to lock znode in high priority direction */
24316+ unsigned nr_hipri_requests;
24317+ /* A linked list of lock_handle objects that contains pointers
24318+ for all lock_stacks which have this lock object locked */
24319+ unsigned nr_hipri_write_requests;
24320+ struct list_head owners;
24321+ /* A linked list of lock_stacks that wait for this lock */
24322+ struct list_head requestors;
24323+};
24324+
24325+static inline void spin_lock_zlock(zlock *lock)
24326+{
24327+ /* check that zlock is not locked */
24328+ assert("", LOCK_CNT_NIL(spin_locked_zlock));
24329+ /* check that spinlocks of lower priorities are not held */
24330+ assert("", LOCK_CNT_NIL(spin_locked_stack));
24331+
24332+ spin_lock(&lock->guard);
24333+
24334+ LOCK_CNT_INC(spin_locked_zlock);
24335+ LOCK_CNT_INC(spin_locked);
24336+}
24337+
24338+static inline void spin_unlock_zlock(zlock *lock)
24339+{
24340+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
24341+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24342+
24343+ LOCK_CNT_DEC(spin_locked_zlock);
24344+ LOCK_CNT_DEC(spin_locked);
24345+
24346+ spin_unlock(&lock->guard);
24347+}
24348+
24349+#define lock_is_locked(lock) ((lock)->nr_readers != 0)
24350+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
24351+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
24352+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
24353+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0)
24354+#define lock_mode_compatible(lock, mode) \
24355+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
24356+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
24357+
24358+/* Since we have R/W znode locks we need additional bidirectional `link'
24359+ objects to implement n<->m relationship between lock owners and lock
24360+ objects. We call them `lock handles'.
24361+
24362+ Locking: see lock.c/"SHORT-TERM LOCKING"
24363+*/
24364+struct lock_handle {
24365+ /* This flag indicates that a signal to yield a lock was passed to
24366+ lock owner and counted in owner->nr_signalled
24367+
24368+ Locking: this is accessed under spin lock on ->node.
24369+ */
24370+ int signaled;
24371+ /* A link to owner of a lock */
24372+ lock_stack *owner;
24373+ /* A link to znode locked */
24374+ znode *node;
24375+ /* A list of all locks for a process */
24376+ struct list_head locks_link;
24377+ /* A list of all owners for a znode */
24378+ struct list_head owners_link;
24379+};
24380+
24381+typedef struct lock_request {
24382+ /* A pointer to uninitialized link object */
24383+ lock_handle *handle;
24384+ /* A pointer to the object we want to lock */
24385+ znode *node;
24386+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
24387+ znode_lock_mode mode;
24388+ /* how dispatch_lock_requests() returns lock request result code */
24389+ int ret_code;
24390+} lock_request;
24391+
24392+/* A lock stack structure for accumulating locks owned by a process */
24393+struct lock_stack {
24394+ /* A guard lock protecting a lock stack */
24395+ spinlock_t sguard;
24396+ /* number of znodes which were requested by high priority processes */
24397+ atomic_t nr_signaled;
24398+ /* Current priority of a process
24399+
24400+ This is only accessed by the current thread and thus requires no
24401+ locking.
24402+ */
24403+ int curpri;
24404+ /* A list of all locks owned by this process. Elements can be added to
24405+ * this list only by the current thread. ->node pointers in this list
24406+ * can be only changed by the current thread. */
24407+ struct list_head locks;
24408+ /* When lock_stack waits for the lock, it puts itself on double-linked
24409+ requestors list of that lock */
24410+ struct list_head requestors_link;
24411+ /* Current lock request info.
24412+
24413+ This is only accessed by the current thread and thus requires no
24414+ locking.
24415+ */
24416+ lock_request request;
24417+ /* the following two fields are the lock stack's
24418+ * synchronization object to use with the standard linux/wait.h
24419+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
24420+ * usage details. */
24421+ wait_queue_head_t wait;
24422+ atomic_t wakeup;
24423+#if REISER4_DEBUG
24424+ int nr_locks; /* number of lock handles in the above list */
24425+#endif
24426+};
24427+
24428+/*
24429+ User-visible znode locking functions
24430+*/
24431+
24432+extern int longterm_lock_znode(lock_handle * handle,
24433+ znode * node,
24434+ znode_lock_mode mode,
24435+ znode_lock_request request);
24436+
24437+extern void longterm_unlock_znode(lock_handle * handle);
24438+
24439+extern int reiser4_check_deadlock(void);
24440+
24441+extern lock_stack *get_current_lock_stack(void);
24442+
24443+extern void init_lock_stack(lock_stack * owner);
24444+extern void reiser4_init_lock(zlock * lock);
24445+
24446+static inline void init_lh(lock_handle *lh)
24447+{
24448+#if REISER4_DEBUG
24449+ memset(lh, 0, sizeof *lh);
24450+ INIT_LIST_HEAD(&lh->locks_link);
24451+ INIT_LIST_HEAD(&lh->owners_link);
24452+#else
24453+ lh->node = NULL;
24454+#endif
24455+}
24456+
24457+static inline void done_lh(lock_handle *lh)
24458+{
24459+ assert("zam-342", lh != NULL);
24460+ if (lh->node != NULL)
24461+ longterm_unlock_znode(lh);
24462+}
24463+
24464+extern void move_lh(lock_handle * new, lock_handle * old);
24465+extern void copy_lh(lock_handle * new, lock_handle * old);
24466+
24467+extern int reiser4_prepare_to_sleep(lock_stack * owner);
24468+extern void reiser4_go_to_sleep(lock_stack * owner);
24469+extern void __reiser4_wake_up(lock_stack * owner);
24470+
24471+extern int lock_stack_isclean(lock_stack * owner);
24472+
24473+/* zlock object state check macros: only used in assertions. Both forms imply that the
24474+ lock is held by the current thread. */
24475+extern int znode_is_write_locked(const znode *);
24476+extern void reiser4_invalidate_lock(lock_handle *);
24477+
24478+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24479+#define spin_ordering_pred_stack(stack) \
24480+ (LOCK_CNT_NIL(spin_locked_stack) && \
24481+ LOCK_CNT_NIL(spin_locked_txnmgr) && \
24482+ LOCK_CNT_NIL(spin_locked_inode) && \
24483+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24484+ LOCK_CNT_NIL(spin_locked_super_eflush) )
24485+
24486+static inline void spin_lock_stack(lock_stack *stack)
24487+{
24488+ assert("", spin_ordering_pred_stack(stack));
24489+ spin_lock(&(stack->sguard));
24490+ LOCK_CNT_INC(spin_locked_stack);
24491+ LOCK_CNT_INC(spin_locked);
24492+}
24493+
24494+static inline void spin_unlock_stack(lock_stack *stack)
24495+{
24496+ assert_spin_locked(&(stack->sguard));
24497+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24498+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24499+ LOCK_CNT_DEC(spin_locked_stack);
24500+ LOCK_CNT_DEC(spin_locked);
24501+ spin_unlock(&(stack->sguard));
24502+}
24503+
24504+static inline void reiser4_wake_up(lock_stack * owner)
24505+{
24506+ spin_lock_stack(owner);
24507+ __reiser4_wake_up(owner);
24508+ spin_unlock_stack(owner);
24509+}
24510+
24511+const char *lock_mode_name(znode_lock_mode lock);
24512+
24513+#if REISER4_DEBUG
24514+extern void check_lock_data(void);
24515+extern void check_lock_node_data(znode * node);
24516+#else
24517+#define check_lock_data() noop
24518+#define check_lock_node_data() noop
24519+#endif
24520+
24521+/* __LOCK_H__ */
24522+#endif
24523+
24524+/* Make Linus happy.
24525+ Local variables:
24526+ c-indentation-style: "K&R"
24527+ mode-name: "LC"
24528+ c-basic-offset: 8
24529+ tab-width: 8
24530+ fill-column: 120
24531+ End:
24532+*/
24533diff --git a/fs/reiser4/oid.c b/fs/reiser4/oid.c
24534new file mode 100644
24535index 0000000..f311d06
24536--- /dev/null
24537+++ b/fs/reiser4/oid.c
24538@@ -0,0 +1,141 @@
24539+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24540+
24541+#include "debug.h"
24542+#include "super.h"
24543+#include "txnmgr.h"
24544+
24545+/* we used to have oid allocation plugin. It was removed because it
24546+ was recognized as providing unneeded level of abstraction. If one
24547+ ever will find it useful - look at yet_unneeded_abstractions/oid
24548+*/
24549+
24550+/*
24551+ * initialize in-memory data for oid allocator at @super. @nr_files and @next
24552+ * are provided by disk format plugin that reads them from the disk during
24553+ * mount.
24554+ */
24555+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24556+{
24557+ reiser4_super_info_data *sbinfo;
24558+
24559+ sbinfo = get_super_private(super);
24560+
24561+ sbinfo->next_to_use = next;
24562+ sbinfo->oids_in_use = nr_files;
24563+ return 0;
24564+}
24565+
24566+/*
24567+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24568+ * runs out of oids.
24569+ */
24570+oid_t oid_allocate(struct super_block * super)
24571+{
24572+ reiser4_super_info_data *sbinfo;
24573+ oid_t oid;
24574+
24575+ sbinfo = get_super_private(super);
24576+
24577+ spin_lock_reiser4_super(sbinfo);
24578+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24579+ oid = sbinfo->next_to_use++;
24580+ sbinfo->oids_in_use++;
24581+ } else
24582+ oid = ABSOLUTE_MAX_OID;
24583+ spin_unlock_reiser4_super(sbinfo);
24584+ return oid;
24585+}
24586+
24587+/*
24588+ * Tell oid allocator that @oid is now free.
24589+ */
24590+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24591+{
24592+ reiser4_super_info_data *sbinfo;
24593+
24594+ sbinfo = get_super_private(super);
24595+
24596+ spin_lock_reiser4_super(sbinfo);
24597+ sbinfo->oids_in_use--;
24598+ spin_unlock_reiser4_super(sbinfo);
24599+ return 0;
24600+}
24601+
24602+/*
24603+ * return next @oid that would be allocated (i.e., returned by oid_allocate())
24604+ * without actually allocating it. This is used by disk format plugin to save
24605+ * oid allocator state on the disk.
24606+ */
24607+oid_t oid_next(const struct super_block * super)
24608+{
24609+ reiser4_super_info_data *sbinfo;
24610+ oid_t oid;
24611+
24612+ sbinfo = get_super_private(super);
24613+
24614+ spin_lock_reiser4_super(sbinfo);
24615+ oid = sbinfo->next_to_use;
24616+ spin_unlock_reiser4_super(sbinfo);
24617+ return oid;
24618+}
24619+
24620+/*
24621+ * returns number of currently used oids. This is used by statfs(2) to report
24622+ * number of "inodes" and by disk format plugin to save oid allocator state on
24623+ * the disk.
24624+ */
24625+long oids_used(const struct super_block *super)
24626+{
24627+ reiser4_super_info_data *sbinfo;
24628+ oid_t used;
24629+
24630+ sbinfo = get_super_private(super);
24631+
24632+ spin_lock_reiser4_super(sbinfo);
24633+ used = sbinfo->oids_in_use;
24634+ spin_unlock_reiser4_super(sbinfo);
24635+ if (used < (__u64) ((long)~0) >> 1)
24636+ return (long)used;
24637+ else
24638+ return (long)-1;
24639+}
24640+
24641+/*
24642+ * Count oid as allocated in atom. This is done after call to oid_allocate()
24643+ * at the point when we are irrevocably committed to creation of the new file
24644+ * (i.e., when oid allocation cannot be any longer rolled back due to some
24645+ * error).
24646+ */
24647+void oid_count_allocated(void)
24648+{
24649+ txn_atom *atom;
24650+
24651+ atom = get_current_atom_locked();
24652+ atom->nr_objects_created++;
24653+ spin_unlock_atom(atom);
24654+}
24655+
24656+/*
24657+ * Count oid as free in atom. This is done after call to oid_release() at the
24658+ * point when we are irrevocably committed to the deletion of the file (i.e.,
24659+ * when oid release cannot be any longer rolled back due to some error).
24660+ */
24661+void oid_count_released(void)
24662+{
24663+ txn_atom *atom;
24664+
24665+ atom = get_current_atom_locked();
24666+ atom->nr_objects_deleted++;
24667+ spin_unlock_atom(atom);
24668+}
24669+
24670+/*
24671+ Local variables:
24672+ c-indentation-style: "K&R"
24673+ mode-name: "LC"
24674+ c-basic-offset: 8
24675+ tab-width: 8
24676+ fill-column: 120
24677+ scroll-step: 1
24678+ End:
24679+*/
24680diff --git a/fs/reiser4/page_cache.c b/fs/reiser4/page_cache.c
24681new file mode 100644
24682index 0000000..e1f436d
24683--- /dev/null
24684+++ b/fs/reiser4/page_cache.c
24685@@ -0,0 +1,736 @@
24686+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24687+ * reiser4/README */
24688+
24689+/* Memory pressure hooks. Fake inodes handling. */
24690+
24691+/* GLOSSARY
24692+
24693+ . Formatted and unformatted nodes.
24694+ Elements of reiser4 balanced tree to store data and metadata.
24695+ Unformatted nodes are pointed to by extent pointers. Such nodes
24696+ are used to store data of large objects. Unlike unformatted nodes,
24697+ formatted ones have associated format described by node4X plugin.
24698+
24699+ . Jnode (or journal node)
24700+ The in-memory header which is used to track formatted and unformatted
24701+ nodes, bitmap nodes, etc. In particular, jnodes are used to track
24702+ transactional information associated with each block(see reiser4/jnode.c
24703+ for details).
24704+
24705+ . Znode
24706+ The in-memory header which is used to track formatted nodes. Contains
24707+ embedded jnode (see reiser4/znode.c for details).
24708+*/
24709+
24710+/* We store all file system meta data (and data, of course) in the page cache.
24711+
24712+ What does this mean? In stead of using bread/brelse we create special
24713+ "fake" inode (one per super block) and store content of formatted nodes
24714+ into pages bound to this inode in the page cache. In newer kernels bread()
24715+ already uses inode attached to block device (bd_inode). Advantage of having
24716+ our own fake inode is that we can install appropriate methods in its
24717+ address_space operations. Such methods are called by VM on memory pressure
24718+ (or during background page flushing) and we can use them to react
24719+ appropriately.
24720+
24721+ In initial version we only support one block per page. Support for multiple
24722+ blocks per page is complicated by relocation.
24723+
24724+ To each page, used by reiser4, jnode is attached. jnode is analogous to
24725+ buffer head. Difference is that jnode is bound to the page permanently:
24726+ jnode cannot be removed from memory until its backing page is.
24727+
24728+ jnode contain pointer to page (->pg field) and page contain pointer to
24729+ jnode in ->private field. Pointer from jnode to page is protected to by
24730+ jnode's spinlock and pointer from page to jnode is protected by page lock
24731+ (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
24732+ lock. To go into reverse direction use jnode_lock_page() function that uses
24733+ standard try-lock-and-release device.
24734+
24735+ Properties:
24736+
24737+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24738+ reference counter is increased.
24739+
24740+ 2. when jnode-to-page mapping is destroyed (by page_clear_jnode(), page
24741+ reference counter is decreased.
24742+
24743+ 3. on jload() reference counter on jnode page is increased, page is
24744+ kmapped and `referenced'.
24745+
24746+ 4. on jrelse() inverse operations are performed.
24747+
24748+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24749+
24750+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24751+ historically.]
24752+
24753+ [In the following discussion, `lock' invariably means long term lock on
24754+ znode.] (What about page locks?)
24755+
24756+ There is some special class of deadlock possibilities related to memory
24757+ pressure. Locks acquired by other reiser4 threads are accounted for in
24758+ deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24759+ invoked additional hidden arc is added to the locking graph: thread that
24760+ tries to allocate memory waits for ->vm_writeback() to finish. If this
24761+ thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24762+ prevention is useless.
24763+
24764+ Another related problem is possibility for ->vm_writeback() to run out of
24765+ memory itself. This is not a problem for ext2 and friends, because their
24766+ ->vm_writeback() don't allocate much memory, but reiser4 flush is
24767+ definitely able to allocate huge amounts of memory.
24768+
24769+ It seems that there is no reliable way to cope with the problems above. In
24770+ stead it was decided that ->vm_writeback() (as invoked in the kswapd
24771+ context) wouldn't perform any flushing itself, but rather should just wake
24772+ up some auxiliary thread dedicated for this purpose (or, the same thread
24773+ that does periodic commit of old atoms (ktxnmgrd.c)).
24774+
24775+ Details:
24776+
24777+ 1. Page is called `reclaimable' against particular reiser4 mount F if this
24778+ page can be ultimately released by try_to_free_pages() under presumptions
24779+ that:
24780+
24781+ a. ->vm_writeback() for F is no-op, and
24782+
24783+ b. none of the threads accessing F are making any progress, and
24784+
24785+ c. other reiser4 mounts obey the same memory reservation protocol as F
24786+ (described below).
24787+
24788+ For example, clean un-pinned page, or page occupied by ext2 data are
24789+ reclaimable against any reiser4 mount.
24790+
24791+ When there is more than one reiser4 mount in a system, condition (c) makes
24792+ reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24793+
24794+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24795+
24796+ Fake inode is used to bound formatted nodes and each node is indexed within
24797+ fake inode by its block number. If block size of smaller than page size, it
24798+ may so happen that block mapped to the page with formatted node is occupied
24799+ by unformatted node or is unallocated. This lead to some complications,
24800+ because flushing whole page can lead to an incorrect overwrite of
24801+ unformatted node that is moreover, can be cached in some other place as
24802+ part of the file body. To avoid this, buffers for unformatted nodes are
24803+ never marked dirty. Also pages in the fake are never marked dirty. This
24804+ rules out usage of ->writepage() as memory pressure hook. In stead
24805+ ->releasepage() is used.
24806+
24807+ Josh is concerned that page->buffer is going to die. This should not pose
24808+ significant problem though, because we need to add some data structures to
24809+ the page anyway (jnode) and all necessary book keeping can be put there.
24810+
24811+*/
24812+
24813+/* Life cycle of pages/nodes.
24814+
24815+ jnode contains reference to page and page contains reference back to
24816+ jnode. This reference is counted in page ->count. Thus, page bound to jnode
24817+ cannot be released back into free pool.
24818+
24819+ 1. Formatted nodes.
24820+
24821+ 1. formatted node is represented by znode. When new znode is created its
24822+ ->pg pointer is NULL initially.
24823+
24824+ 2. when node content is loaded into znode (by call to zload()) for the
24825+ first time following happens (in call to ->read_node() or
24826+ ->allocate_node()):
24827+
24828+ 1. new page is added to the page cache.
24829+
24830+ 2. this page is attached to znode and its ->count is increased.
24831+
24832+ 3. page is kmapped.
24833+
24834+ 3. if more calls to zload() follow (without corresponding zrelses), page
24835+ counter is left intact and in its stead ->d_count is increased in znode.
24836+
24837+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24838+ ->release_node() is called and page is kunmapped as result.
24839+
24840+ 5. at some moment node can be captured by a transaction. Its ->x_count
24841+ is then increased by transaction manager.
24842+
24843+ 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24844+ bit set) following will happen (also see comment at the top of znode.c):
24845+
24846+ 1. when last lock is released, node will be uncaptured from
24847+ transaction. This released reference that transaction manager acquired
24848+ at the step 5.
24849+
24850+ 2. when last reference is released, zput() detects that node is
24851+ actually deleted and calls ->delete_node()
24852+ operation. page_cache_delete_node() implementation detaches jnode from
24853+ page and releases page.
24854+
24855+ 7. otherwise (node wasn't removed from the tree), last reference to
24856+ znode will be released after transaction manager committed transaction
24857+ node was in. This implies squallocing of this node (see
24858+ flush.c). Nothing special happens at this point. Znode is still in the
24859+ hash table and page is still attached to it.
24860+
24861+ 8. znode is actually removed from the memory because of the memory
24862+ pressure, or during umount (znodes_tree_done()). Anyway, znode is
24863+ removed by the call to zdrop(). At this moment, page is detached from
24864+ znode and removed from the inode address space.
24865+
24866+*/
24867+
24868+#include "debug.h"
24869+#include "dformat.h"
24870+#include "key.h"
24871+#include "txnmgr.h"
24872+#include "jnode.h"
24873+#include "znode.h"
24874+#include "block_alloc.h"
24875+#include "tree.h"
24876+#include "vfs_ops.h"
24877+#include "inode.h"
24878+#include "super.h"
24879+#include "entd.h"
24880+#include "page_cache.h"
24881+#include "ktxnmgrd.h"
24882+
24883+#include <linux/types.h>
24884+#include <linux/fs.h>
24885+#include <linux/mm.h> /* for struct page */
24886+#include <linux/swap.h> /* for struct page */
24887+#include <linux/pagemap.h>
24888+#include <linux/bio.h>
24889+#include <linux/writeback.h>
24890+#include <linux/blkdev.h>
24891+
24892+static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24893+
24894+static struct address_space_operations formatted_fake_as_ops;
24895+
24896+static const oid_t fake_ino = 0x1;
24897+static const oid_t bitmap_ino = 0x2;
24898+static const oid_t cc_ino = 0x3;
24899+
24900+static void
24901+init_fake_inode(struct super_block *super, struct inode *fake,
24902+ struct inode **pfake)
24903+{
24904+ assert("nikita-2168", fake->i_state & I_NEW);
24905+ fake->i_mapping->a_ops = &formatted_fake_as_ops;
24906+ *pfake = fake;
24907+ /* NOTE-NIKITA something else? */
24908+ unlock_new_inode(fake);
24909+}
24910+
24911+/**
24912+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24913+ * @super: super block to init fake inode for
24914+ *
24915+ * Initializes fake inode to which formatted nodes are bound in the page cache
24916+ * and inode for bitmaps.
24917+ */
24918+int reiser4_init_formatted_fake(struct super_block *super)
24919+{
24920+ struct inode *fake;
24921+ struct inode *bitmap;
24922+ struct inode *cc;
24923+ reiser4_super_info_data *sinfo;
24924+
24925+ assert("nikita-1703", super != NULL);
24926+
24927+ sinfo = get_super_private_nocheck(super);
24928+ fake = iget_locked(super, oid_to_ino(fake_ino));
24929+
24930+ if (fake != NULL) {
24931+ init_fake_inode(super, fake, &sinfo->fake);
24932+
24933+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24934+ if (bitmap != NULL) {
24935+ init_fake_inode(super, bitmap, &sinfo->bitmap);
24936+
24937+ cc = iget_locked(super, oid_to_ino(cc_ino));
24938+ if (cc != NULL) {
24939+ init_fake_inode(super, cc, &sinfo->cc);
24940+ return 0;
24941+ } else {
24942+ iput(sinfo->fake);
24943+ iput(sinfo->bitmap);
24944+ sinfo->fake = NULL;
24945+ sinfo->bitmap = NULL;
24946+ }
24947+ } else {
24948+ iput(sinfo->fake);
24949+ sinfo->fake = NULL;
24950+ }
24951+ }
24952+ return RETERR(-ENOMEM);
24953+}
24954+
24955+/**
24956+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24957+ * @super: super block to init fake inode for
24958+ *
24959+ * Releases inodes which were used as address spaces of bitmap and formatted
24960+ * nodes.
24961+ */
24962+void reiser4_done_formatted_fake(struct super_block *super)
24963+{
24964+ reiser4_super_info_data *sinfo;
24965+
24966+ sinfo = get_super_private_nocheck(super);
24967+
24968+ if (sinfo->fake != NULL) {
24969+ iput(sinfo->fake);
24970+ sinfo->fake = NULL;
24971+ }
24972+
24973+ if (sinfo->bitmap != NULL) {
24974+ iput(sinfo->bitmap);
24975+ sinfo->bitmap = NULL;
24976+ }
24977+
24978+ if (sinfo->cc != NULL) {
24979+ iput(sinfo->cc);
24980+ sinfo->cc = NULL;
24981+ }
24982+ return;
24983+}
24984+
24985+void reiser4_wait_page_writeback(struct page *page)
24986+{
24987+ assert("zam-783", PageLocked(page));
24988+
24989+ do {
24990+ unlock_page(page);
24991+ wait_on_page_writeback(page);
24992+ lock_page(page);
24993+ } while (PageWriteback(page));
24994+}
24995+
24996+/* return tree @page is in */
24997+reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24998+{
24999+ assert("nikita-2461", page != NULL);
25000+ return &get_super_private(page->mapping->host->i_sb)->tree;
25001+}
25002+
25003+/* completion handler for single page bio-based read.
25004+
25005+ mpage_end_io_read() would also do. But it's static.
25006+
25007+*/
25008+static int
25009+end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
25010+ int err UNUSED_ARG)
25011+{
25012+ struct page *page;
25013+
25014+ if (bio->bi_size != 0) {
25015+ warning("nikita-3332", "Truncated single page read: %i",
25016+ bio->bi_size);
25017+ return 1;
25018+ }
25019+
25020+ page = bio->bi_io_vec[0].bv_page;
25021+
25022+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
25023+ SetPageUptodate(page);
25024+ } else {
25025+ ClearPageUptodate(page);
25026+ SetPageError(page);
25027+ }
25028+ unlock_page(page);
25029+ bio_put(bio);
25030+ return 0;
25031+}
25032+
25033+/* completion handler for single page bio-based write.
25034+
25035+ mpage_end_io_write() would also do. But it's static.
25036+
25037+*/
25038+static int
25039+end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
25040+ int err UNUSED_ARG)
25041+{
25042+ struct page *page;
25043+
25044+ if (bio->bi_size != 0) {
25045+ warning("nikita-3333", "Truncated single page write: %i",
25046+ bio->bi_size);
25047+ return 1;
25048+ }
25049+
25050+ page = bio->bi_io_vec[0].bv_page;
25051+
25052+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
25053+ SetPageError(page);
25054+ end_page_writeback(page);
25055+ bio_put(bio);
25056+ return 0;
25057+}
25058+
25059+/* ->readpage() method for formatted nodes */
25060+static int formatted_readpage(struct file *f UNUSED_ARG,
25061+ struct page *page /* page to read */ )
25062+{
25063+ assert("nikita-2412", PagePrivate(page) && jprivate(page));
25064+ return reiser4_page_io(page, jprivate(page), READ,
25065+ reiser4_ctx_gfp_mask_get());
25066+}
25067+
25068+/**
25069+ * reiser4_page_io - submit single-page bio request
25070+ * @page: page to perform io for
25071+ * @node: jnode of page
25072+ * @rw: read or write
25073+ * @gfp: gfp mask for bio allocation
25074+ *
25075+ * Submits single page read or write.
25076+ */
25077+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
25078+{
25079+ struct bio *bio;
25080+ int result;
25081+
25082+ assert("nikita-2094", page != NULL);
25083+ assert("nikita-2226", PageLocked(page));
25084+ assert("nikita-2634", node != NULL);
25085+ assert("nikita-2893", rw == READ || rw == WRITE);
25086+
25087+ if (rw) {
25088+ if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
25089+ unlock_page(page);
25090+ return 0;
25091+ }
25092+ }
25093+
25094+ bio = page_bio(page, node, rw, gfp);
25095+ if (!IS_ERR(bio)) {
25096+ if (rw == WRITE) {
25097+ set_page_writeback(page);
25098+ unlock_page(page);
25099+ }
25100+ reiser4_submit_bio(rw, bio);
25101+ result = 0;
25102+ } else {
25103+ unlock_page(page);
25104+ result = PTR_ERR(bio);
25105+ }
25106+
25107+ return result;
25108+}
25109+
25110+/* helper function to construct bio for page */
25111+static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
25112+{
25113+ struct bio *bio;
25114+ assert("nikita-2092", page != NULL);
25115+ assert("nikita-2633", node != NULL);
25116+
25117+ /* Simple implementation in the assumption that blocksize == pagesize.
25118+
25119+ We only have to submit one block, but submit_bh() will allocate bio
25120+ anyway, so lets use all the bells-and-whistles of bio code.
25121+ */
25122+
25123+ bio = bio_alloc(gfp, 1);
25124+ if (bio != NULL) {
25125+ int blksz;
25126+ struct super_block *super;
25127+ reiser4_block_nr blocknr;
25128+
25129+ super = page->mapping->host->i_sb;
25130+ assert("nikita-2029", super != NULL);
25131+ blksz = super->s_blocksize;
25132+ assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
25133+
25134+ spin_lock_jnode(node);
25135+ blocknr = *jnode_get_io_block(node);
25136+ spin_unlock_jnode(node);
25137+
25138+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
25139+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
25140+
25141+ bio->bi_bdev = super->s_bdev;
25142+ /* fill bio->bi_sector before calling bio_add_page(), because
25143+ * q->merge_bvec_fn may want to inspect it (see
25144+ * drivers/md/linear.c:linear_mergeable_bvec() for example. */
25145+ bio->bi_sector = blocknr * (blksz >> 9);
25146+
25147+ if (!bio_add_page(bio, page, blksz, 0)) {
25148+ warning("nikita-3452",
25149+ "Single page bio cannot be constructed");
25150+ return ERR_PTR(RETERR(-EINVAL));
25151+ }
25152+
25153+ /* bio -> bi_idx is filled by bio_init() */
25154+ bio->bi_end_io = (rw == READ) ?
25155+ end_bio_single_page_read : end_bio_single_page_write;
25156+
25157+ return bio;
25158+ } else
25159+ return ERR_PTR(RETERR(-ENOMEM));
25160+}
25161+
25162+/* this function is internally called by jnode_make_dirty() */
25163+int reiser4_set_page_dirty_internal(struct page *page)
25164+{
25165+ struct address_space *mapping;
25166+
25167+ mapping = page->mapping;
25168+ BUG_ON(mapping == NULL);
25169+
25170+ if (!TestSetPageDirty(page)) {
25171+ if (mapping_cap_account_dirty(mapping))
25172+ inc_zone_page_state(page, NR_FILE_DIRTY);
25173+
25174+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
25175+ }
25176+
25177+ /* znode must be dirty ? */
25178+ if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
25179+ assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
25180+ return 0;
25181+}
25182+
25183+#if REISER4_DEBUG
25184+
25185+/**
25186+ * can_hit_entd
25187+ *
25188+ * This is used on
25189+ */
25190+static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
25191+{
25192+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
25193+ return 1;
25194+ if (ctx->super != s)
25195+ return 1;
25196+ if (get_super_private(s)->entd.tsk == current)
25197+ return 0;
25198+ if (!lock_stack_isclean(&ctx->stack))
25199+ return 0;
25200+ if (ctx->trans->atom != NULL)
25201+ return 0;
25202+ return 1;
25203+}
25204+
25205+#endif
25206+
25207+/**
25208+ * reiser4_writepage - writepage of struct address_space_operations
25209+ * @page: page to write
25210+ * @wbc:
25211+ *
25212+ *
25213+ */
25214+/* Common memory pressure notification. */
25215+int reiser4_writepage(struct page *page,
25216+ struct writeback_control *wbc)
25217+{
25218+ struct super_block *s;
25219+ reiser4_context *ctx;
25220+
25221+ assert("vs-828", PageLocked(page));
25222+
25223+ s = page->mapping->host->i_sb;
25224+ ctx = get_current_context_check();
25225+
25226+ assert("", can_hit_entd(ctx, s));
25227+
25228+ return write_page_by_ent(page, wbc);
25229+}
25230+
25231+/* ->set_page_dirty() method of formatted address_space */
25232+static int formatted_set_page_dirty(struct page *page)
25233+{
25234+ assert("nikita-2173", page != NULL);
25235+ BUG();
25236+ return __set_page_dirty_nobuffers(page);
25237+}
25238+
25239+/* writepages method of address space operations in reiser4 is used to involve
25240+ into transactions pages which are dirtied via mmap. Only regular files can
25241+ have such pages. Fake inode is used to access formatted nodes via page
25242+ cache. As formatted nodes can never be mmaped, fake inode's writepages has
25243+ nothing to do */
25244+static int
25245+writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
25246+{
25247+ return 0;
25248+}
25249+
25250+/* address space operations for the fake inode */
25251+static struct address_space_operations formatted_fake_as_ops = {
25252+ /* Perform a writeback of a single page as a memory-freeing
25253+ * operation. */
25254+ .writepage = reiser4_writepage,
25255+ /* this is called to read formatted node */
25256+ .readpage = formatted_readpage,
25257+ /* ->sync_page() method of fake inode address space operations. Called
25258+ from wait_on_page() and lock_page().
25259+
25260+ This is most annoyingly misnomered method. Actually it is called
25261+ from wait_on_page_bit() and lock_page() and its purpose is to
25262+ actually start io by jabbing device drivers.
25263+ */
25264+ .sync_page = block_sync_page,
25265+ /* Write back some dirty pages from this mapping. Called from sync.
25266+ called during sync (pdflush) */
25267+ .writepages = writepages_fake,
25268+ /* Set a page dirty */
25269+ .set_page_dirty = formatted_set_page_dirty,
25270+ /* used for read-ahead. Not applicable */
25271+ .readpages = NULL,
25272+ .prepare_write = NULL,
25273+ .commit_write = NULL,
25274+ .bmap = NULL,
25275+ /* called just before page is being detached from inode mapping and
25276+ removed from memory. Called on truncate, cut/squeeze, and
25277+ umount. */
25278+ .invalidatepage = reiser4_invalidatepage,
25279+ /* this is called by shrink_cache() so that file system can try to
25280+ release objects (jnodes, buffers, journal heads) attached to page
25281+ and, may be made page itself free-able.
25282+ */
25283+ .releasepage = reiser4_releasepage,
25284+ .direct_IO = NULL
25285+};
25286+
25287+/* called just before page is released (no longer used by reiser4). Callers:
25288+ jdelete() and extent2tail(). */
25289+void reiser4_drop_page(struct page *page)
25290+{
25291+ assert("nikita-2181", PageLocked(page));
25292+ clear_page_dirty_for_io(page);
25293+ ClearPageUptodate(page);
25294+#if defined(PG_skipped)
25295+ ClearPageSkipped(page);
25296+#endif
25297+ unlock_page(page);
25298+}
25299+
25300+#define JNODE_GANG_SIZE (16)
25301+
25302+/* find all jnodes from range specified and invalidate them */
25303+static int
25304+truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
25305+{
25306+ reiser4_inode *info;
25307+ int truncated_jnodes;
25308+ reiser4_tree *tree;
25309+ unsigned long index;
25310+ unsigned long end;
25311+
25312+ if (inode_file_plugin(inode) ==
25313+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
25314+ /* No need to get rid of jnodes here: if the single jnode of
25315+ page cluster did not have page, then it was found and killed
25316+ before in
25317+ truncate_page_cluster_cryptcompress()->jput()->jput_final(),
25318+ otherwise it will be dropped by reiser4_invalidatepage() */
25319+ return 0;
25320+ truncated_jnodes = 0;
25321+
25322+ info = reiser4_inode_data(inode);
25323+ tree = reiser4_tree_by_inode(inode);
25324+
25325+ index = from;
25326+ end = from + count;
25327+
25328+ while (1) {
25329+ jnode *gang[JNODE_GANG_SIZE];
25330+ int taken;
25331+ int i;
25332+ jnode *node;
25333+
25334+ assert("nikita-3466", index <= end);
25335+
25336+ read_lock_tree(tree);
25337+ taken =
25338+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25339+ (void **)gang, index,
25340+ JNODE_GANG_SIZE);
25341+ for (i = 0; i < taken; ++i) {
25342+ node = gang[i];
25343+ if (index_jnode(node) < end)
25344+ jref(node);
25345+ else
25346+ gang[i] = NULL;
25347+ }
25348+ read_unlock_tree(tree);
25349+
25350+ for (i = 0; i < taken; ++i) {
25351+ node = gang[i];
25352+ if (node != NULL) {
25353+ index = max(index, index_jnode(node));
25354+ spin_lock_jnode(node);
25355+ assert("edward-1457", node->pg == NULL);
25356+ /* this is always called after
25357+ truncate_inode_pages_range(). Therefore, here
25358+ jnode can not have page. New pages can not be
25359+ created because truncate_jnodes_range goes
25360+ under exclusive access on file obtained,
25361+ where as new page creation requires
25362+ non-exclusive access obtained */
25363+ JF_SET(node, JNODE_HEARD_BANSHEE);
25364+ reiser4_uncapture_jnode(node);
25365+ unhash_unformatted_jnode(node);
25366+ truncated_jnodes++;
25367+ jput(node);
25368+ } else
25369+ break;
25370+ }
25371+ if (i != taken || taken == 0)
25372+ break;
25373+ }
25374+ return truncated_jnodes;
25375+}
25376+
25377+/* Truncating files in reiser4: problems and solutions.
25378+
25379+ VFS calls fs's truncate after it has called truncate_inode_pages()
25380+ to get rid of pages corresponding to part of file being truncated.
25381+ In reiser4 it may cause existence of unallocated extents which do
25382+ not have jnodes. Flush code does not expect that. Solution of this
25383+ problem is straightforward. As vfs's truncate is implemented using
25384+ setattr operation, it seems reasonable to have ->setattr() that
25385+ will cut file body. However, flush code also does not expect dirty
25386+ pages without parent items, so it is impossible to cut all items,
25387+ then truncate all pages in two steps. We resolve this problem by
25388+ cutting items one-by-one. Each such fine-grained step performed
25389+ under longterm znode lock calls at the end ->kill_hook() method of
25390+ a killed item to remove its binded pages and jnodes.
25391+
25392+ The following function is a common part of mentioned kill hooks.
25393+ Also, this is called before tail-to-extent conversion (to not manage
25394+ few copies of the data).
25395+*/
25396+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25397+ unsigned long count, int even_cows)
25398+{
25399+ loff_t from_bytes, count_bytes;
25400+
25401+ if (count == 0)
25402+ return;
25403+ from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25404+ count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25405+
25406+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25407+ truncate_inode_pages_range(mapping, from_bytes,
25408+ from_bytes + count_bytes - 1);
25409+ truncate_jnodes_range(mapping->host, from, count);
25410+}
25411+
25412+/*
25413+ * Local variables:
25414+ * c-indentation-style: "K&R"
25415+ * mode-name: "LC"
25416+ * c-basic-offset: 8
25417+ * tab-width: 8
25418+ * fill-column: 120
25419+ * scroll-step: 1
25420+ * End:
25421+ */
25422diff --git a/fs/reiser4/page_cache.h b/fs/reiser4/page_cache.h
25423new file mode 100644
25424index 0000000..ab74f8f
25425--- /dev/null
25426+++ b/fs/reiser4/page_cache.h
25427@@ -0,0 +1,68 @@
25428+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25429+ * reiser4/README */
25430+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25431+
25432+#if !defined( __REISER4_PAGE_CACHE_H__ )
25433+#define __REISER4_PAGE_CACHE_H__
25434+
25435+#include "forward.h"
25436+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25437+
25438+#include <linux/fs.h> /* for struct super_block, address_space */
25439+#include <linux/mm.h> /* for struct page */
25440+#include <linux/pagemap.h> /* for lock_page() */
25441+#include <linux/vmalloc.h> /* for __vmalloc() */
25442+
25443+extern int reiser4_init_formatted_fake(struct super_block *);
25444+extern void reiser4_done_formatted_fake(struct super_block *);
25445+
25446+extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25447+
25448+extern int reiser4_set_page_dirty_internal(struct page *);
25449+
25450+#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25451+
25452+extern void reiser4_wait_page_writeback(struct page *);
25453+static inline void lock_and_wait_page_writeback(struct page *page)
25454+{
25455+ lock_page(page);
25456+ if (unlikely(PageWriteback(page)))
25457+ reiser4_wait_page_writeback(page);
25458+}
25459+
25460+#define jprivate(page) ((jnode *)page_private(page))
25461+
25462+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25463+extern void reiser4_drop_page(struct page *);
25464+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25465+ unsigned long count, int even_cows);
25466+extern void capture_reiser4_inodes(struct super_block *,
25467+ struct writeback_control *);
25468+static inline void * reiser4_vmalloc (unsigned long size)
25469+{
25470+ return __vmalloc(size,
25471+ reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25472+ PAGE_KERNEL);
25473+}
25474+
25475+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25476+
25477+#if REISER4_DEBUG
25478+extern void print_page(const char *prefix, struct page *page);
25479+#else
25480+#define print_page(prf, p) noop
25481+#endif
25482+
25483+/* __REISER4_PAGE_CACHE_H__ */
25484+#endif
25485+
25486+/* Make Linus happy.
25487+ Local variables:
25488+ c-indentation-style: "K&R"
25489+ mode-name: "LC"
25490+ c-basic-offset: 8
25491+ tab-width: 8
25492+ fill-column: 120
25493+ scroll-step: 1
25494+ End:
25495+*/
25496diff --git a/fs/reiser4/plugin/Makefile b/fs/reiser4/plugin/Makefile
25497new file mode 100644
25498index 0000000..4b2c9f8
25499--- /dev/null
25500+++ b/fs/reiser4/plugin/Makefile
25501@@ -0,0 +1,26 @@
25502+obj-$(CONFIG_REISER4_FS) += plugins.o
25503+
25504+plugins-objs := \
25505+ plugin.o \
25506+ plugin_set.o \
25507+ object.o \
25508+ inode_ops.o \
25509+ inode_ops_rename.o \
25510+ file_ops.o \
25511+ file_ops_readdir.o \
25512+ file_plugin_common.o \
25513+ dir_plugin_common.o \
25514+ digest.o \
25515+ hash.o \
25516+ fibration.o \
25517+ tail_policy.o \
25518+ regular.o
25519+
25520+obj-$(CONFIG_REISER4_FS) += item/
25521+obj-$(CONFIG_REISER4_FS) += file/
25522+obj-$(CONFIG_REISER4_FS) += dir/
25523+obj-$(CONFIG_REISER4_FS) += node/
25524+obj-$(CONFIG_REISER4_FS) += compress/
25525+obj-$(CONFIG_REISER4_FS) += space/
25526+obj-$(CONFIG_REISER4_FS) += disk_format/
25527+obj-$(CONFIG_REISER4_FS) += security/
25528diff --git a/fs/reiser4/plugin/cluster.c b/fs/reiser4/plugin/cluster.c
25529new file mode 100644
25530index 0000000..b400d5f
25531--- /dev/null
25532+++ b/fs/reiser4/plugin/cluster.c
25533@@ -0,0 +1,71 @@
25534+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25535+
25536+/* Contains reiser4 cluster plugins (see
25537+ http://www.namesys.com/cryptcompress_design.html
25538+ "Concepts of clustering" for details). */
25539+
25540+#include "plugin_header.h"
25541+#include "plugin.h"
25542+#include "../inode.h"
25543+
25544+static int change_cluster(struct inode *inode,
25545+ reiser4_plugin * plugin,
25546+ pset_member memb)
25547+{
25548+ assert("edward-1324", inode != NULL);
25549+ assert("edward-1325", plugin != NULL);
25550+ assert("edward-1326", is_reiser4_inode(inode));
25551+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25552+
25553+ /* Can't change the cluster plugin for already existent regular files. */
25554+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25555+ return RETERR(-EINVAL);
25556+
25557+ /* If matches, nothing to change. */
25558+ if (inode_hash_plugin(inode) != NULL &&
25559+ inode_hash_plugin(inode)->h.id == plugin->h.id)
25560+ return 0;
25561+
25562+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25563+ PSET_CLUSTER, plugin);
25564+}
25565+
25566+static reiser4_plugin_ops cluster_plugin_ops = {
25567+ .init = NULL,
25568+ .load = NULL,
25569+ .save_len = NULL,
25570+ .save = NULL,
25571+ .change = &change_cluster
25572+};
25573+
25574+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25575+ [CLUSTER_ ## ID ## _ID] = { \
25576+ .h = { \
25577+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25578+ .id = CLUSTER_ ## ID ## _ID, \
25579+ .pops = &cluster_plugin_ops, \
25580+ .label = LABEL, \
25581+ .desc = DESC, \
25582+ .linkage = {NULL, NULL} \
25583+ }, \
25584+ .shift = SHIFT \
25585+ }
25586+
25587+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25588+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25589+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25590+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25591+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25592+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25593+};
25594+
25595+/*
25596+ Local variables:
25597+ c-indentation-style: "K&R"
25598+ mode-name: "LC"
25599+ c-basic-offset: 8
25600+ tab-width: 8
25601+ fill-column: 120
25602+ scroll-step: 1
25603+ End:
25604+*/
25605diff --git a/fs/reiser4/plugin/cluster.h b/fs/reiser4/plugin/cluster.h
25606new file mode 100644
25607index 0000000..019f156
25608--- /dev/null
25609+++ b/fs/reiser4/plugin/cluster.h
25610@@ -0,0 +1,343 @@
25611+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25612+
25613+/* This file contains page/cluster index translators and offset modulators
25614+ See http://www.namesys.com/cryptcompress_design.html for details */
25615+
25616+#if !defined( __FS_REISER4_CLUSTER_H__ )
25617+#define __FS_REISER4_CLUSTER_H__
25618+
25619+#include "../inode.h"
25620+
25621+static inline int inode_cluster_shift(struct inode *inode)
25622+{
25623+ assert("edward-92", inode != NULL);
25624+ assert("edward-93", reiser4_inode_data(inode) != NULL);
25625+
25626+ return inode_cluster_plugin(inode)->shift;
25627+}
25628+
25629+static inline unsigned cluster_nrpages_shift(struct inode *inode)
25630+{
25631+ return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25632+}
25633+
25634+/* cluster size in page units */
25635+static inline unsigned cluster_nrpages(struct inode *inode)
25636+{
25637+ return 1U << cluster_nrpages_shift(inode);
25638+}
25639+
25640+static inline size_t inode_cluster_size(struct inode *inode)
25641+{
25642+ assert("edward-96", inode != NULL);
25643+
25644+ return 1U << inode_cluster_shift(inode);
25645+}
25646+
25647+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25648+{
25649+ return idx >> cluster_nrpages_shift(inode);
25650+}
25651+
25652+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25653+{
25654+ return idx << cluster_nrpages_shift(inode);
25655+}
25656+
25657+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25658+{
25659+ return clust_to_pg(pg_to_clust(idx, inode), inode);
25660+}
25661+
25662+static inline pgoff_t off_to_pg(loff_t off)
25663+{
25664+ return (off >> PAGE_CACHE_SHIFT);
25665+}
25666+
25667+static inline loff_t pg_to_off(pgoff_t idx)
25668+{
25669+ return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25670+}
25671+
25672+static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25673+{
25674+ return off >> inode_cluster_shift(inode);
25675+}
25676+
25677+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25678+{
25679+ return (loff_t) idx << inode_cluster_shift(inode);
25680+}
25681+
25682+static inline unsigned long count_to_nr(loff_t count, unsigned shift)
25683+{
25684+ return (count + (1UL << shift) - 1) >> shift;
25685+}
25686+
25687+/* number of pages occupied by @count bytes */
25688+static inline pgoff_t count_to_nrpages(loff_t count)
25689+{
25690+ return count_to_nr(count, PAGE_CACHE_SHIFT);
25691+}
25692+
25693+/* number of clusters occupied by @count bytes */
25694+static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode)
25695+{
25696+ return count_to_nr(count, inode_cluster_shift(inode));
25697+}
25698+
25699+/* number of clusters occupied by @count pages */
25700+static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode)
25701+{
25702+ return count_to_nr(count, cluster_nrpages_shift(inode));
25703+}
25704+
25705+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25706+{
25707+ return clust_to_off(off_to_clust(off, inode), inode);
25708+}
25709+
25710+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25711+{
25712+ return clust_to_pg(off_to_clust(off, inode), inode);
25713+}
25714+
25715+static inline unsigned off_to_pgoff(loff_t off)
25716+{
25717+ return off & (PAGE_CACHE_SIZE - 1);
25718+}
25719+
25720+static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25721+{
25722+ return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25723+}
25724+
25725+static inline unsigned
25726+pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25727+{
25728+ return off_to_cloff(pg_to_off(idx), inode);
25729+}
25730+
25731+/* if @size != 0, returns index of the page
25732+ which contains the last byte of the file */
25733+static inline pgoff_t size_to_pg(loff_t size)
25734+{
25735+ return (size ? off_to_pg(size - 1) : 0);
25736+}
25737+
25738+/* minimal index of the page which doesn't contain
25739+ file data */
25740+static inline pgoff_t size_to_next_pg(loff_t size)
25741+{
25742+ return (size ? off_to_pg(size - 1) + 1 : 0);
25743+}
25744+
25745+/* how many bytes of file of size @cnt can be contained
25746+ in page of index @idx */
25747+static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx)
25748+{
25749+ if (idx > off_to_pg(cnt))
25750+ return 0;
25751+ if (idx < off_to_pg(cnt))
25752+ return PAGE_CACHE_SIZE;
25753+ return off_to_pgoff(cnt);
25754+}
25755+
25756+/* how many bytes of file of size @cnt can be contained
25757+ in logical cluster of index @idx */
25758+static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx,
25759+ struct inode *inode)
25760+{
25761+ if (idx > off_to_clust(cnt, inode))
25762+ return 0;
25763+ if (idx < off_to_clust(cnt, inode))
25764+ return inode_cluster_size(inode);
25765+ return off_to_cloff(cnt, inode);
25766+}
25767+
25768+static inline unsigned
25769+fsize_to_count(reiser4_cluster_t * clust, struct inode *inode)
25770+{
25771+ assert("edward-288", clust != NULL);
25772+ assert("edward-289", inode != NULL);
25773+
25774+ return cnt_to_clcnt(inode->i_size, clust->index, inode);
25775+}
25776+
25777+static inline int
25778+cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode)
25779+{
25780+ return clust->tc.lsize == inode_cluster_size(inode);
25781+}
25782+
25783+static inline void reiser4_slide_init(reiser4_slide_t * win)
25784+{
25785+ assert("edward-1084", win != NULL);
25786+ memset(win, 0, sizeof *win);
25787+}
25788+
25789+static inline tfm_action
25790+cluster_get_tfm_act(tfm_cluster_t * tc)
25791+{
25792+ assert("edward-1356", tc != NULL);
25793+ return tc->act;
25794+}
25795+
25796+static inline void
25797+cluster_set_tfm_act(tfm_cluster_t * tc, tfm_action act)
25798+{
25799+ assert("edward-1356", tc != NULL);
25800+ tc->act = act;
25801+}
25802+
25803+static inline void
25804+cluster_init_act (reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window){
25805+ assert("edward-84", clust != NULL);
25806+ memset(clust, 0, sizeof *clust);
25807+ cluster_set_tfm_act(&clust->tc, act);
25808+ clust->dstat = INVAL_DISK_CLUSTER;
25809+ clust->win = window;
25810+}
25811+
25812+static inline void
25813+cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window)
25814+{
25815+ cluster_init_act (clust, TFMA_READ, window);
25816+}
25817+
25818+static inline void
25819+cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window)
25820+{
25821+ cluster_init_act (clust, TFMA_WRITE, window);
25822+}
25823+
25824+static inline int dclust_get_extension_dsize(hint_t * hint)
25825+{
25826+ return hint->ext_coord.extension.ctail.dsize;
25827+}
25828+
25829+static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25830+{
25831+ hint->ext_coord.extension.ctail.dsize = dsize;
25832+}
25833+
25834+static inline int dclust_get_extension_shift(hint_t * hint)
25835+{
25836+ return hint->ext_coord.extension.ctail.shift;
25837+}
25838+
25839+static inline int dclust_get_extension_ncount(hint_t * hint)
25840+{
25841+ return hint->ext_coord.extension.ctail.ncount;
25842+}
25843+
25844+static inline void dclust_inc_extension_ncount(hint_t * hint)
25845+{
25846+ hint->ext_coord.extension.ctail.ncount ++;
25847+}
25848+
25849+static inline void dclust_init_extension(hint_t * hint)
25850+{
25851+ memset(&hint->ext_coord.extension.ctail, 0,
25852+ sizeof(hint->ext_coord.extension.ctail));
25853+}
25854+
25855+static inline int hint_is_unprepped_dclust(hint_t * hint)
25856+{
25857+ assert("edward-1451", hint_is_valid(hint));
25858+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25859+}
25860+
25861+static inline void coord_set_between_clusters(coord_t * coord)
25862+{
25863+#if REISER4_DEBUG
25864+ int result;
25865+ result = zload(coord->node);
25866+ assert("edward-1296", !result);
25867+#endif
25868+ if (!coord_is_between_items(coord)) {
25869+ coord->between = AFTER_ITEM;
25870+ coord->unit_pos = 0;
25871+ }
25872+#if REISER4_DEBUG
25873+ zrelse(coord->node);
25874+#endif
25875+}
25876+
25877+int reiser4_inflate_cluster(reiser4_cluster_t *, struct inode *);
25878+int find_disk_cluster(reiser4_cluster_t *, struct inode *, int read,
25879+ znode_lock_mode mode);
25880+int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *);
25881+int reiser4_deflate_cluster(reiser4_cluster_t *, struct inode *);
25882+void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t start,
25883+ int even_cows);
25884+void invalidate_hint_cluster(reiser4_cluster_t * clust);
25885+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
25886+ znode_lock_mode mode);
25887+int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
25888+ znode_lock_mode lock_mode);
25889+void reset_cluster_params(reiser4_cluster_t * clust);
25890+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
25891+ int count);
25892+int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
25893+ int capture);
25894+void reiser4_release_cluster_pages(reiser4_cluster_t *);
25895+void put_cluster_handle(reiser4_cluster_t * clust);
25896+int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id);
25897+int tfm_cluster_is_uptodate(tfm_cluster_t * tc);
25898+void tfm_cluster_set_uptodate(tfm_cluster_t * tc);
25899+void tfm_cluster_clr_uptodate(tfm_cluster_t * tc);
25900+
25901+/* move cluster handle to the target position
25902+ specified by the page of index @pgidx
25903+*/
25904+static inline void move_cluster_forward(reiser4_cluster_t * clust,
25905+ struct inode *inode,
25906+ pgoff_t pgidx)
25907+{
25908+ assert("edward-1297", clust != NULL);
25909+ assert("edward-1298", inode != NULL);
25910+
25911+ reset_cluster_params(clust);
25912+ if (clust->index_valid &&
25913+ /* Hole in the indices. Hint became invalid and can not be
25914+ used by find_cluster_item() even if seal/node versions
25915+ will coincide */
25916+ pg_to_clust(pgidx, inode) != clust->index + 1) {
25917+ reiser4_unset_hint(clust->hint);
25918+ invalidate_hint_cluster(clust);
25919+ }
25920+ clust->index = pg_to_clust(pgidx, inode);
25921+ clust->index_valid = 1;
25922+}
25923+
25924+static inline int
25925+alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode)
25926+{
25927+ assert("edward-791", clust != NULL);
25928+ assert("edward-792", inode != NULL);
25929+ clust->pages =
25930+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25931+ reiser4_ctx_gfp_mask_get());
25932+ if (!clust->pages)
25933+ return -ENOMEM;
25934+ return 0;
25935+}
25936+
25937+static inline void free_clust_pages(reiser4_cluster_t * clust)
25938+{
25939+ kfree(clust->pages);
25940+}
25941+
25942+#endif /* __FS_REISER4_CLUSTER_H__ */
25943+
25944+/* Make Linus happy.
25945+ Local variables:
25946+ c-indentation-style: "K&R"
25947+ mode-name: "LC"
25948+ c-basic-offset: 8
25949+ tab-width: 8
25950+ fill-column: 120
25951+ scroll-step: 1
25952+ End:
25953+*/
25954diff --git a/fs/reiser4/plugin/compress/Makefile b/fs/reiser4/plugin/compress/Makefile
25955new file mode 100644
25956index 0000000..82793a4
25957--- /dev/null
25958+++ b/fs/reiser4/plugin/compress/Makefile
25959@@ -0,0 +1,6 @@
25960+obj-$(CONFIG_REISER4_FS) += compress_plugins.o
25961+
25962+compress_plugins-objs := \
25963+ compress.o \
25964+ minilzo.o \
25965+ compress_mode.o
25966diff --git a/fs/reiser4/plugin/compress/compress.c b/fs/reiser4/plugin/compress/compress.c
25967new file mode 100644
25968index 0000000..7e64d0c
25969--- /dev/null
25970+++ b/fs/reiser4/plugin/compress/compress.c
25971@@ -0,0 +1,381 @@
25972+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25973+/* reiser4 compression transform plugins */
25974+
25975+#include "../../debug.h"
25976+#include "../../inode.h"
25977+#include "../plugin.h"
25978+#include "minilzo.h"
25979+
25980+#include <linux/zlib.h>
25981+#include <linux/types.h>
25982+#include <linux/hardirq.h>
25983+
25984+static int change_compression(struct inode *inode,
25985+ reiser4_plugin * plugin,
25986+ pset_member memb)
25987+{
25988+ assert("edward-1316", inode != NULL);
25989+ assert("edward-1317", plugin != NULL);
25990+ assert("edward-1318", is_reiser4_inode(inode));
25991+ assert("edward-1319",
25992+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25993+
25994+ /* cannot change compression plugin of already existing regular object */
25995+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25996+ return RETERR(-EINVAL);
25997+
25998+ /* If matches, nothing to change. */
25999+ if (inode_hash_plugin(inode) != NULL &&
26000+ inode_hash_plugin(inode)->h.id == plugin->h.id)
26001+ return 0;
26002+
26003+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
26004+ PSET_COMPRESSION, plugin);
26005+}
26006+
26007+static reiser4_plugin_ops compression_plugin_ops = {
26008+ .init = NULL,
26009+ .load = NULL,
26010+ .save_len = NULL,
26011+ .save = NULL,
26012+ .change = &change_compression
26013+};
26014+
26015+/******************************************************************************/
26016+/* gzip1 compression */
26017+/******************************************************************************/
26018+
26019+#define GZIP1_DEF_LEVEL Z_BEST_SPEED
26020+#define GZIP1_DEF_WINBITS 15
26021+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
26022+
26023+static int gzip1_init(void)
26024+{
26025+ int ret = -EINVAL;
26026+#if REISER4_ZLIB
26027+ ret = 0;
26028+#endif
26029+ if (ret == -EINVAL)
26030+ warning("edward-1337", "Zlib not compiled into kernel");
26031+ return ret;
26032+}
26033+
26034+static int gzip1_overrun(unsigned src_len UNUSED_ARG)
26035+{
26036+ return 0;
26037+}
26038+
26039+static coa_t gzip1_alloc(tfm_action act)
26040+{
26041+ coa_t coa = NULL;
26042+#if REISER4_ZLIB
26043+ int ret = 0;
26044+ switch (act) {
26045+ case TFMA_WRITE: /* compress */
26046+ coa = reiser4_vmalloc(zlib_deflate_workspacesize());
26047+ if (!coa) {
26048+ ret = -ENOMEM;
26049+ break;
26050+ }
26051+ memset(coa, 0, zlib_deflate_workspacesize());
26052+ break;
26053+ case TFMA_READ: /* decompress */
26054+ coa = reiser4_vmalloc(zlib_inflate_workspacesize());
26055+ if (!coa) {
26056+ ret = -ENOMEM;
26057+ break;
26058+ }
26059+ memset(coa, 0, zlib_inflate_workspacesize());
26060+ break;
26061+ default:
26062+ impossible("edward-767",
26063+ "trying to alloc workspace for unknown tfm action");
26064+ }
26065+ if (ret) {
26066+ warning("edward-768",
26067+ "alloc workspace for gzip1 (tfm action = %d) failed\n",
26068+ act);
26069+ return ERR_PTR(ret);
26070+ }
26071+#endif
26072+ return coa;
26073+}
26074+
26075+static void gzip1_free(coa_t coa, tfm_action act)
26076+{
26077+ assert("edward-769", coa != NULL);
26078+
26079+ switch (act) {
26080+ case TFMA_WRITE: /* compress */
26081+ vfree(coa);
26082+ break;
26083+ case TFMA_READ: /* decompress */
26084+ vfree(coa);
26085+ break;
26086+ default:
26087+ impossible("edward-770", "unknown tfm action");
26088+ }
26089+ return;
26090+}
26091+
26092+static int gzip1_min_size_deflate(void)
26093+{
26094+ return 64;
26095+}
26096+
26097+static void
26098+gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26099+ __u8 * dst_first, unsigned *dst_len)
26100+{
26101+#if REISER4_ZLIB
26102+ int ret = 0;
26103+ struct z_stream_s stream;
26104+
26105+ memset(&stream, 0, sizeof(stream));
26106+
26107+ assert("edward-842", coa != NULL);
26108+ assert("edward-875", src_len != 0);
26109+
26110+ stream.workspace = coa;
26111+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
26112+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
26113+ Z_DEFAULT_STRATEGY);
26114+ if (ret != Z_OK) {
26115+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
26116+ goto rollback;
26117+ }
26118+ ret = zlib_deflateReset(&stream);
26119+ if (ret != Z_OK) {
26120+ warning("edward-772", "zlib_deflateReset returned %d\n", ret);
26121+ goto rollback;
26122+ }
26123+ stream.next_in = src_first;
26124+ stream.avail_in = src_len;
26125+ stream.next_out = dst_first;
26126+ stream.avail_out = *dst_len;
26127+
26128+ ret = zlib_deflate(&stream, Z_FINISH);
26129+ if (ret != Z_STREAM_END) {
26130+ if (ret != Z_OK)
26131+ warning("edward-773",
26132+ "zlib_deflate returned %d\n", ret);
26133+ goto rollback;
26134+ }
26135+ *dst_len = stream.total_out;
26136+ return;
26137+ rollback:
26138+ *dst_len = src_len;
26139+#endif
26140+ return;
26141+}
26142+
26143+static void
26144+gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26145+ __u8 * dst_first, unsigned *dst_len)
26146+{
26147+#if REISER4_ZLIB
26148+ int ret = 0;
26149+ struct z_stream_s stream;
26150+
26151+ memset(&stream, 0, sizeof(stream));
26152+
26153+ assert("edward-843", coa != NULL);
26154+ assert("edward-876", src_len != 0);
26155+
26156+ stream.workspace = coa;
26157+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
26158+ if (ret != Z_OK) {
26159+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
26160+ return;
26161+ }
26162+ ret = zlib_inflateReset(&stream);
26163+ if (ret != Z_OK) {
26164+ warning("edward-775", "zlib_inflateReset returned %d\n", ret);
26165+ return;
26166+ }
26167+
26168+ stream.next_in = src_first;
26169+ stream.avail_in = src_len;
26170+ stream.next_out = dst_first;
26171+ stream.avail_out = *dst_len;
26172+
26173+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
26174+ /*
26175+ * Work around a bug in zlib, which sometimes wants to taste an extra
26176+ * byte when being used in the (undocumented) raw deflate mode.
26177+ * (From USAGI).
26178+ */
26179+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
26180+ u8 zerostuff = 0;
26181+ stream.next_in = &zerostuff;
26182+ stream.avail_in = 1;
26183+ ret = zlib_inflate(&stream, Z_FINISH);
26184+ }
26185+ if (ret != Z_STREAM_END) {
26186+ warning("edward-776", "zlib_inflate returned %d\n", ret);
26187+ return;
26188+ }
26189+ *dst_len = stream.total_out;
26190+#endif
26191+ return;
26192+}
26193+
26194+/******************************************************************************/
26195+/* lzo1 compression */
26196+/******************************************************************************/
26197+
26198+static int lzo1_init(void)
26199+{
26200+ int ret;
26201+ ret = lzo_init();
26202+ if (ret != LZO_E_OK)
26203+ warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
26204+ return ret;
26205+}
26206+
26207+static int lzo1_overrun(unsigned in_len)
26208+{
26209+ return in_len / 64 + 16 + 3;
26210+}
26211+
26212+#define LZO_HEAP_SIZE(size) \
26213+ sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
26214+
26215+static coa_t lzo1_alloc(tfm_action act)
26216+{
26217+ int ret = 0;
26218+ coa_t coa = NULL;
26219+
26220+ switch (act) {
26221+ case TFMA_WRITE: /* compress */
26222+ coa = reiser4_vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
26223+ if (!coa) {
26224+ ret = -ENOMEM;
26225+ break;
26226+ }
26227+ memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
26228+ case TFMA_READ: /* decompress */
26229+ break;
26230+ default:
26231+ impossible("edward-877",
26232+ "trying to alloc workspace for unknown tfm action");
26233+ }
26234+ if (ret) {
26235+ warning("edward-878",
26236+ "alloc workspace for lzo1 (tfm action = %d) failed\n",
26237+ act);
26238+ return ERR_PTR(ret);
26239+ }
26240+ return coa;
26241+}
26242+
26243+static void lzo1_free(coa_t coa, tfm_action act)
26244+{
26245+ assert("edward-879", coa != NULL);
26246+
26247+ switch (act) {
26248+ case TFMA_WRITE: /* compress */
26249+ vfree(coa);
26250+ break;
26251+ case TFMA_READ: /* decompress */
26252+ impossible("edward-1304",
26253+ "trying to free non-allocated workspace");
26254+ default:
26255+ impossible("edward-880", "unknown tfm action");
26256+ }
26257+ return;
26258+}
26259+
26260+static int lzo1_min_size_deflate(void)
26261+{
26262+ return 256;
26263+}
26264+
26265+static void
26266+lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26267+ __u8 * dst_first, unsigned *dst_len)
26268+{
26269+ int result;
26270+
26271+ assert("edward-846", coa != NULL);
26272+ assert("edward-847", src_len != 0);
26273+
26274+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
26275+ if (result != LZO_E_OK) {
26276+ warning("edward-849", "lzo1x_1_compress failed\n");
26277+ goto out;
26278+ }
26279+ if (*dst_len >= src_len) {
26280+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
26281+ goto out;
26282+ }
26283+ return;
26284+ out:
26285+ *dst_len = src_len;
26286+ return;
26287+}
26288+
26289+static void
26290+lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26291+ __u8 * dst_first, unsigned *dst_len)
26292+{
26293+ int result;
26294+
26295+ assert("edward-851", coa == NULL);
26296+ assert("edward-852", src_len != 0);
26297+
26298+ result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
26299+ if (result != LZO_E_OK)
26300+ warning("edward-853", "lzo1x_1_decompress failed\n");
26301+ return;
26302+}
26303+
26304+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
26305+ [LZO1_COMPRESSION_ID] = {
26306+ .h = {
26307+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26308+ .id = LZO1_COMPRESSION_ID,
26309+ .pops = &compression_plugin_ops,
26310+ .label = "lzo1",
26311+ .desc = "lzo1 compression transform",
26312+ .linkage = {NULL, NULL}
26313+ },
26314+ .init = lzo1_init,
26315+ .overrun = lzo1_overrun,
26316+ .alloc = lzo1_alloc,
26317+ .free = lzo1_free,
26318+ .min_size_deflate = lzo1_min_size_deflate,
26319+ .checksum = reiser4_adler32,
26320+ .compress = lzo1_compress,
26321+ .decompress = lzo1_decompress
26322+ },
26323+ [GZIP1_COMPRESSION_ID] = {
26324+ .h = {
26325+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26326+ .id = GZIP1_COMPRESSION_ID,
26327+ .pops = &compression_plugin_ops,
26328+ .label = "gzip1",
26329+ .desc = "gzip1 compression transform",
26330+ .linkage = {NULL, NULL}
26331+ },
26332+ .init = gzip1_init,
26333+ .overrun = gzip1_overrun,
26334+ .alloc = gzip1_alloc,
26335+ .free = gzip1_free,
26336+ .min_size_deflate = gzip1_min_size_deflate,
26337+ .checksum = reiser4_adler32,
26338+ .compress = gzip1_compress,
26339+ .decompress = gzip1_decompress
26340+ }
26341+};
26342+
26343+/*
26344+ Local variables:
26345+ c-indentation-style: "K&R"
26346+ mode-name: "LC"
26347+ c-basic-offset: 8
26348+ tab-width: 8
26349+ fill-column: 120
26350+ scroll-step: 1
26351+ End:
26352+*/
26353diff --git a/fs/reiser4/plugin/compress/compress.h b/fs/reiser4/plugin/compress/compress.h
26354new file mode 100644
26355index 0000000..922ca0b
26356--- /dev/null
26357+++ b/fs/reiser4/plugin/compress/compress.h
26358@@ -0,0 +1,38 @@
26359+#if !defined( __FS_REISER4_COMPRESS_H__ )
26360+#define __FS_REISER4_COMPRESS_H__
26361+
26362+#include <linux/types.h>
26363+#include <linux/string.h>
26364+
26365+typedef enum {
26366+ TFMA_READ,
26367+ TFMA_WRITE,
26368+ TFMA_LAST
26369+} tfm_action;
26370+
26371+/* builtin compression plugins */
26372+
26373+typedef enum {
26374+ LZO1_COMPRESSION_ID,
26375+ GZIP1_COMPRESSION_ID,
26376+ LAST_COMPRESSION_ID,
26377+} reiser4_compression_id;
26378+
26379+typedef unsigned long cloff_t;
26380+typedef void *coa_t;
26381+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
26382+
26383+__u32 reiser4_adler32(char *data, __u32 len);
26384+
26385+#endif /* __FS_REISER4_COMPRESS_H__ */
26386+
26387+/* Make Linus happy.
26388+ Local variables:
26389+ c-indentation-style: "K&R"
26390+ mode-name: "LC"
26391+ c-basic-offset: 8
26392+ tab-width: 8
26393+ fill-column: 120
26394+ scroll-step: 1
26395+ End:
26396+*/
26397diff --git a/fs/reiser4/plugin/compress/compress_mode.c b/fs/reiser4/plugin/compress/compress_mode.c
26398new file mode 100644
26399index 0000000..2ae7856
26400--- /dev/null
26401+++ b/fs/reiser4/plugin/compress/compress_mode.c
26402@@ -0,0 +1,162 @@
26403+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26404+/* This file contains Reiser4 compression mode plugins.
26405+
26406+ Compression mode plugin is a set of handlers called by compressor
26407+ at flush time and represent some heuristics including the ones
26408+ which are to avoid compression of incompressible data, see
26409+ http://www.namesys.com/cryptcompress_design.html for more details.
26410+*/
26411+#include "../../inode.h"
26412+#include "../plugin.h"
26413+
26414+static int should_deflate_none(struct inode * inode, cloff_t index)
26415+{
26416+ return 0;
26417+}
26418+
26419+static int should_deflate_common(struct inode * inode, cloff_t index)
26420+{
26421+ return compression_is_on(cryptcompress_inode_data(inode));
26422+}
26423+
26424+static int discard_hook_ultim(struct inode *inode, cloff_t index)
26425+{
26426+ turn_off_compression(cryptcompress_inode_data(inode));
26427+ return 0;
26428+}
26429+
26430+static int discard_hook_lattd(struct inode *inode, cloff_t index)
26431+{
26432+ cryptcompress_info_t * info = cryptcompress_inode_data(inode);
26433+
26434+ assert("edward-1462",
26435+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26436+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26437+
26438+ turn_off_compression(info);
26439+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26440+ set_lattice_factor(info, get_lattice_factor(info) << 1);
26441+ return 0;
26442+}
26443+
26444+static int accept_hook_lattd(struct inode *inode, cloff_t index)
26445+{
26446+ turn_on_compression(cryptcompress_inode_data(inode));
26447+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26448+ return 0;
26449+}
26450+
26451+/* Check on dynamic lattice, the adaptive compression modes which
26452+ defines the following behavior:
26453+
26454+ Compression is on: try to compress everything and turn
26455+ it off, whenever cluster is incompressible.
26456+
26457+ Compression is off: try to compress clusters of indexes
26458+ k * FACTOR (k = 0, 1, 2, ...) and turn it on, if some of
26459+ them is compressible. If incompressible, then increase FACTOR */
26460+
26461+/* check if @index belongs to one-dimensional lattice
26462+ of sparce factor @factor */
26463+static int is_on_lattice(cloff_t index, int factor)
26464+{
26465+ return (factor ? index % factor == 0: index == 0);
26466+}
26467+
26468+static int should_deflate_lattd(struct inode * inode, cloff_t index)
26469+{
26470+ return should_deflate_common(inode, index) ||
26471+ is_on_lattice(index,
26472+ get_lattice_factor
26473+ (cryptcompress_inode_data(inode)));
26474+}
26475+
26476+/* compression mode_plugins */
26477+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26478+ [NONE_COMPRESSION_MODE_ID] = {
26479+ .h = {
26480+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26481+ .id = NONE_COMPRESSION_MODE_ID,
26482+ .pops = NULL,
26483+ .label = "none",
26484+ .desc = "Compress nothing",
26485+ .linkage = {NULL, NULL}
26486+ },
26487+ .should_deflate = should_deflate_none,
26488+ .accept_hook = NULL,
26489+ .discard_hook = NULL
26490+ },
26491+ /* Check-on-dynamic-lattice adaptive compression mode */
26492+ [LATTD_COMPRESSION_MODE_ID] = {
26493+ .h = {
26494+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26495+ .id = LATTD_COMPRESSION_MODE_ID,
26496+ .pops = NULL,
26497+ .label = "lattd",
26498+ .desc = "Check on dynamic lattice",
26499+ .linkage = {NULL, NULL}
26500+ },
26501+ .should_deflate = should_deflate_lattd,
26502+ .accept_hook = accept_hook_lattd,
26503+ .discard_hook = discard_hook_lattd
26504+ },
26505+ /* Check-ultimately compression mode:
26506+ Turn off compression forever as soon as we meet
26507+ incompressible data */
26508+ [ULTIM_COMPRESSION_MODE_ID] = {
26509+ .h = {
26510+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26511+ .id = ULTIM_COMPRESSION_MODE_ID,
26512+ .pops = NULL,
26513+ .label = "ultim",
26514+ .desc = "Check ultimately",
26515+ .linkage = {NULL, NULL}
26516+ },
26517+ .should_deflate = should_deflate_common,
26518+ .accept_hook = NULL,
26519+ .discard_hook = discard_hook_ultim
26520+ },
26521+ /* Force-to-compress-everything compression mode */
26522+ [FORCE_COMPRESSION_MODE_ID] = {
26523+ .h = {
26524+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26525+ .id = FORCE_COMPRESSION_MODE_ID,
26526+ .pops = NULL,
26527+ .label = "force",
26528+ .desc = "Force to compress everything",
26529+ .linkage = {NULL, NULL}
26530+ },
26531+ .should_deflate = NULL,
26532+ .accept_hook = NULL,
26533+ .discard_hook = NULL
26534+ },
26535+ /* Convert-to-extent compression mode.
26536+ In this mode items will be converted to extents and management
26537+ will be passed to (classic) unix file plugin as soon as ->write()
26538+ detects that the first complete logical cluster (of index #0) is
26539+ incompressible. */
26540+ [CONVX_COMPRESSION_MODE_ID] = {
26541+ .h = {
26542+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26543+ .id = CONVX_COMPRESSION_MODE_ID,
26544+ .pops = NULL,
26545+ .label = "conv",
26546+ .desc = "Convert to extent",
26547+ .linkage = {NULL, NULL}
26548+ },
26549+ .should_deflate = should_deflate_common,
26550+ .accept_hook = NULL,
26551+ .discard_hook = NULL
26552+ }
26553+};
26554+
26555+/*
26556+ Local variables:
26557+ c-indentation-style: "K&R"
26558+ mode-name: "LC"
26559+ c-basic-offset: 8
26560+ tab-width: 8
26561+ fill-column: 120
26562+ scroll-step: 1
26563+ End:
26564+*/
26565diff --git a/fs/reiser4/plugin/compress/lzoconf.h b/fs/reiser4/plugin/compress/lzoconf.h
26566new file mode 100644
26567index 0000000..cc0fa4d
26568--- /dev/null
26569+++ b/fs/reiser4/plugin/compress/lzoconf.h
26570@@ -0,0 +1,216 @@
26571+/* lzoconf.h -- configuration for the LZO real-time data compression library
26572+ adopted for reiser4 compression transform plugin.
26573+
26574+ This file is part of the LZO real-time data compression library
26575+ and not included in any proprietary licenses of reiser4.
26576+
26577+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26578+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26579+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26580+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26581+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26582+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26583+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26584+ All Rights Reserved.
26585+
26586+ The LZO library is free software; you can redistribute it and/or
26587+ modify it under the terms of the GNU General Public License as
26588+ published by the Free Software Foundation; either version 2 of
26589+ the License, or (at your option) any later version.
26590+
26591+ The LZO library is distributed in the hope that it will be useful,
26592+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26593+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26594+ GNU General Public License for more details.
26595+
26596+ You should have received a copy of the GNU General Public License
26597+ along with the LZO library; see the file COPYING.
26598+ If not, write to the Free Software Foundation, Inc.,
26599+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26600+
26601+ Markus F.X.J. Oberhumer
26602+ <markus@oberhumer.com>
26603+ http://www.oberhumer.com/opensource/lzo/
26604+ */
26605+
26606+#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26607+
26608+#ifndef __LZOCONF_H
26609+#define __LZOCONF_H
26610+
26611+#define LZO_VERSION 0x1080
26612+#define LZO_VERSION_STRING "1.08"
26613+#define LZO_VERSION_DATE "Jul 12 2002"
26614+
26615+/* internal Autoconf configuration file - only used when building LZO */
26616+
26617+/***********************************************************************
26618+// LZO requires a conforming <limits.h>
26619+************************************************************************/
26620+
26621+#define CHAR_BIT 8
26622+#define USHRT_MAX 0xffff
26623+
26624+/* workaround a cpp bug under hpux 10.20 */
26625+#define LZO_0xffffffffL 4294967295ul
26626+
26627+/***********************************************************************
26628+// architecture defines
26629+************************************************************************/
26630+
26631+#if !defined(__LZO_i386)
26632+# if defined(__i386__) || defined(__386__) || defined(_M_IX86)
26633+# define __LZO_i386
26634+# endif
26635+#endif
26636+
26637+/* memory checkers */
26638+#if !defined(__LZO_CHECKER)
26639+# if defined(__BOUNDS_CHECKING_ON)
26640+# define __LZO_CHECKER
26641+# elif defined(__CHECKER__)
26642+# define __LZO_CHECKER
26643+# elif defined(__INSURE__)
26644+# define __LZO_CHECKER
26645+# elif defined(__PURIFY__)
26646+# define __LZO_CHECKER
26647+# endif
26648+#endif
26649+
26650+/***********************************************************************
26651+// integral and pointer types
26652+************************************************************************/
26653+
26654+/* Integral types with 32 bits or more */
26655+#if !defined(LZO_UINT32_MAX)
26656+# if (UINT_MAX >= LZO_0xffffffffL)
26657+ typedef unsigned int lzo_uint32;
26658+ typedef int lzo_int32;
26659+# define LZO_UINT32_MAX UINT_MAX
26660+# define LZO_INT32_MAX INT_MAX
26661+# define LZO_INT32_MIN INT_MIN
26662+# elif (ULONG_MAX >= LZO_0xffffffffL)
26663+ typedef unsigned long lzo_uint32;
26664+ typedef long lzo_int32;
26665+# define LZO_UINT32_MAX ULONG_MAX
26666+# define LZO_INT32_MAX LONG_MAX
26667+# define LZO_INT32_MIN LONG_MIN
26668+# else
26669+# error "lzo_uint32"
26670+# endif
26671+#endif
26672+
26673+/* lzo_uint is used like size_t */
26674+#if !defined(LZO_UINT_MAX)
26675+# if (UINT_MAX >= LZO_0xffffffffL)
26676+ typedef unsigned int lzo_uint;
26677+ typedef int lzo_int;
26678+# define LZO_UINT_MAX UINT_MAX
26679+# define LZO_INT_MAX INT_MAX
26680+# define LZO_INT_MIN INT_MIN
26681+# elif (ULONG_MAX >= LZO_0xffffffffL)
26682+ typedef unsigned long lzo_uint;
26683+ typedef long lzo_int;
26684+# define LZO_UINT_MAX ULONG_MAX
26685+# define LZO_INT_MAX LONG_MAX
26686+# define LZO_INT_MIN LONG_MIN
26687+# else
26688+# error "lzo_uint"
26689+# endif
26690+#endif
26691+
26692+ typedef int lzo_bool;
26693+
26694+/***********************************************************************
26695+// memory models
26696+************************************************************************/
26697+
26698+/* Memory model that allows to access memory at offsets of lzo_uint. */
26699+#if !defined(__LZO_MMODEL)
26700+# if (LZO_UINT_MAX <= UINT_MAX)
26701+# define __LZO_MMODEL
26702+# else
26703+# error "__LZO_MMODEL"
26704+# endif
26705+#endif
26706+
26707+/* no typedef here because of const-pointer issues */
26708+#define lzo_byte unsigned char __LZO_MMODEL
26709+#define lzo_bytep unsigned char __LZO_MMODEL *
26710+#define lzo_charp char __LZO_MMODEL *
26711+#define lzo_voidp void __LZO_MMODEL *
26712+#define lzo_shortp short __LZO_MMODEL *
26713+#define lzo_ushortp unsigned short __LZO_MMODEL *
26714+#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26715+#define lzo_int32p lzo_int32 __LZO_MMODEL *
26716+#define lzo_uintp lzo_uint __LZO_MMODEL *
26717+#define lzo_intp lzo_int __LZO_MMODEL *
26718+#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26719+#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26720+
26721+#ifndef lzo_sizeof_dict_t
26722+# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26723+#endif
26724+
26725+typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26726+ lzo_byte * dst, lzo_uintp dst_len,
26727+ lzo_voidp wrkmem);
26728+
26729+
26730+/***********************************************************************
26731+// error codes and prototypes
26732+************************************************************************/
26733+
26734+/* Error codes for the compression/decompression functions. Negative
26735+ * values are errors, positive values will be used for special but
26736+ * normal events.
26737+ */
26738+#define LZO_E_OK 0
26739+#define LZO_E_ERROR (-1)
26740+#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26741+#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26742+#define LZO_E_INPUT_OVERRUN (-4)
26743+#define LZO_E_OUTPUT_OVERRUN (-5)
26744+#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26745+#define LZO_E_EOF_NOT_FOUND (-7)
26746+#define LZO_E_INPUT_NOT_CONSUMED (-8)
26747+
26748+/* lzo_init() should be the first function you call.
26749+ * Check the return code !
26750+ *
26751+ * lzo_init() is a macro to allow checking that the library and the
26752+ * compiler's view of various types are consistent.
26753+ */
26754+#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26755+ (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26756+ (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26757+ (int)sizeof(lzo_compress_t))
26758+ extern int __lzo_init2(unsigned, int, int, int, int, int, int,
26759+ int, int, int);
26760+
26761+/* checksum functions */
26762+extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf,
26763+ lzo_uint _len);
26764+/* misc. */
26765+ typedef union {
26766+ lzo_bytep p;
26767+ lzo_uint u;
26768+ } __lzo_pu_u;
26769+ typedef union {
26770+ lzo_bytep p;
26771+ lzo_uint32 u32;
26772+ } __lzo_pu32_u;
26773+ typedef union {
26774+ void *vp;
26775+ lzo_bytep bp;
26776+ lzo_uint32 u32;
26777+ long l;
26778+ } lzo_align_t;
26779+
26780+#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26781+ ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26782+
26783+/* deprecated - only for backward compatibility */
26784+#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26785+
26786+#endif /* already included */
26787diff --git a/fs/reiser4/plugin/compress/minilzo.c b/fs/reiser4/plugin/compress/minilzo.c
26788new file mode 100644
26789index 0000000..2dba187
26790--- /dev/null
26791+++ b/fs/reiser4/plugin/compress/minilzo.c
26792@@ -0,0 +1,1967 @@
26793+/* minilzo.c -- mini subset of the LZO real-time data compression library
26794+ adopted for reiser4 compression transform plugin.
26795+
26796+ This file is part of the LZO real-time data compression library
26797+ and not included in any proprietary licenses of reiser4.
26798+
26799+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26800+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26801+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26802+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26803+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26804+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26805+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26806+ All Rights Reserved.
26807+
26808+ The LZO library is free software; you can redistribute it and/or
26809+ modify it under the terms of the GNU General Public License as
26810+ published by the Free Software Foundation; either version 2 of
26811+ the License, or (at your option) any later version.
26812+
26813+ The LZO library is distributed in the hope that it will be useful,
26814+ but WITHOUT ANY WARRANTY; without even the implied warranty of
26815+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26816+ GNU General Public License for more details.
26817+
26818+ You should have received a copy of the GNU General Public License
26819+ along with the LZO library; see the file COPYING.
26820+ If not, write to the Free Software Foundation, Inc.,
26821+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26822+
26823+ Markus F.X.J. Oberhumer
26824+ <markus@oberhumer.com>
26825+ http://www.oberhumer.com/opensource/lzo/
26826+ */
26827+
26828+/*
26829+ * NOTE:
26830+ * the full LZO package can be found at
26831+ * http://www.oberhumer.com/opensource/lzo/
26832+ */
26833+
26834+#include "../../debug.h" /* for reiser4 assert macro -edward */
26835+
26836+#define __LZO_IN_MINILZO
26837+#define LZO_BUILD
26838+
26839+#include "minilzo.h"
26840+
26841+#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26842+# error "version mismatch in miniLZO source files"
26843+#endif
26844+
26845+#ifndef __LZO_CONF_H
26846+#define __LZO_CONF_H
26847+
26848+# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26849+# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26850+
26851+# define HAVE_MEMCMP
26852+# define HAVE_MEMCPY
26853+# define HAVE_MEMMOVE
26854+# define HAVE_MEMSET
26855+
26856+#undef NDEBUG
26857+#if !defined(LZO_DEBUG)
26858+# define NDEBUG
26859+#endif
26860+#if defined(LZO_DEBUG) || !defined(NDEBUG)
26861+# if !defined(NO_STDIO_H)
26862+# include <stdio.h>
26863+# endif
26864+#endif
26865+
26866+#if !defined(LZO_COMPILE_TIME_ASSERT)
26867+# define LZO_COMPILE_TIME_ASSERT(expr) \
26868+ { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26869+#endif
26870+
26871+#if !defined(LZO_UNUSED)
26872+# if 1
26873+# define LZO_UNUSED(var) ((void)&var)
26874+# elif 0
26875+# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26876+# else
26877+# define LZO_UNUSED(parm) (parm = parm)
26878+# endif
26879+#endif
26880+
26881+#if defined(NO_MEMCMP)
26882+# undef HAVE_MEMCMP
26883+#endif
26884+
26885+#if !defined(HAVE_MEMSET)
26886+# undef memset
26887+# define memset lzo_memset
26888+#endif
26889+
26890+# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26891+
26892+#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26893+#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26894+#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26895+#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26896+
26897+#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26898+
26899+#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26900+
26901+#define LZO_SIZE(bits) (1u << (bits))
26902+#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26903+
26904+#define LZO_LSIZE(bits) (1ul << (bits))
26905+#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26906+
26907+#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26908+#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26909+
26910+#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26911+#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26912+
26913+#if !defined(SIZEOF_UNSIGNED)
26914+# if (UINT_MAX == 0xffff)
26915+# define SIZEOF_UNSIGNED 2
26916+# elif (UINT_MAX == LZO_0xffffffffL)
26917+# define SIZEOF_UNSIGNED 4
26918+# elif (UINT_MAX >= LZO_0xffffffffL)
26919+# define SIZEOF_UNSIGNED 8
26920+# else
26921+# error "SIZEOF_UNSIGNED"
26922+# endif
26923+#endif
26924+
26925+#if !defined(SIZEOF_UNSIGNED_LONG)
26926+# if (ULONG_MAX == LZO_0xffffffffL)
26927+# define SIZEOF_UNSIGNED_LONG 4
26928+# elif (ULONG_MAX >= LZO_0xffffffffL)
26929+# define SIZEOF_UNSIGNED_LONG 8
26930+# else
26931+# error "SIZEOF_UNSIGNED_LONG"
26932+# endif
26933+#endif
26934+
26935+#if !defined(SIZEOF_SIZE_T)
26936+# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26937+#endif
26938+#if !defined(SIZE_T_MAX)
26939+# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26940+#endif
26941+
26942+#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26943+# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26944+# define LZO_UNALIGNED_OK_2
26945+# endif
26946+# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26947+# define LZO_UNALIGNED_OK_4
26948+# endif
26949+#endif
26950+
26951+#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26952+# if !defined(LZO_UNALIGNED_OK)
26953+# define LZO_UNALIGNED_OK
26954+# endif
26955+#endif
26956+
26957+#if defined(__LZO_NO_UNALIGNED)
26958+# undef LZO_UNALIGNED_OK
26959+# undef LZO_UNALIGNED_OK_2
26960+# undef LZO_UNALIGNED_OK_4
26961+#endif
26962+
26963+#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26964+# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26965+#endif
26966+#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26967+# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26968+#endif
26969+
26970+#if defined(__LZO_NO_ALIGNED)
26971+# undef LZO_ALIGNED_OK_4
26972+#endif
26973+
26974+#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26975+# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26976+#endif
26977+
26978+#define LZO_LITTLE_ENDIAN 1234
26979+#define LZO_BIG_ENDIAN 4321
26980+#define LZO_PDP_ENDIAN 3412
26981+
26982+#if !defined(LZO_BYTE_ORDER)
26983+# if defined(MFX_BYTE_ORDER)
26984+# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26985+# elif defined(__LZO_i386)
26986+# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26987+# elif defined(BYTE_ORDER)
26988+# define LZO_BYTE_ORDER BYTE_ORDER
26989+# elif defined(__BYTE_ORDER)
26990+# define LZO_BYTE_ORDER __BYTE_ORDER
26991+# endif
26992+#endif
26993+
26994+#if defined(LZO_BYTE_ORDER)
26995+# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26996+ (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26997+# error "invalid LZO_BYTE_ORDER"
26998+# endif
26999+#endif
27000+
27001+#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
27002+# error "LZO_BYTE_ORDER is not defined"
27003+#endif
27004+
27005+#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
27006+
27007+#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
27008+# if defined(__GNUC__) && defined(__i386__)
27009+# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
27010+# define LZO_OPTIMIZE_GNUC_i386
27011+# endif
27012+# endif
27013+#endif
27014+
27015+extern const lzo_uint32 _lzo_crc32_table[256];
27016+
27017+#define _LZO_STRINGIZE(x) #x
27018+#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
27019+
27020+#define _LZO_CONCAT2(a,b) a ## b
27021+#define _LZO_CONCAT3(a,b,c) a ## b ## c
27022+#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
27023+#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
27024+
27025+#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
27026+#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
27027+#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
27028+#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
27029+
27030+#ifndef __LZO_PTR_H
27031+#define __LZO_PTR_H
27032+
27033+#if !defined(lzo_ptrdiff_t)
27034+# if (UINT_MAX >= LZO_0xffffffffL)
27035+typedef ptrdiff_t lzo_ptrdiff_t;
27036+# else
27037+typedef long lzo_ptrdiff_t;
27038+# endif
27039+#endif
27040+
27041+#if !defined(__LZO_HAVE_PTR_T)
27042+# if defined(lzo_ptr_t)
27043+# define __LZO_HAVE_PTR_T
27044+# endif
27045+#endif
27046+#if !defined(__LZO_HAVE_PTR_T)
27047+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
27048+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
27049+typedef unsigned long lzo_ptr_t;
27050+typedef long lzo_sptr_t;
27051+# define __LZO_HAVE_PTR_T
27052+# endif
27053+# endif
27054+#endif
27055+#if !defined(__LZO_HAVE_PTR_T)
27056+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
27057+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
27058+typedef unsigned int lzo_ptr_t;
27059+typedef int lzo_sptr_t;
27060+# define __LZO_HAVE_PTR_T
27061+# endif
27062+# endif
27063+#endif
27064+#if !defined(__LZO_HAVE_PTR_T)
27065+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
27066+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
27067+typedef unsigned short lzo_ptr_t;
27068+typedef short lzo_sptr_t;
27069+# define __LZO_HAVE_PTR_T
27070+# endif
27071+# endif
27072+#endif
27073+#if !defined(__LZO_HAVE_PTR_T)
27074+# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
27075+# error "no suitable type for lzo_ptr_t"
27076+# else
27077+typedef unsigned long lzo_ptr_t;
27078+typedef long lzo_sptr_t;
27079+# define __LZO_HAVE_PTR_T
27080+# endif
27081+#endif
27082+
27083+#define PTR(a) ((lzo_ptr_t) (a))
27084+#define PTR_LINEAR(a) PTR(a)
27085+#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
27086+#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
27087+#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
27088+#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
27089+
27090+#define PTR_LT(a,b) (PTR(a) < PTR(b))
27091+#define PTR_GE(a,b) (PTR(a) >= PTR(b))
27092+#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
27093+#define pd(a,b) ((lzo_uint) ((a)-(b)))
27094+
27095+typedef union {
27096+ char a_char;
27097+ unsigned char a_uchar;
27098+ short a_short;
27099+ unsigned short a_ushort;
27100+ int a_int;
27101+ unsigned int a_uint;
27102+ long a_long;
27103+ unsigned long a_ulong;
27104+ lzo_int a_lzo_int;
27105+ lzo_uint a_lzo_uint;
27106+ lzo_int32 a_lzo_int32;
27107+ lzo_uint32 a_lzo_uint32;
27108+ ptrdiff_t a_ptrdiff_t;
27109+ lzo_ptrdiff_t a_lzo_ptrdiff_t;
27110+ lzo_ptr_t a_lzo_ptr_t;
27111+ lzo_voidp a_lzo_voidp;
27112+ void *a_void_p;
27113+ lzo_bytep a_lzo_bytep;
27114+ lzo_bytepp a_lzo_bytepp;
27115+ lzo_uintp a_lzo_uintp;
27116+ lzo_uint *a_lzo_uint_p;
27117+ lzo_uint32p a_lzo_uint32p;
27118+ lzo_uint32 *a_lzo_uint32_p;
27119+ unsigned char *a_uchar_p;
27120+ char *a_char_p;
27121+} lzo_full_align_t;
27122+
27123+#endif
27124+#define LZO_DETERMINISTIC
27125+#define LZO_DICT_USE_PTR
27126+# define lzo_dict_t const lzo_bytep
27127+# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
27128+#if !defined(lzo_moff_t)
27129+#define lzo_moff_t lzo_uint
27130+#endif
27131+#endif
27132+static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
27133+{
27134+ return PTR_LINEAR(ptr);
27135+}
27136+
27137+static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
27138+{
27139+ lzo_ptr_t p, s, n;
27140+
27141+ assert("lzo-01", size > 0);
27142+
27143+ p = __lzo_ptr_linear(ptr);
27144+ s = (lzo_ptr_t) (size - 1);
27145+ n = (((p + s) / size) * size) - p;
27146+
27147+ assert("lzo-02", (long)n >= 0);
27148+ assert("lzo-03", n <= s);
27149+
27150+ return (unsigned)n;
27151+}
27152+
27153+#ifndef __LZO_UTIL_H
27154+#define __LZO_UTIL_H
27155+
27156+#ifndef __LZO_CONF_H
27157+#endif
27158+
27159+#if 1 && defined(HAVE_MEMCPY)
27160+#define MEMCPY8_DS(dest,src,len) \
27161+ memcpy(dest,src,len); \
27162+ dest += len; \
27163+ src += len
27164+#endif
27165+
27166+#if !defined(MEMCPY8_DS)
27167+
27168+#define MEMCPY8_DS(dest,src,len) \
27169+ { register lzo_uint __l = (len) / 8; \
27170+ do { \
27171+ *dest++ = *src++; \
27172+ *dest++ = *src++; \
27173+ *dest++ = *src++; \
27174+ *dest++ = *src++; \
27175+ *dest++ = *src++; \
27176+ *dest++ = *src++; \
27177+ *dest++ = *src++; \
27178+ *dest++ = *src++; \
27179+ } while (--__l > 0); }
27180+
27181+#endif
27182+
27183+#define MEMCPY_DS(dest,src,len) \
27184+ do *dest++ = *src++; \
27185+ while (--len > 0)
27186+
27187+#define MEMMOVE_DS(dest,src,len) \
27188+ do *dest++ = *src++; \
27189+ while (--len > 0)
27190+
27191+#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
27192+
27193+#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
27194+
27195+#else
27196+
27197+#define BZERO8_PTR(s,l,n) \
27198+ lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
27199+
27200+#endif
27201+#endif
27202+
27203+/* If you use the LZO library in a product, you *must* keep this
27204+ * copyright string in the executable of your product.
27205+ */
27206+
27207+static const lzo_byte __lzo_copyright[] =
27208+#if !defined(__LZO_IN_MINLZO)
27209+ LZO_VERSION_STRING;
27210+#else
27211+ "\n\n\n"
27212+ "LZO real-time data compression library.\n"
27213+ "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
27214+ "<markus.oberhumer@jk.uni-linz.ac.at>\n"
27215+ "http://www.oberhumer.com/opensource/lzo/\n"
27216+ "\n"
27217+ "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
27218+ "LZO build date: " __DATE__ " " __TIME__ "\n\n"
27219+ "LZO special compilation options:\n"
27220+#ifdef __cplusplus
27221+ " __cplusplus\n"
27222+#endif
27223+#if defined(__PIC__)
27224+ " __PIC__\n"
27225+#elif defined(__pic__)
27226+ " __pic__\n"
27227+#endif
27228+#if (UINT_MAX < LZO_0xffffffffL)
27229+ " 16BIT\n"
27230+#endif
27231+#if defined(__LZO_STRICT_16BIT)
27232+ " __LZO_STRICT_16BIT\n"
27233+#endif
27234+#if (UINT_MAX > LZO_0xffffffffL)
27235+ " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
27236+#endif
27237+#if (ULONG_MAX > LZO_0xffffffffL)
27238+ " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
27239+#endif
27240+#if defined(LZO_BYTE_ORDER)
27241+ " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
27242+#endif
27243+#if defined(LZO_UNALIGNED_OK_2)
27244+ " LZO_UNALIGNED_OK_2\n"
27245+#endif
27246+#if defined(LZO_UNALIGNED_OK_4)
27247+ " LZO_UNALIGNED_OK_4\n"
27248+#endif
27249+#if defined(LZO_ALIGNED_OK_4)
27250+ " LZO_ALIGNED_OK_4\n"
27251+#endif
27252+#if defined(LZO_DICT_USE_PTR)
27253+ " LZO_DICT_USE_PTR\n"
27254+#endif
27255+#if defined(__LZO_QUERY_COMPRESS)
27256+ " __LZO_QUERY_COMPRESS\n"
27257+#endif
27258+#if defined(__LZO_QUERY_DECOMPRESS)
27259+ " __LZO_QUERY_DECOMPRESS\n"
27260+#endif
27261+#if defined(__LZO_IN_MINILZO)
27262+ " __LZO_IN_MINILZO\n"
27263+#endif
27264+ "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
27265+#if defined(__GNUC__) && defined(__VERSION__)
27266+ " by gcc " __VERSION__
27267+#elif defined(__BORLANDC__)
27268+ " by Borland C " _LZO_MEXPAND(__BORLANDC__)
27269+#elif defined(_MSC_VER)
27270+ " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
27271+#elif defined(__PUREC__)
27272+ " by Pure C " _LZO_MEXPAND(__PUREC__)
27273+#elif defined(__SC__)
27274+ " by Symantec C " _LZO_MEXPAND(__SC__)
27275+#elif defined(__TURBOC__)
27276+ " by Turbo C " _LZO_MEXPAND(__TURBOC__)
27277+#elif defined(__WATCOMC__)
27278+ " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
27279+#endif
27280+ " $\n"
27281+ "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
27282+#endif
27283+
27284+#define LZO_BASE 65521u
27285+#define LZO_NMAX 5552
27286+
27287+#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
27288+#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
27289+#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
27290+#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
27291+#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
27292+
27293+# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
27294+# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
27295+
27296+#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
27297+
27298+static lzo_bool schedule_insns_bug(void);
27299+static lzo_bool strength_reduce_bug(int *);
27300+
27301+# define __lzo_assert(x) ((x) ? 1 : 0)
27302+
27303+#undef COMPILE_TIME_ASSERT
27304+
27305+# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
27306+
27307+static lzo_bool basic_integral_check(void)
27308+{
27309+ lzo_bool r = 1;
27310+
27311+ COMPILE_TIME_ASSERT(CHAR_BIT == 8);
27312+ COMPILE_TIME_ASSERT(sizeof(char) == 1);
27313+ COMPILE_TIME_ASSERT(sizeof(short) >= 2);
27314+ COMPILE_TIME_ASSERT(sizeof(long) >= 4);
27315+ COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
27316+ COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
27317+
27318+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
27319+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
27320+
27321+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
27322+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
27323+#if defined(__LZO_STRICT_16BIT)
27324+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
27325+#else
27326+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
27327+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
27328+#endif
27329+
27330+#if (USHRT_MAX == 65535u)
27331+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
27332+#elif (USHRT_MAX == LZO_0xffffffffL)
27333+ COMPILE_TIME_ASSERT(sizeof(short) == 4);
27334+#elif (USHRT_MAX >= LZO_0xffffffffL)
27335+ COMPILE_TIME_ASSERT(sizeof(short) > 4);
27336+#endif
27337+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
27338+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
27339+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
27340+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
27341+ COMPILE_TIME_ASSERT(IS_SIGNED(short));
27342+ COMPILE_TIME_ASSERT(IS_SIGNED(int));
27343+ COMPILE_TIME_ASSERT(IS_SIGNED(long));
27344+
27345+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
27346+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
27347+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
27348+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
27349+
27350+ COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
27351+ COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
27352+ COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
27353+ COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
27354+ COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
27355+ COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
27356+ LZO_UTYPE_MAX(sizeof(lzo_uint32)));
27357+ COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
27358+
27359+ r &= __lzo_assert(LZO_BYTE(257) == 1);
27360+
27361+ return r;
27362+}
27363+
27364+static lzo_bool basic_ptr_check(void)
27365+{
27366+ lzo_bool r = 1;
27367+
27368+ COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
27369+ COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
27370+
27371+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
27372+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
27373+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
27374+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
27375+
27376+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
27377+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
27378+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
27379+
27380+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
27381+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
27382+
27383+ COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
27384+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
27385+
27386+#if defined(SIZEOF_CHAR_P)
27387+ COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
27388+#endif
27389+#if defined(SIZEOF_PTRDIFF_T)
27390+ COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
27391+#endif
27392+
27393+ COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
27394+ COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
27395+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
27396+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
27397+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
27398+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
27399+
27400+ return r;
27401+}
27402+
27403+static lzo_bool ptr_check(void)
27404+{
27405+ lzo_bool r = 1;
27406+ int i;
27407+ char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
27408+ lzo_bytep wrkmem;
27409+ lzo_bytepp dict;
27410+ unsigned char x[4 * sizeof(lzo_full_align_t)];
27411+ long d;
27412+ lzo_full_align_t a;
27413+ lzo_full_align_t u;
27414+
27415+ for (i = 0; i < (int)sizeof(x); i++)
27416+ x[i] = LZO_BYTE(i);
27417+
27418+ wrkmem =
27419+ LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
27420+
27421+ u.a_lzo_bytep = wrkmem;
27422+ dict = u.a_lzo_bytepp;
27423+
27424+ d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
27425+ r &= __lzo_assert(d >= 0);
27426+ r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
27427+
27428+ memset(&a, 0, sizeof(a));
27429+ r &= __lzo_assert(a.a_lzo_voidp == NULL);
27430+
27431+ memset(&a, 0xff, sizeof(a));
27432+ r &= __lzo_assert(a.a_ushort == USHRT_MAX);
27433+ r &= __lzo_assert(a.a_uint == UINT_MAX);
27434+ r &= __lzo_assert(a.a_ulong == ULONG_MAX);
27435+ r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
27436+ r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
27437+
27438+ if (r == 1) {
27439+ for (i = 0; i < 8; i++)
27440+ r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
27441+ (const
27442+ lzo_voidp)(&wrkmem[i *
27443+ sizeof(lzo_byte
27444+ *)]));
27445+ }
27446+
27447+ memset(&a, 0, sizeof(a));
27448+ r &= __lzo_assert(a.a_char_p == NULL);
27449+ r &= __lzo_assert(a.a_lzo_bytep == NULL);
27450+ r &= __lzo_assert(NULL == (void *)0);
27451+ if (r == 1) {
27452+ for (i = 0; i < 10; i++)
27453+ dict[i] = wrkmem;
27454+ BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
27455+ r &= __lzo_assert(dict[0] == wrkmem);
27456+ for (i = 1; i < 9; i++)
27457+ r &= __lzo_assert(dict[i] == NULL);
27458+ r &= __lzo_assert(dict[9] == wrkmem);
27459+ }
27460+
27461+ if (r == 1) {
27462+ unsigned k = 1;
27463+ const unsigned n = (unsigned)sizeof(lzo_uint32);
27464+ lzo_byte *p0;
27465+ lzo_byte *p1;
27466+
27467+ k += __lzo_align_gap(&x[k], n);
27468+ p0 = (lzo_bytep) & x[k];
27469+#if defined(PTR_LINEAR)
27470+ r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27471+#else
27472+ r &= __lzo_assert(n == 4);
27473+ r &= __lzo_assert(PTR_ALIGNED_4(p0));
27474+#endif
27475+
27476+ r &= __lzo_assert(k >= 1);
27477+ p1 = (lzo_bytep) & x[1];
27478+ r &= __lzo_assert(PTR_GE(p0, p1));
27479+
27480+ r &= __lzo_assert(k < 1 + n);
27481+ p1 = (lzo_bytep) & x[1 + n];
27482+ r &= __lzo_assert(PTR_LT(p0, p1));
27483+
27484+ if (r == 1) {
27485+ lzo_uint32 v0, v1;
27486+
27487+ u.a_uchar_p = &x[k];
27488+ v0 = *u.a_lzo_uint32_p;
27489+ u.a_uchar_p = &x[k + n];
27490+ v1 = *u.a_lzo_uint32_p;
27491+
27492+ r &= __lzo_assert(v0 > 0);
27493+ r &= __lzo_assert(v1 > 0);
27494+ }
27495+ }
27496+
27497+ return r;
27498+}
27499+
27500+static int _lzo_config_check(void)
27501+{
27502+ lzo_bool r = 1;
27503+ int i;
27504+ union {
27505+ lzo_uint32 a;
27506+ unsigned short b;
27507+ lzo_uint32 aa[4];
27508+ unsigned char x[4 * sizeof(lzo_full_align_t)];
27509+ } u;
27510+
27511+ COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27512+ COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27513+ < 0);
27514+
27515+ r &= basic_integral_check();
27516+ r &= basic_ptr_check();
27517+ if (r != 1)
27518+ return LZO_E_ERROR;
27519+
27520+ u.a = 0;
27521+ u.b = 0;
27522+ for (i = 0; i < (int)sizeof(u.x); i++)
27523+ u.x[i] = LZO_BYTE(i);
27524+
27525+#if defined(LZO_BYTE_ORDER)
27526+ if (r == 1) {
27527+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27528+ lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27529+ unsigned short b = (unsigned short)(u.b & 0xffff);
27530+ r &= __lzo_assert(a == 0x03020100L);
27531+ r &= __lzo_assert(b == 0x0100);
27532+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27533+ lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27534+ unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27535+ r &= __lzo_assert(a == 0x00010203L);
27536+ r &= __lzo_assert(b == 0x0001);
27537+# else
27538+# error "invalid LZO_BYTE_ORDER"
27539+# endif
27540+ }
27541+#endif
27542+
27543+#if defined(LZO_UNALIGNED_OK_2)
27544+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
27545+ if (r == 1) {
27546+ unsigned short b[4];
27547+
27548+ for (i = 0; i < 4; i++)
27549+ b[i] = *(const unsigned short *)&u.x[i];
27550+
27551+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27552+ r &= __lzo_assert(b[0] == 0x0100);
27553+ r &= __lzo_assert(b[1] == 0x0201);
27554+ r &= __lzo_assert(b[2] == 0x0302);
27555+ r &= __lzo_assert(b[3] == 0x0403);
27556+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27557+ r &= __lzo_assert(b[0] == 0x0001);
27558+ r &= __lzo_assert(b[1] == 0x0102);
27559+ r &= __lzo_assert(b[2] == 0x0203);
27560+ r &= __lzo_assert(b[3] == 0x0304);
27561+# endif
27562+ }
27563+#endif
27564+
27565+#if defined(LZO_UNALIGNED_OK_4)
27566+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27567+ if (r == 1) {
27568+ lzo_uint32 a[4];
27569+
27570+ for (i = 0; i < 4; i++)
27571+ a[i] = *(const lzo_uint32 *)&u.x[i];
27572+
27573+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27574+ r &= __lzo_assert(a[0] == 0x03020100L);
27575+ r &= __lzo_assert(a[1] == 0x04030201L);
27576+ r &= __lzo_assert(a[2] == 0x05040302L);
27577+ r &= __lzo_assert(a[3] == 0x06050403L);
27578+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27579+ r &= __lzo_assert(a[0] == 0x00010203L);
27580+ r &= __lzo_assert(a[1] == 0x01020304L);
27581+ r &= __lzo_assert(a[2] == 0x02030405L);
27582+ r &= __lzo_assert(a[3] == 0x03040506L);
27583+# endif
27584+ }
27585+#endif
27586+
27587+#if defined(LZO_ALIGNED_OK_4)
27588+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27589+#endif
27590+
27591+ COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27592+
27593+ if (r == 1) {
27594+ r &= __lzo_assert(!schedule_insns_bug());
27595+ }
27596+
27597+ if (r == 1) {
27598+ static int x[3];
27599+ static unsigned xn = 3;
27600+ register unsigned j;
27601+
27602+ for (j = 0; j < xn; j++)
27603+ x[j] = (int)j - 3;
27604+ r &= __lzo_assert(!strength_reduce_bug(x));
27605+ }
27606+
27607+ if (r == 1) {
27608+ r &= ptr_check();
27609+ }
27610+
27611+ return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27612+}
27613+
27614+static lzo_bool schedule_insns_bug(void)
27615+{
27616+#if defined(__LZO_CHECKER)
27617+ return 0;
27618+#else
27619+ const int clone[] = { 1, 2, 0 };
27620+ const int *q;
27621+ q = clone;
27622+ return (*q) ? 0 : 1;
27623+#endif
27624+}
27625+
27626+static lzo_bool strength_reduce_bug(int *x)
27627+{
27628+ return x[0] != -3 || x[1] != -2 || x[2] != -1;
27629+}
27630+
27631+#undef COMPILE_TIME_ASSERT
27632+
27633+int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27634+ int s6, int s7, int s8, int s9)
27635+{
27636+ int r;
27637+
27638+ if (v == 0)
27639+ return LZO_E_ERROR;
27640+
27641+ r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27642+ (s2 == -1 || s2 == (int)sizeof(int)) &&
27643+ (s3 == -1 || s3 == (int)sizeof(long)) &&
27644+ (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27645+ (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27646+ (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27647+ (s7 == -1 || s7 == (int)sizeof(char *)) &&
27648+ (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27649+ (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27650+ if (!r)
27651+ return LZO_E_ERROR;
27652+
27653+ r = _lzo_config_check();
27654+ if (r != LZO_E_OK)
27655+ return r;
27656+
27657+ return r;
27658+}
27659+
27660+#define do_compress _lzo1x_1_do_compress
27661+
27662+#define LZO_NEED_DICT_H
27663+#define D_BITS 14
27664+#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27665+#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27666+
27667+#ifndef __LZO_CONFIG1X_H
27668+#define __LZO_CONFIG1X_H
27669+
27670+#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27671+# define LZO1X
27672+#endif
27673+
27674+#define LZO_EOF_CODE
27675+#undef LZO_DETERMINISTIC
27676+
27677+#define M1_MAX_OFFSET 0x0400
27678+#ifndef M2_MAX_OFFSET
27679+#define M2_MAX_OFFSET 0x0800
27680+#endif
27681+#define M3_MAX_OFFSET 0x4000
27682+#define M4_MAX_OFFSET 0xbfff
27683+
27684+#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27685+
27686+#define M1_MIN_LEN 2
27687+#define M1_MAX_LEN 2
27688+#define M2_MIN_LEN 3
27689+#ifndef M2_MAX_LEN
27690+#define M2_MAX_LEN 8
27691+#endif
27692+#define M3_MIN_LEN 3
27693+#define M3_MAX_LEN 33
27694+#define M4_MIN_LEN 3
27695+#define M4_MAX_LEN 9
27696+
27697+#define M1_MARKER 0
27698+#define M2_MARKER 64
27699+#define M3_MARKER 32
27700+#define M4_MARKER 16
27701+
27702+#ifndef MIN_LOOKAHEAD
27703+#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27704+#endif
27705+
27706+#if defined(LZO_NEED_DICT_H)
27707+
27708+#ifndef LZO_HASH
27709+#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27710+#endif
27711+#define DL_MIN_LEN M2_MIN_LEN
27712+
27713+#ifndef __LZO_DICT_H
27714+#define __LZO_DICT_H
27715+
27716+#if !defined(D_BITS) && defined(DBITS)
27717+# define D_BITS DBITS
27718+#endif
27719+#if !defined(D_BITS)
27720+# error "D_BITS is not defined"
27721+#endif
27722+#if (D_BITS < 16)
27723+# define D_SIZE LZO_SIZE(D_BITS)
27724+# define D_MASK LZO_MASK(D_BITS)
27725+#else
27726+# define D_SIZE LZO_USIZE(D_BITS)
27727+# define D_MASK LZO_UMASK(D_BITS)
27728+#endif
27729+#define D_HIGH ((D_MASK >> 1) + 1)
27730+
27731+#if !defined(DD_BITS)
27732+# define DD_BITS 0
27733+#endif
27734+#define DD_SIZE LZO_SIZE(DD_BITS)
27735+#define DD_MASK LZO_MASK(DD_BITS)
27736+
27737+#if !defined(DL_BITS)
27738+# define DL_BITS (D_BITS - DD_BITS)
27739+#endif
27740+#if (DL_BITS < 16)
27741+# define DL_SIZE LZO_SIZE(DL_BITS)
27742+# define DL_MASK LZO_MASK(DL_BITS)
27743+#else
27744+# define DL_SIZE LZO_USIZE(DL_BITS)
27745+# define DL_MASK LZO_UMASK(DL_BITS)
27746+#endif
27747+
27748+#if (D_BITS != DL_BITS + DD_BITS)
27749+# error "D_BITS does not match"
27750+#endif
27751+#if (D_BITS < 8 || D_BITS > 18)
27752+# error "invalid D_BITS"
27753+#endif
27754+#if (DL_BITS < 8 || DL_BITS > 20)
27755+# error "invalid DL_BITS"
27756+#endif
27757+#if (DD_BITS < 0 || DD_BITS > 6)
27758+# error "invalid DD_BITS"
27759+#endif
27760+
27761+#if !defined(DL_MIN_LEN)
27762+# define DL_MIN_LEN 3
27763+#endif
27764+#if !defined(DL_SHIFT)
27765+# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27766+#endif
27767+
27768+#define LZO_HASH_GZIP 1
27769+#define LZO_HASH_GZIP_INCREMENTAL 2
27770+#define LZO_HASH_LZO_INCREMENTAL_A 3
27771+#define LZO_HASH_LZO_INCREMENTAL_B 4
27772+
27773+#if !defined(LZO_HASH)
27774+# error "choose a hashing strategy"
27775+#endif
27776+
27777+#if (DL_MIN_LEN == 3)
27778+# define _DV2_A(p,shift1,shift2) \
27779+ (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27780+# define _DV2_B(p,shift1,shift2) \
27781+ (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27782+# define _DV3_B(p,shift1,shift2,shift3) \
27783+ ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27784+#elif (DL_MIN_LEN == 2)
27785+# define _DV2_A(p,shift1,shift2) \
27786+ (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27787+# define _DV2_B(p,shift1,shift2) \
27788+ (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27789+#else
27790+# error "invalid DL_MIN_LEN"
27791+#endif
27792+#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27793+#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27794+#define DA2(p,s1,s2) \
27795+ (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27796+#define DS2(p,s1,s2) \
27797+ (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27798+#define DX2(p,s1,s2) \
27799+ (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27800+#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27801+#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27802+#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27803+#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27804+#define DM(v) DMS(v,0)
27805+
27806+#if (LZO_HASH == LZO_HASH_GZIP)
27807+# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27808+
27809+#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27810+# define __LZO_HASH_INCREMENTAL
27811+# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27812+# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27813+# define _DINDEX(dv,p) (dv)
27814+# define DVAL_LOOKAHEAD DL_MIN_LEN
27815+
27816+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27817+# define __LZO_HASH_INCREMENTAL
27818+# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27819+# define DVAL_NEXT(dv,p) \
27820+ dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27821+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27822+# define DVAL_LOOKAHEAD DL_MIN_LEN
27823+
27824+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27825+# define __LZO_HASH_INCREMENTAL
27826+# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27827+# define DVAL_NEXT(dv,p) \
27828+ dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27829+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27830+# define DVAL_LOOKAHEAD DL_MIN_LEN
27831+
27832+#else
27833+# error "choose a hashing strategy"
27834+#endif
27835+
27836+#ifndef DINDEX
27837+#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27838+#endif
27839+#if !defined(DINDEX1) && defined(D_INDEX1)
27840+#define DINDEX1 D_INDEX1
27841+#endif
27842+#if !defined(DINDEX2) && defined(D_INDEX2)
27843+#define DINDEX2 D_INDEX2
27844+#endif
27845+
27846+#if !defined(__LZO_HASH_INCREMENTAL)
27847+# define DVAL_FIRST(dv,p) ((void) 0)
27848+# define DVAL_NEXT(dv,p) ((void) 0)
27849+# define DVAL_LOOKAHEAD 0
27850+#endif
27851+
27852+#if !defined(DVAL_ASSERT)
27853+#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27854+static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p)
27855+{
27856+ lzo_uint32 df;
27857+ DVAL_FIRST(df, (p));
27858+ assert(DINDEX(dv, p) == DINDEX(df, p));
27859+}
27860+#else
27861+# define DVAL_ASSERT(dv,p) ((void) 0)
27862+#endif
27863+#endif
27864+
27865+# define DENTRY(p,in) (p)
27866+# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27867+
27868+#if (DD_BITS == 0)
27869+
27870+# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27871+# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27872+# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27873+
27874+#else
27875+
27876+# define UPDATE_D(dict,drun,dv,p,in) \
27877+ dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27878+# define UPDATE_I(dict,drun,index,p,in) \
27879+ dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27880+# define UPDATE_P(ptr,drun,p,in) \
27881+ (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
27882+
27883+#endif
27884+
27885+#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
27886+ (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
27887+
27888+#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
27889+ (BOUNDS_CHECKING_OFF_IN_EXPR( \
27890+ (PTR_LT(m_pos,in) || \
27891+ (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
27892+ m_off > max_offset) ))
27893+
27894+#if defined(LZO_DETERMINISTIC)
27895+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
27896+#else
27897+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
27898+#endif
27899+#endif
27900+#endif
27901+#endif
27902+#define DO_COMPRESS lzo1x_1_compress
27903+static
27904+lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
27905+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27906+{
27907+ register const lzo_byte *ip;
27908+ lzo_byte *op;
27909+ const lzo_byte *const in_end = in + in_len;
27910+ const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
27911+ const lzo_byte *ii;
27912+ lzo_dict_p const dict = (lzo_dict_p) wrkmem;
27913+
27914+ op = out;
27915+ ip = in;
27916+ ii = ip;
27917+
27918+ ip += 4;
27919+ for (;;) {
27920+ register const lzo_byte *m_pos;
27921+
27922+ lzo_moff_t m_off;
27923+ lzo_uint m_len;
27924+ lzo_uint dindex;
27925+
27926+ DINDEX1(dindex, ip);
27927+ GINDEX(m_pos, m_off, dict, dindex, in);
27928+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27929+ goto literal;
27930+#if 1
27931+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27932+ goto try_match;
27933+ DINDEX2(dindex, ip);
27934+#endif
27935+ GINDEX(m_pos, m_off, dict, dindex, in);
27936+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27937+ goto literal;
27938+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27939+ goto try_match;
27940+ goto literal;
27941+
27942+ try_match:
27943+#if 1 && defined(LZO_UNALIGNED_OK_2)
27944+ if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
27945+#else
27946+ if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
27947+#endif
27948+ ;
27949+ } else {
27950+ if (m_pos[2] == ip[2]) {
27951+ goto match;
27952+ } else {
27953+ ;
27954+ }
27955+ }
27956+
27957+ literal:
27958+ UPDATE_I(dict, 0, dindex, ip, in);
27959+ ++ip;
27960+ if (ip >= ip_end)
27961+ break;
27962+ continue;
27963+
27964+ match:
27965+ UPDATE_I(dict, 0, dindex, ip, in);
27966+ if (pd(ip, ii) > 0) {
27967+ register lzo_uint t = pd(ip, ii);
27968+
27969+ if (t <= 3) {
27970+ assert("lzo-04", op - 2 > out);
27971+ op[-2] |= LZO_BYTE(t);
27972+ } else if (t <= 18)
27973+ *op++ = LZO_BYTE(t - 3);
27974+ else {
27975+ register lzo_uint tt = t - 18;
27976+
27977+ *op++ = 0;
27978+ while (tt > 255) {
27979+ tt -= 255;
27980+ *op++ = 0;
27981+ }
27982+ assert("lzo-05", tt > 0);
27983+ *op++ = LZO_BYTE(tt);
27984+ }
27985+ do
27986+ *op++ = *ii++;
27987+ while (--t > 0);
27988+ }
27989+
27990+ assert("lzo-06", ii == ip);
27991+ ip += 3;
27992+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
27993+ || m_pos[6] != *ip++ || m_pos[7] != *ip++
27994+ || m_pos[8] != *ip++
27995+#ifdef LZO1Y
27996+ || m_pos[9] != *ip++ || m_pos[10] != *ip++
27997+ || m_pos[11] != *ip++ || m_pos[12] != *ip++
27998+ || m_pos[13] != *ip++ || m_pos[14] != *ip++
27999+#endif
28000+ ) {
28001+ --ip;
28002+ m_len = ip - ii;
28003+ assert("lzo-07", m_len >= 3);
28004+ assert("lzo-08", m_len <= M2_MAX_LEN);
28005+
28006+ if (m_off <= M2_MAX_OFFSET) {
28007+ m_off -= 1;
28008+#if defined(LZO1X)
28009+ *op++ =
28010+ LZO_BYTE(((m_len -
28011+ 1) << 5) | ((m_off & 7) << 2));
28012+ *op++ = LZO_BYTE(m_off >> 3);
28013+#elif defined(LZO1Y)
28014+ *op++ =
28015+ LZO_BYTE(((m_len +
28016+ 1) << 4) | ((m_off & 3) << 2));
28017+ *op++ = LZO_BYTE(m_off >> 2);
28018+#endif
28019+ } else if (m_off <= M3_MAX_OFFSET) {
28020+ m_off -= 1;
28021+ *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
28022+ goto m3_m4_offset;
28023+ } else
28024+#if defined(LZO1X)
28025+ {
28026+ m_off -= 0x4000;
28027+ assert("lzo-09", m_off > 0);
28028+ assert("lzo-10", m_off <= 0x7fff);
28029+ *op++ = LZO_BYTE(M4_MARKER |
28030+ ((m_off & 0x4000) >> 11) |
28031+ (m_len - 2));
28032+ goto m3_m4_offset;
28033+ }
28034+#elif defined(LZO1Y)
28035+ goto m4_match;
28036+#endif
28037+ } else {
28038+ {
28039+ const lzo_byte *end = in_end;
28040+ const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
28041+ while (ip < end && *m == *ip)
28042+ m++, ip++;
28043+ m_len = (ip - ii);
28044+ }
28045+ assert("lzo-11", m_len > M2_MAX_LEN);
28046+
28047+ if (m_off <= M3_MAX_OFFSET) {
28048+ m_off -= 1;
28049+ if (m_len <= 33)
28050+ *op++ =
28051+ LZO_BYTE(M3_MARKER | (m_len - 2));
28052+ else {
28053+ m_len -= 33;
28054+ *op++ = M3_MARKER | 0;
28055+ goto m3_m4_len;
28056+ }
28057+ } else {
28058+#if defined(LZO1Y)
28059+ m4_match:
28060+#endif
28061+ m_off -= 0x4000;
28062+ assert("lzo-12", m_off > 0);
28063+ assert("lzo-13", m_off <= 0x7fff);
28064+ if (m_len <= M4_MAX_LEN)
28065+ *op++ = LZO_BYTE(M4_MARKER |
28066+ ((m_off & 0x4000) >>
28067+ 11) | (m_len - 2));
28068+ else {
28069+ m_len -= M4_MAX_LEN;
28070+ *op++ =
28071+ LZO_BYTE(M4_MARKER |
28072+ ((m_off & 0x4000) >> 11));
28073+ m3_m4_len:
28074+ while (m_len > 255) {
28075+ m_len -= 255;
28076+ *op++ = 0;
28077+ }
28078+ assert("lzo-14", m_len > 0);
28079+ *op++ = LZO_BYTE(m_len);
28080+ }
28081+ }
28082+
28083+ m3_m4_offset:
28084+ *op++ = LZO_BYTE((m_off & 63) << 2);
28085+ *op++ = LZO_BYTE(m_off >> 6);
28086+ }
28087+
28088+ ii = ip;
28089+ if (ip >= ip_end)
28090+ break;
28091+ }
28092+
28093+ *out_len = op - out;
28094+ return pd(in_end, ii);
28095+}
28096+
28097+int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
28098+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28099+{
28100+ lzo_byte *op = out;
28101+ lzo_uint t;
28102+
28103+#if defined(__LZO_QUERY_COMPRESS)
28104+ if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28105+ return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
28106+ D_SIZE, lzo_sizeof(lzo_dict_t));
28107+#endif
28108+
28109+ if (in_len <= M2_MAX_LEN + 5)
28110+ t = in_len;
28111+ else {
28112+ t = do_compress(in, in_len, op, out_len, wrkmem);
28113+ op += *out_len;
28114+ }
28115+
28116+ if (t > 0) {
28117+ const lzo_byte *ii = in + in_len - t;
28118+
28119+ if (op == out && t <= 238)
28120+ *op++ = LZO_BYTE(17 + t);
28121+ else if (t <= 3)
28122+ op[-2] |= LZO_BYTE(t);
28123+ else if (t <= 18)
28124+ *op++ = LZO_BYTE(t - 3);
28125+ else {
28126+ lzo_uint tt = t - 18;
28127+
28128+ *op++ = 0;
28129+ while (tt > 255) {
28130+ tt -= 255;
28131+ *op++ = 0;
28132+ }
28133+ assert("lzo-15", tt > 0);
28134+ *op++ = LZO_BYTE(tt);
28135+ }
28136+ do
28137+ *op++ = *ii++;
28138+ while (--t > 0);
28139+ }
28140+
28141+ *op++ = M4_MARKER | 1;
28142+ *op++ = 0;
28143+ *op++ = 0;
28144+
28145+ *out_len = op - out;
28146+ return LZO_E_OK;
28147+}
28148+
28149+#undef do_compress
28150+#undef DO_COMPRESS
28151+#undef LZO_HASH
28152+
28153+#undef LZO_TEST_DECOMPRESS_OVERRUN
28154+#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
28155+#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
28156+#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28157+#undef DO_DECOMPRESS
28158+#define DO_DECOMPRESS lzo1x_decompress
28159+
28160+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28161+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28162+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28163+# endif
28164+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28165+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28166+# endif
28167+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28168+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28169+# endif
28170+#endif
28171+
28172+#undef TEST_IP
28173+#undef TEST_OP
28174+#undef TEST_LOOKBEHIND
28175+#undef NEED_IP
28176+#undef NEED_OP
28177+#undef HAVE_TEST_IP
28178+#undef HAVE_TEST_OP
28179+#undef HAVE_NEED_IP
28180+#undef HAVE_NEED_OP
28181+#undef HAVE_ANY_IP
28182+#undef HAVE_ANY_OP
28183+
28184+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28185+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28186+# define TEST_IP (ip < ip_end)
28187+# endif
28188+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28189+# define NEED_IP(x) \
28190+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28191+# endif
28192+#endif
28193+
28194+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28195+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28196+# define TEST_OP (op <= op_end)
28197+# endif
28198+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28199+# undef TEST_OP
28200+# define NEED_OP(x) \
28201+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28202+# endif
28203+#endif
28204+
28205+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28206+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28207+#else
28208+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28209+#endif
28210+
28211+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28212+# define TEST_IP (ip < ip_end)
28213+#endif
28214+
28215+#if defined(TEST_IP)
28216+# define HAVE_TEST_IP
28217+#else
28218+# define TEST_IP 1
28219+#endif
28220+#if defined(TEST_OP)
28221+# define HAVE_TEST_OP
28222+#else
28223+# define TEST_OP 1
28224+#endif
28225+
28226+#if defined(NEED_IP)
28227+# define HAVE_NEED_IP
28228+#else
28229+# define NEED_IP(x) ((void) 0)
28230+#endif
28231+#if defined(NEED_OP)
28232+# define HAVE_NEED_OP
28233+#else
28234+# define NEED_OP(x) ((void) 0)
28235+#endif
28236+
28237+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28238+# define HAVE_ANY_IP
28239+#endif
28240+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28241+# define HAVE_ANY_OP
28242+#endif
28243+
28244+#undef __COPY4
28245+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28246+
28247+#undef COPY4
28248+#if defined(LZO_UNALIGNED_OK_4)
28249+# define COPY4(dst,src) __COPY4(dst,src)
28250+#elif defined(LZO_ALIGNED_OK_4)
28251+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28252+#endif
28253+
28254+#if defined(DO_DECOMPRESS)
28255+int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
28256+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28257+#endif
28258+{
28259+ register lzo_byte *op;
28260+ register const lzo_byte *ip;
28261+ register lzo_uint t;
28262+#if defined(COPY_DICT)
28263+ lzo_uint m_off;
28264+ const lzo_byte *dict_end;
28265+#else
28266+ register const lzo_byte *m_pos;
28267+#endif
28268+
28269+ const lzo_byte *const ip_end = in + in_len;
28270+#if defined(HAVE_ANY_OP)
28271+ lzo_byte *const op_end = out + *out_len;
28272+#endif
28273+#if defined(LZO1Z)
28274+ lzo_uint last_m_off = 0;
28275+#endif
28276+
28277+ LZO_UNUSED(wrkmem);
28278+
28279+#if defined(__LZO_QUERY_DECOMPRESS)
28280+ if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28281+ return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
28282+ 0, 0);
28283+#endif
28284+
28285+#if defined(COPY_DICT)
28286+ if (dict) {
28287+ if (dict_len > M4_MAX_OFFSET) {
28288+ dict += dict_len - M4_MAX_OFFSET;
28289+ dict_len = M4_MAX_OFFSET;
28290+ }
28291+ dict_end = dict + dict_len;
28292+ } else {
28293+ dict_len = 0;
28294+ dict_end = NULL;
28295+ }
28296+#endif
28297+
28298+ *out_len = 0;
28299+
28300+ op = out;
28301+ ip = in;
28302+
28303+ if (*ip > 17) {
28304+ t = *ip++ - 17;
28305+ if (t < 4)
28306+ goto match_next;
28307+ assert("lzo-16", t > 0);
28308+ NEED_OP(t);
28309+ NEED_IP(t + 1);
28310+ do
28311+ *op++ = *ip++;
28312+ while (--t > 0);
28313+ goto first_literal_run;
28314+ }
28315+
28316+ while (TEST_IP && TEST_OP) {
28317+ t = *ip++;
28318+ if (t >= 16)
28319+ goto match;
28320+ if (t == 0) {
28321+ NEED_IP(1);
28322+ while (*ip == 0) {
28323+ t += 255;
28324+ ip++;
28325+ NEED_IP(1);
28326+ }
28327+ t += 15 + *ip++;
28328+ }
28329+ assert("lzo-17", t > 0);
28330+ NEED_OP(t + 3);
28331+ NEED_IP(t + 4);
28332+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28333+#if !defined(LZO_UNALIGNED_OK_4)
28334+ if (PTR_ALIGNED2_4(op, ip)) {
28335+#endif
28336+ COPY4(op, ip);
28337+ op += 4;
28338+ ip += 4;
28339+ if (--t > 0) {
28340+ if (t >= 4) {
28341+ do {
28342+ COPY4(op, ip);
28343+ op += 4;
28344+ ip += 4;
28345+ t -= 4;
28346+ } while (t >= 4);
28347+ if (t > 0)
28348+ do
28349+ *op++ = *ip++;
28350+ while (--t > 0);
28351+ } else
28352+ do
28353+ *op++ = *ip++;
28354+ while (--t > 0);
28355+ }
28356+#if !defined(LZO_UNALIGNED_OK_4)
28357+ } else
28358+#endif
28359+#endif
28360+#if !defined(LZO_UNALIGNED_OK_4)
28361+ {
28362+ *op++ = *ip++;
28363+ *op++ = *ip++;
28364+ *op++ = *ip++;
28365+ do
28366+ *op++ = *ip++;
28367+ while (--t > 0);
28368+ }
28369+#endif
28370+
28371+ first_literal_run:
28372+
28373+ t = *ip++;
28374+ if (t >= 16)
28375+ goto match;
28376+#if defined(COPY_DICT)
28377+#if defined(LZO1Z)
28378+ m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28379+ last_m_off = m_off;
28380+#else
28381+ m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
28382+#endif
28383+ NEED_OP(3);
28384+ t = 3;
28385+ COPY_DICT(t, m_off)
28386+#else
28387+#if defined(LZO1Z)
28388+ t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28389+ m_pos = op - t;
28390+ last_m_off = t;
28391+#else
28392+ m_pos = op - (1 + M2_MAX_OFFSET);
28393+ m_pos -= t >> 2;
28394+ m_pos -= *ip++ << 2;
28395+#endif
28396+ TEST_LOOKBEHIND(m_pos, out);
28397+ NEED_OP(3);
28398+ *op++ = *m_pos++;
28399+ *op++ = *m_pos++;
28400+ *op++ = *m_pos;
28401+#endif
28402+ goto match_done;
28403+
28404+ while (TEST_IP && TEST_OP) {
28405+ match:
28406+ if (t >= 64) {
28407+#if defined(COPY_DICT)
28408+#if defined(LZO1X)
28409+ m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
28410+ t = (t >> 5) - 1;
28411+#elif defined(LZO1Y)
28412+ m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
28413+ t = (t >> 4) - 3;
28414+#elif defined(LZO1Z)
28415+ m_off = t & 0x1f;
28416+ if (m_off >= 0x1c)
28417+ m_off = last_m_off;
28418+ else {
28419+ m_off = 1 + (m_off << 6) + (*ip++ >> 2);
28420+ last_m_off = m_off;
28421+ }
28422+ t = (t >> 5) - 1;
28423+#endif
28424+#else
28425+#if defined(LZO1X)
28426+ m_pos = op - 1;
28427+ m_pos -= (t >> 2) & 7;
28428+ m_pos -= *ip++ << 3;
28429+ t = (t >> 5) - 1;
28430+#elif defined(LZO1Y)
28431+ m_pos = op - 1;
28432+ m_pos -= (t >> 2) & 3;
28433+ m_pos -= *ip++ << 2;
28434+ t = (t >> 4) - 3;
28435+#elif defined(LZO1Z)
28436+ {
28437+ lzo_uint off = t & 0x1f;
28438+ m_pos = op;
28439+ if (off >= 0x1c) {
28440+ assert(last_m_off > 0);
28441+ m_pos -= last_m_off;
28442+ } else {
28443+ off =
28444+ 1 + (off << 6) +
28445+ (*ip++ >> 2);
28446+ m_pos -= off;
28447+ last_m_off = off;
28448+ }
28449+ }
28450+ t = (t >> 5) - 1;
28451+#endif
28452+ TEST_LOOKBEHIND(m_pos, out);
28453+ assert("lzo-18", t > 0);
28454+ NEED_OP(t + 3 - 1);
28455+ goto copy_match;
28456+#endif
28457+ } else if (t >= 32) {
28458+ t &= 31;
28459+ if (t == 0) {
28460+ NEED_IP(1);
28461+ while (*ip == 0) {
28462+ t += 255;
28463+ ip++;
28464+ NEED_IP(1);
28465+ }
28466+ t += 31 + *ip++;
28467+ }
28468+#if defined(COPY_DICT)
28469+#if defined(LZO1Z)
28470+ m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28471+ last_m_off = m_off;
28472+#else
28473+ m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28474+#endif
28475+#else
28476+#if defined(LZO1Z)
28477+ {
28478+ lzo_uint off =
28479+ 1 + (ip[0] << 6) + (ip[1] >> 2);
28480+ m_pos = op - off;
28481+ last_m_off = off;
28482+ }
28483+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28484+ m_pos = op - 1;
28485+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28486+#else
28487+ m_pos = op - 1;
28488+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28489+#endif
28490+#endif
28491+ ip += 2;
28492+ } else if (t >= 16) {
28493+#if defined(COPY_DICT)
28494+ m_off = (t & 8) << 11;
28495+#else
28496+ m_pos = op;
28497+ m_pos -= (t & 8) << 11;
28498+#endif
28499+ t &= 7;
28500+ if (t == 0) {
28501+ NEED_IP(1);
28502+ while (*ip == 0) {
28503+ t += 255;
28504+ ip++;
28505+ NEED_IP(1);
28506+ }
28507+ t += 7 + *ip++;
28508+ }
28509+#if defined(COPY_DICT)
28510+#if defined(LZO1Z)
28511+ m_off += (ip[0] << 6) + (ip[1] >> 2);
28512+#else
28513+ m_off += (ip[0] >> 2) + (ip[1] << 6);
28514+#endif
28515+ ip += 2;
28516+ if (m_off == 0)
28517+ goto eof_found;
28518+ m_off += 0x4000;
28519+#if defined(LZO1Z)
28520+ last_m_off = m_off;
28521+#endif
28522+#else
28523+#if defined(LZO1Z)
28524+ m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28525+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28526+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
28527+#else
28528+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28529+#endif
28530+ ip += 2;
28531+ if (m_pos == op)
28532+ goto eof_found;
28533+ m_pos -= 0x4000;
28534+#if defined(LZO1Z)
28535+ last_m_off = op - m_pos;
28536+#endif
28537+#endif
28538+ } else {
28539+#if defined(COPY_DICT)
28540+#if defined(LZO1Z)
28541+ m_off = 1 + (t << 6) + (*ip++ >> 2);
28542+ last_m_off = m_off;
28543+#else
28544+ m_off = 1 + (t >> 2) + (*ip++ << 2);
28545+#endif
28546+ NEED_OP(2);
28547+ t = 2;
28548+ COPY_DICT(t, m_off)
28549+#else
28550+#if defined(LZO1Z)
28551+ t = 1 + (t << 6) + (*ip++ >> 2);
28552+ m_pos = op - t;
28553+ last_m_off = t;
28554+#else
28555+ m_pos = op - 1;
28556+ m_pos -= t >> 2;
28557+ m_pos -= *ip++ << 2;
28558+#endif
28559+ TEST_LOOKBEHIND(m_pos, out);
28560+ NEED_OP(2);
28561+ *op++ = *m_pos++;
28562+ *op++ = *m_pos;
28563+#endif
28564+ goto match_done;
28565+ }
28566+
28567+#if defined(COPY_DICT)
28568+
28569+ NEED_OP(t + 3 - 1);
28570+ t += 3 - 1;
28571+ COPY_DICT(t, m_off)
28572+#else
28573+
28574+ TEST_LOOKBEHIND(m_pos, out);
28575+ assert("lzo-19", t > 0);
28576+ NEED_OP(t + 3 - 1);
28577+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28578+#if !defined(LZO_UNALIGNED_OK_4)
28579+ if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28580+ assert((op - m_pos) >= 4);
28581+#else
28582+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28583+#endif
28584+ COPY4(op, m_pos);
28585+ op += 4;
28586+ m_pos += 4;
28587+ t -= 4 - (3 - 1);
28588+ do {
28589+ COPY4(op, m_pos);
28590+ op += 4;
28591+ m_pos += 4;
28592+ t -= 4;
28593+ } while (t >= 4);
28594+ if (t > 0)
28595+ do
28596+ *op++ = *m_pos++;
28597+ while (--t > 0);
28598+ } else
28599+#endif
28600+ {
28601+ copy_match:
28602+ *op++ = *m_pos++;
28603+ *op++ = *m_pos++;
28604+ do
28605+ *op++ = *m_pos++;
28606+ while (--t > 0);
28607+ }
28608+
28609+#endif
28610+
28611+ match_done:
28612+#if defined(LZO1Z)
28613+ t = ip[-1] & 3;
28614+#else
28615+ t = ip[-2] & 3;
28616+#endif
28617+ if (t == 0)
28618+ break;
28619+
28620+ match_next:
28621+ assert("lzo-20", t > 0);
28622+ NEED_OP(t);
28623+ NEED_IP(t + 1);
28624+ do
28625+ *op++ = *ip++;
28626+ while (--t > 0);
28627+ t = *ip++;
28628+ }
28629+ }
28630+
28631+#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28632+ *out_len = op - out;
28633+ return LZO_E_EOF_NOT_FOUND;
28634+#endif
28635+
28636+ eof_found:
28637+ assert("lzo-21", t == 1);
28638+ *out_len = op - out;
28639+ return (ip == ip_end ? LZO_E_OK :
28640+ (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28641+
28642+#if defined(HAVE_NEED_IP)
28643+ input_overrun:
28644+ *out_len = op - out;
28645+ return LZO_E_INPUT_OVERRUN;
28646+#endif
28647+
28648+#if defined(HAVE_NEED_OP)
28649+ output_overrun:
28650+ *out_len = op - out;
28651+ return LZO_E_OUTPUT_OVERRUN;
28652+#endif
28653+
28654+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28655+ lookbehind_overrun:
28656+ *out_len = op - out;
28657+ return LZO_E_LOOKBEHIND_OVERRUN;
28658+#endif
28659+}
28660+
28661+#define LZO_TEST_DECOMPRESS_OVERRUN
28662+#undef DO_DECOMPRESS
28663+#define DO_DECOMPRESS lzo1x_decompress_safe
28664+
28665+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28666+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28667+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28668+# endif
28669+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28670+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28671+# endif
28672+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28673+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28674+# endif
28675+#endif
28676+
28677+#undef TEST_IP
28678+#undef TEST_OP
28679+#undef TEST_LOOKBEHIND
28680+#undef NEED_IP
28681+#undef NEED_OP
28682+#undef HAVE_TEST_IP
28683+#undef HAVE_TEST_OP
28684+#undef HAVE_NEED_IP
28685+#undef HAVE_NEED_OP
28686+#undef HAVE_ANY_IP
28687+#undef HAVE_ANY_OP
28688+
28689+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28690+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28691+# define TEST_IP (ip < ip_end)
28692+# endif
28693+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28694+# define NEED_IP(x) \
28695+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28696+# endif
28697+#endif
28698+
28699+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28700+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28701+# define TEST_OP (op <= op_end)
28702+# endif
28703+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28704+# undef TEST_OP
28705+# define NEED_OP(x) \
28706+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28707+# endif
28708+#endif
28709+
28710+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28711+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28712+#else
28713+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28714+#endif
28715+
28716+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28717+# define TEST_IP (ip < ip_end)
28718+#endif
28719+
28720+#if defined(TEST_IP)
28721+# define HAVE_TEST_IP
28722+#else
28723+# define TEST_IP 1
28724+#endif
28725+#if defined(TEST_OP)
28726+# define HAVE_TEST_OP
28727+#else
28728+# define TEST_OP 1
28729+#endif
28730+
28731+#if defined(NEED_IP)
28732+# define HAVE_NEED_IP
28733+#else
28734+# define NEED_IP(x) ((void) 0)
28735+#endif
28736+#if defined(NEED_OP)
28737+# define HAVE_NEED_OP
28738+#else
28739+# define NEED_OP(x) ((void) 0)
28740+#endif
28741+
28742+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28743+# define HAVE_ANY_IP
28744+#endif
28745+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28746+# define HAVE_ANY_OP
28747+#endif
28748+
28749+#undef __COPY4
28750+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28751+
28752+#undef COPY4
28753+#if defined(LZO_UNALIGNED_OK_4)
28754+# define COPY4(dst,src) __COPY4(dst,src)
28755+#elif defined(LZO_ALIGNED_OK_4)
28756+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28757+#endif
28758+
28759+/***** End of minilzo.c *****/
28760diff --git a/fs/reiser4/plugin/compress/minilzo.h b/fs/reiser4/plugin/compress/minilzo.h
28761new file mode 100644
28762index 0000000..6a47001
28763--- /dev/null
28764+++ b/fs/reiser4/plugin/compress/minilzo.h
28765@@ -0,0 +1,70 @@
28766+/* minilzo.h -- mini subset of the LZO real-time data compression library
28767+ adopted for reiser4 compression transform plugin.
28768+
28769+ This file is part of the LZO real-time data compression library
28770+ and not included in any proprietary licenses of reiser4.
28771+
28772+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28773+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28774+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28775+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28776+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28777+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28778+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28779+ All Rights Reserved.
28780+
28781+ The LZO library is free software; you can redistribute it and/or
28782+ modify it under the terms of the GNU General Public License as
28783+ published by the Free Software Foundation; either version 2 of
28784+ the License, or (at your option) any later version.
28785+
28786+ The LZO library is distributed in the hope that it will be useful,
28787+ but WITHOUT ANY WARRANTY; without even the implied warranty of
28788+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28789+ GNU General Public License for more details.
28790+
28791+ You should have received a copy of the GNU General Public License
28792+ along with the LZO library; see the file COPYING.
28793+ If not, write to the Free Software Foundation, Inc.,
28794+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28795+
28796+ Markus F.X.J. Oberhumer
28797+ <markus@oberhumer.com>
28798+ http://www.oberhumer.com/opensource/lzo/
28799+ */
28800+
28801+/*
28802+ * NOTE:
28803+ * the full LZO package can be found at
28804+ * http://www.oberhumer.com/opensource/lzo/
28805+ */
28806+
28807+#ifndef __MINILZO_H
28808+#define __MINILZO_H
28809+
28810+#define MINILZO_VERSION 0x1080
28811+
28812+#include "lzoconf.h"
28813+
28814+/* Memory required for the wrkmem parameter.
28815+ * When the required size is 0, you can also pass a NULL pointer.
28816+ */
28817+
28818+#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28819+#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28820+#define LZO1X_MEM_DECOMPRESS (0)
28821+
28822+/* compression */
28823+extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28824+ lzo_byte * dst, lzo_uintp dst_len,
28825+ lzo_voidp wrkmem);
28826+/* decompression */
28827+extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28828+ lzo_byte * dst, lzo_uintp dst_len,
28829+ lzo_voidp wrkmem /* NOT USED */);
28830+/* safe decompression with overrun testing */
28831+extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
28832+ lzo_byte * dst, lzo_uintp dst_len,
28833+ lzo_voidp wrkmem /* NOT USED */ );
28834+
28835+#endif /* already included */
28836diff --git a/fs/reiser4/plugin/crypto/cipher.c b/fs/reiser4/plugin/crypto/cipher.c
28837new file mode 100644
28838index 0000000..e918154
28839--- /dev/null
28840+++ b/fs/reiser4/plugin/crypto/cipher.c
28841@@ -0,0 +1,37 @@
28842+/* Copyright 2001, 2002, 2003 by Hans Reiser,
28843+ licensing governed by reiser4/README */
28844+/* Reiser4 cipher transform plugins */
28845+
28846+#include "../../debug.h"
28847+#include "../plugin.h"
28848+
28849+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
28850+ [NONE_CIPHER_ID] = {
28851+ .h = {
28852+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
28853+ .id = NONE_CIPHER_ID,
28854+ .pops = NULL,
28855+ .label = "none",
28856+ .desc = "no cipher transform",
28857+ .linkage = {NULL, NULL}
28858+ },
28859+ .alloc = NULL,
28860+ .free = NULL,
28861+ .scale = NULL,
28862+ .align_stream = NULL,
28863+ .setkey = NULL,
28864+ .encrypt = NULL,
28865+ .decrypt = NULL
28866+ }
28867+};
28868+
28869+/* Make Linus happy.
28870+ Local variables:
28871+ c-indentation-style: "K&R"
28872+ mode-name: "LC"
28873+ c-basic-offset: 8
28874+ tab-width: 8
28875+ fill-column: 120
28876+ scroll-step: 1
28877+ End:
28878+*/
28879diff --git a/fs/reiser4/plugin/crypto/cipher.h b/fs/reiser4/plugin/crypto/cipher.h
28880new file mode 100644
28881index 0000000..e896c67
28882--- /dev/null
28883+++ b/fs/reiser4/plugin/crypto/cipher.h
28884@@ -0,0 +1,55 @@
28885+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28886+/* This file contains definitions for the objects operated
28887+ by reiser4 key manager, which is something like keyring
28888+ wrapped by appropriate reiser4 plugin */
28889+
28890+#if !defined( __FS_REISER4_CRYPT_H__ )
28891+#define __FS_REISER4_CRYPT_H__
28892+
28893+#include <linux/crypto.h>
28894+
28895+/* key info imported from user space */
28896+typedef struct crypto_data {
28897+ int keysize; /* uninstantiated key size */
28898+ __u8 * key; /* uninstantiated key */
28899+ int keyid_size; /* size of passphrase */
28900+ __u8 * keyid; /* passphrase */
28901+} crypto_data_t;
28902+
28903+/* This object contains all needed infrastructure to implement
28904+ cipher transform. This is operated (allocating, inheriting,
28905+ validating, binding to host inode, etc..) by reiser4 key manager.
28906+
28907+ This info can be allocated in two cases:
28908+ 1. importing a key from user space.
28909+ 2. reading inode from disk */
28910+typedef struct crypto_stat {
28911+ struct inode * host;
28912+ struct crypto_hash * digest;
28913+ struct crypto_blkcipher * cipher;
28914+#if 0
28915+ cipher_key_plugin * kplug; /* key manager */
28916+#endif
28917+ __u8 * keyid; /* key fingerprint, created by digest plugin,
28918+ using uninstantiated key and passphrase.
28919+ supposed to be stored in disk stat-data */
28920+ int inst; /* this indicates if the cipher key is
28921+ instantiated (case 1 above) */
28922+ int keysize; /* uninstantiated key size (bytes), supposed
28923+ to be stored in disk stat-data */
28924+ int keyload_count; /* number of the objects which has this
28925+ crypto-stat attached */
28926+} crypto_stat_t;
28927+
28928+#endif /* __FS_REISER4_CRYPT_H__ */
28929+
28930+/*
28931+ Local variables:
28932+ c-indentation-style: "K&R"
28933+ mode-name: "LC"
28934+ c-basic-offset: 8
28935+ tab-width: 8
28936+ fill-column: 120
28937+ scroll-step: 1
28938+ End:
28939+*/
28940diff --git a/fs/reiser4/plugin/crypto/digest.c b/fs/reiser4/plugin/crypto/digest.c
28941new file mode 100644
28942index 0000000..7508917
28943--- /dev/null
28944+++ b/fs/reiser4/plugin/crypto/digest.c
28945@@ -0,0 +1,58 @@
28946+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28947+
28948+/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
28949+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
28950+#include "../../debug.h"
28951+#include "../plugin_header.h"
28952+#include "../plugin.h"
28953+#include "../file/cryptcompress.h"
28954+
28955+#include <linux/types.h>
28956+
28957+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
28958+
28959+static struct crypto_hash * alloc_sha256 (void)
28960+{
28961+#if REISER4_SHA256
28962+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
28963+#else
28964+ warning("edward-1418", "sha256 unsupported");
28965+ return ERR_PTR(-EINVAL);
28966+#endif
28967+}
28968+
28969+static void free_sha256 (struct crypto_hash * tfm)
28970+{
28971+#if REISER4_SHA256
28972+ crypto_free_hash(tfm);
28973+#endif
28974+ return;
28975+}
28976+
28977+/* digest plugins */
28978+digest_plugin digest_plugins[LAST_DIGEST_ID] = {
28979+ [SHA256_32_DIGEST_ID] = {
28980+ .h = {
28981+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
28982+ .id = SHA256_32_DIGEST_ID,
28983+ .pops = NULL,
28984+ .label = "sha256_32",
28985+ .desc = "sha256_32 digest transform",
28986+ .linkage = {NULL, NULL}
28987+ },
28988+ .fipsize = sizeof(__u32),
28989+ .alloc = alloc_sha256,
28990+ .free = free_sha256
28991+ }
28992+};
28993+
28994+/*
28995+ Local variables:
28996+ c-indentation-style: "K&R"
28997+ mode-name: "LC"
28998+ c-basic-offset: 8
28999+ tab-width: 8
29000+ fill-column: 120
29001+ scroll-step: 1
29002+ End:
29003+*/
29004diff --git a/fs/reiser4/plugin/dir/Makefile b/fs/reiser4/plugin/dir/Makefile
29005new file mode 100644
29006index 0000000..ed370b1
29007--- /dev/null
29008+++ b/fs/reiser4/plugin/dir/Makefile
29009@@ -0,0 +1,5 @@
29010+obj-$(CONFIG_REISER4_FS) += dir_plugins.o
29011+
29012+dir_plugins-objs := \
29013+ hashed_dir.o \
29014+ seekable_dir.o
29015diff --git a/fs/reiser4/plugin/dir/dir.h b/fs/reiser4/plugin/dir/dir.h
29016new file mode 100644
29017index 0000000..4a91ebe
29018--- /dev/null
29019+++ b/fs/reiser4/plugin/dir/dir.h
29020@@ -0,0 +1,36 @@
29021+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29022+ * reiser4/README */
29023+
29024+/* this file contains declarations of methods implementing directory plugins */
29025+
29026+#if !defined( __REISER4_DIR_H__ )
29027+#define __REISER4_DIR_H__
29028+
29029+/*#include "../../key.h"
29030+
29031+#include <linux/fs.h>*/
29032+
29033+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
29034+
29035+/* "hashed" directory methods of dir plugin */
29036+void build_entry_key_hashed(const struct inode *, const struct qstr *,
29037+ reiser4_key *);
29038+
29039+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
29040+
29041+/* "seekable" directory methods of dir plugin */
29042+void build_entry_key_seekable(const struct inode *, const struct qstr *,
29043+ reiser4_key *);
29044+
29045+/* __REISER4_DIR_H__ */
29046+#endif
29047+
29048+/*
29049+ Local variables:
29050+ c-indentation-style: "K&R"
29051+ mode-name: "LC"
29052+ c-basic-offset: 8
29053+ tab-width: 8
29054+ fill-column: 120
29055+ End:
29056+*/
29057diff --git a/fs/reiser4/plugin/dir/hashed_dir.c b/fs/reiser4/plugin/dir/hashed_dir.c
29058new file mode 100644
29059index 0000000..0f34824
29060--- /dev/null
29061+++ b/fs/reiser4/plugin/dir/hashed_dir.c
29062@@ -0,0 +1,81 @@
29063+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29064+ * reiser4/README */
29065+
29066+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
29067+ names to the files. */
29068+
29069+/*
29070+ * Hashed directory logically consists of persistent directory
29071+ * entries. Directory entry is a pair of a file name and a key of stat-data of
29072+ * a file that has this name in the given directory.
29073+ *
29074+ * Directory entries are stored in the tree in the form of directory
29075+ * items. Directory item should implement dir_entry_ops portion of item plugin
29076+ * interface (see plugin/item/item.h). Hashed directory interacts with
29077+ * directory item plugin exclusively through dir_entry_ops operations.
29078+ *
29079+ * Currently there are two implementations of directory items: "simple
29080+ * directory item" (plugin/item/sde.[ch]), and "compound directory item"
29081+ * (plugin/item/cde.[ch]) with the latter being the default.
29082+ *
29083+ * There is, however some delicate way through which directory code interferes
29084+ * with item plugin: key assignment policy. A key for a directory item is
29085+ * chosen by directory code, and as described in kassign.c, this key contains
29086+ * a portion of file name. Directory item uses this knowledge to avoid storing
29087+ * this portion of file name twice: in the key and in the directory item body.
29088+ *
29089+ */
29090+
29091+#include "../../inode.h"
29092+
29093+void complete_entry_key(const struct inode *, const char *name,
29094+ int len, reiser4_key * result);
29095+
29096+/* this is implementation of build_entry_key method of dir
29097+ plugin for HASHED_DIR_PLUGIN_ID
29098+ */
29099+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
29100+ * (or will be) in.*/
29101+ const struct qstr *qname, /* name of file referenced
29102+ * by this entry */
29103+ reiser4_key * result /* resulting key of directory
29104+ * entry */ )
29105+{
29106+ const char *name;
29107+ int len;
29108+
29109+ assert("nikita-1139", dir != NULL);
29110+ assert("nikita-1140", qname != NULL);
29111+ assert("nikita-1141", qname->name != NULL);
29112+ assert("nikita-1142", result != NULL);
29113+
29114+ name = qname->name;
29115+ len = qname->len;
29116+
29117+ assert("nikita-2867", strlen(name) == len);
29118+
29119+ reiser4_key_init(result);
29120+ /* locality of directory entry's key is objectid of parent
29121+ directory */
29122+ set_key_locality(result, get_inode_oid(dir));
29123+ /* minor packing locality is constant */
29124+ set_key_type(result, KEY_FILE_NAME_MINOR);
29125+ /* dot is special case---we always want it to be first entry in
29126+ a directory. Actually, we just want to have smallest
29127+ directory entry.
29128+ */
29129+ if (len == 1 && name[0] == '.')
29130+ return;
29131+
29132+ /* initialize part of entry key which depends on file name */
29133+ complete_entry_key(dir, name, len, result);
29134+}
29135+
29136+/* Local variables:
29137+ c-indentation-style: "K&R"
29138+ mode-name: "LC"
29139+ c-basic-offset: 8
29140+ tab-width: 8
29141+ fill-column: 120
29142+ End:
29143+*/
29144diff --git a/fs/reiser4/plugin/dir/seekable_dir.c b/fs/reiser4/plugin/dir/seekable_dir.c
29145new file mode 100644
29146index 0000000..c1c6c4c
29147--- /dev/null
29148+++ b/fs/reiser4/plugin/dir/seekable_dir.c
29149@@ -0,0 +1,46 @@
29150+/* Copyright 2005 by Hans Reiser, licensing governed by
29151+ * reiser4/README */
29152+
29153+#include "../../inode.h"
29154+
29155+/* this is implementation of build_entry_key method of dir
29156+ plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
29157+ This is for directories where we want repeatable and restartable readdir()
29158+ even in case 32bit user level struct dirent (readdir(3)).
29159+*/
29160+void
29161+build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
29162+ reiser4_key * result)
29163+{
29164+ oid_t objectid;
29165+
29166+ assert("nikita-2283", dir != NULL);
29167+ assert("nikita-2284", name != NULL);
29168+ assert("nikita-2285", name->name != NULL);
29169+ assert("nikita-2286", result != NULL);
29170+
29171+ reiser4_key_init(result);
29172+ /* locality of directory entry's key is objectid of parent
29173+ directory */
29174+ set_key_locality(result, get_inode_oid(dir));
29175+ /* minor packing locality is constant */
29176+ set_key_type(result, KEY_FILE_NAME_MINOR);
29177+ /* dot is special case---we always want it to be first entry in
29178+ a directory. Actually, we just want to have smallest
29179+ directory entry.
29180+ */
29181+ if ((name->len == 1) && (name->name[0] == '.'))
29182+ return;
29183+
29184+ /* objectid of key is 31 lowest bits of hash. */
29185+ objectid =
29186+ inode_hash_plugin(dir)->hash(name->name,
29187+ (int)name->len) & 0x7fffffff;
29188+
29189+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
29190+ set_key_objectid(result, objectid);
29191+
29192+ /* offset is always 0. */
29193+ set_key_offset(result, (__u64) 0);
29194+ return;
29195+}
29196diff --git a/fs/reiser4/plugin/dir_plugin_common.c b/fs/reiser4/plugin/dir_plugin_common.c
29197new file mode 100644
29198index 0000000..f5e1028
29199--- /dev/null
29200+++ b/fs/reiser4/plugin/dir_plugin_common.c
29201@@ -0,0 +1,872 @@
29202+/* Copyright 2005 by Hans Reiser, licensing governed by
29203+ reiser4/README */
29204+
29205+/* this file contains typical implementations for most of methods of
29206+ directory plugin
29207+*/
29208+
29209+#include "../inode.h"
29210+
29211+int reiser4_find_entry(struct inode *dir, struct dentry *name,
29212+ lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
29213+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
29214+void check_light_weight(struct inode *inode, struct inode *parent);
29215+
29216+/* this is common implementation of get_parent method of dir plugin
29217+ this is used by NFS kernel server to "climb" up directory tree to
29218+ check permissions
29219+ */
29220+struct dentry *get_parent_common(struct inode *child)
29221+{
29222+ struct super_block *s;
29223+ struct inode *parent;
29224+ struct dentry dotdot;
29225+ struct dentry *dentry;
29226+ reiser4_key key;
29227+ int result;
29228+
29229+ /*
29230+ * lookup dotdot entry.
29231+ */
29232+
29233+ s = child->i_sb;
29234+ memset(&dotdot, 0, sizeof(dotdot));
29235+ dotdot.d_name.name = "..";
29236+ dotdot.d_name.len = 2;
29237+ dotdot.d_op = &get_super_private(s)->ops.dentry;
29238+
29239+ result = reiser4_lookup_name(child, &dotdot, &key);
29240+ if (result != 0)
29241+ return ERR_PTR(result);
29242+
29243+ parent = reiser4_iget(s, &key, 1);
29244+ if (!IS_ERR(parent)) {
29245+ /*
29246+ * FIXME-NIKITA dubious: attributes are inherited from @child
29247+ * to @parent. But:
29248+ *
29249+ * (*) this is the only this we can do
29250+ *
29251+ * (*) attributes of light-weight object are inherited
29252+ * from a parent through which object was looked up first,
29253+ * so it is ambiguous anyway.
29254+ *
29255+ */
29256+ check_light_weight(parent, child);
29257+ reiser4_iget_complete(parent);
29258+ dentry = d_alloc_anon(parent);
29259+ if (dentry == NULL) {
29260+ iput(parent);
29261+ dentry = ERR_PTR(RETERR(-ENOMEM));
29262+ } else
29263+ dentry->d_op = &get_super_private(s)->ops.dentry;
29264+ } else if (PTR_ERR(parent) == -ENOENT)
29265+ dentry = ERR_PTR(RETERR(-ESTALE));
29266+ else
29267+ dentry = (void *)parent;
29268+ return dentry;
29269+}
29270+
29271+/* this is common implementation of is_name_acceptable method of dir
29272+ plugin
29273+ */
29274+int is_name_acceptable_common(const struct inode *inode, /* directory to check */
29275+ const char *name UNUSED_ARG, /* name to check */
29276+ int len /* @name's length */ )
29277+{
29278+ assert("nikita-733", inode != NULL);
29279+ assert("nikita-734", name != NULL);
29280+ assert("nikita-735", len > 0);
29281+
29282+ return len <= reiser4_max_filename_len(inode);
29283+}
29284+
29285+/* there is no common implementation of build_entry_key method of dir
29286+ plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
29287+ plugin/dir/seekable.c:build_entry_key_seekable() for example
29288+*/
29289+
29290+/* this is common implementation of build_readdir_key method of dir
29291+ plugin
29292+ see reiser4_readdir_common for more details
29293+*/
29294+int build_readdir_key_common(struct file *dir /* directory being read */ ,
29295+ reiser4_key * result /* where to store key */ )
29296+{
29297+ reiser4_file_fsdata *fdata;
29298+ struct inode *inode;
29299+
29300+ assert("nikita-1361", dir != NULL);
29301+ assert("nikita-1362", result != NULL);
29302+ assert("nikita-1363", dir->f_dentry != NULL);
29303+ inode = dir->f_dentry->d_inode;
29304+ assert("nikita-1373", inode != NULL);
29305+
29306+ fdata = reiser4_get_file_fsdata(dir);
29307+ if (IS_ERR(fdata))
29308+ return PTR_ERR(fdata);
29309+ assert("nikita-1364", fdata != NULL);
29310+ return extract_key_from_de_id(get_inode_oid(inode),
29311+ &fdata->dir.readdir.position.
29312+ dir_entry_key, result);
29313+
29314+}
29315+
29316+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
29317+ int adj);
29318+
29319+/* this is common implementation of add_entry method of dir plugin
29320+*/
29321+int reiser4_add_entry_common(struct inode *object, /* directory to add new name
29322+ * in */
29323+ struct dentry *where, /* new name */
29324+ reiser4_object_create_data * data, /* parameters of
29325+ * new object */
29326+ reiser4_dir_entry_desc * entry /* parameters of
29327+ * new directory
29328+ * entry */)
29329+{
29330+ int result;
29331+ coord_t *coord;
29332+ lock_handle lh;
29333+ reiser4_dentry_fsdata *fsdata;
29334+ reiser4_block_nr reserve;
29335+
29336+ assert("nikita-1114", object != NULL);
29337+ assert("nikita-1250", where != NULL);
29338+
29339+ fsdata = reiser4_get_dentry_fsdata(where);
29340+ if (unlikely(IS_ERR(fsdata)))
29341+ return PTR_ERR(fsdata);
29342+
29343+ reserve = inode_dir_plugin(object)->estimate.add_entry(object);
29344+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29345+ return RETERR(-ENOSPC);
29346+
29347+ init_lh(&lh);
29348+ coord = &fsdata->dec.entry_coord;
29349+ coord_clear_iplug(coord);
29350+
29351+ /* check for this entry in a directory. This is plugin method. */
29352+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
29353+ entry);
29354+ if (likely(result == -ENOENT)) {
29355+ /* add new entry. Just pass control to the directory
29356+ item plugin. */
29357+ assert("nikita-1709", inode_dir_item_plugin(object));
29358+ assert("nikita-2230", coord->node == lh.node);
29359+ reiser4_seal_done(&fsdata->dec.entry_seal);
29360+ result =
29361+ inode_dir_item_plugin(object)->s.dir.add_entry(object,
29362+ coord, &lh,
29363+ where,
29364+ entry);
29365+ if (result == 0) {
29366+ reiser4_adjust_dir_file(object, where,
29367+ fsdata->dec.pos + 1, +1);
29368+ INODE_INC_FIELD(object, i_size);
29369+ }
29370+ } else if (result == 0) {
29371+ assert("nikita-2232", coord->node == lh.node);
29372+ result = RETERR(-EEXIST);
29373+ }
29374+ done_lh(&lh);
29375+
29376+ return result;
29377+}
29378+
29379+/**
29380+ * rem_entry - remove entry from directory item
29381+ * @dir:
29382+ * @dentry:
29383+ * @entry:
29384+ * @coord:
29385+ * @lh:
29386+ *
29387+ * Checks that coordinate @coord is set properly and calls item plugin
29388+ * method to cut entry.
29389+ */
29390+static int
29391+rem_entry(struct inode *dir, struct dentry *dentry,
29392+ reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
29393+{
29394+ item_plugin *iplug;
29395+ struct inode *child;
29396+
29397+ iplug = inode_dir_item_plugin(dir);
29398+ child = dentry->d_inode;
29399+ assert("nikita-3399", child != NULL);
29400+
29401+ /* check that we are really destroying an entry for @child */
29402+ if (REISER4_DEBUG) {
29403+ int result;
29404+ reiser4_key key;
29405+
29406+ result = iplug->s.dir.extract_key(coord, &key);
29407+ if (result != 0)
29408+ return result;
29409+ if (get_key_objectid(&key) != get_inode_oid(child)) {
29410+ warning("nikita-3397",
29411+ "rem_entry: %#llx != %#llx\n",
29412+ get_key_objectid(&key),
29413+ (unsigned long long)get_inode_oid(child));
29414+ return RETERR(-EIO);
29415+ }
29416+ }
29417+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
29418+}
29419+
29420+/**
29421+ * reiser4_rem_entry_common - remove entry from a directory
29422+ * @dir: directory to remove entry from
29423+ * @where: name that is being removed
29424+ * @entry: description of entry being removed
29425+ *
29426+ * This is common implementation of rem_entry method of dir plugin.
29427+ */
29428+int reiser4_rem_entry_common(struct inode *dir,
29429+ struct dentry *dentry,
29430+ reiser4_dir_entry_desc *entry)
29431+{
29432+ int result;
29433+ coord_t *coord;
29434+ lock_handle lh;
29435+ reiser4_dentry_fsdata *fsdata;
29436+ __u64 tograb;
29437+
29438+ assert("nikita-1124", dir != NULL);
29439+ assert("nikita-1125", dentry != NULL);
29440+
29441+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
29442+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
29443+ if (result != 0)
29444+ return RETERR(-ENOSPC);
29445+
29446+ init_lh(&lh);
29447+
29448+ /* check for this entry in a directory. This is plugin method. */
29449+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
29450+ fsdata = reiser4_get_dentry_fsdata(dentry);
29451+ if (IS_ERR(fsdata)) {
29452+ done_lh(&lh);
29453+ return PTR_ERR(fsdata);
29454+ }
29455+
29456+ coord = &fsdata->dec.entry_coord;
29457+
29458+ assert("nikita-3404",
29459+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
29460+ dir->i_size <= 1);
29461+
29462+ coord_clear_iplug(coord);
29463+ if (result == 0) {
29464+ /* remove entry. Just pass control to the directory item
29465+ plugin. */
29466+ assert("vs-542", inode_dir_item_plugin(dir));
29467+ reiser4_seal_done(&fsdata->dec.entry_seal);
29468+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
29469+ result =
29470+ WITH_COORD(coord,
29471+ rem_entry(dir, dentry, entry, coord, &lh));
29472+ if (result == 0) {
29473+ if (dir->i_size >= 1)
29474+ INODE_DEC_FIELD(dir, i_size);
29475+ else {
29476+ warning("nikita-2509", "Dir %llu is runt",
29477+ (unsigned long long)
29478+ get_inode_oid(dir));
29479+ result = RETERR(-EIO);
29480+ }
29481+
29482+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
29483+ dentry->d_inode->i_size != 2 ||
29484+ inode_dir_plugin(dentry->d_inode) == NULL);
29485+ }
29486+ }
29487+ done_lh(&lh);
29488+
29489+ return result;
29490+}
29491+
29492+static reiser4_block_nr estimate_init(struct inode *parent,
29493+ struct inode *object);
29494+static int create_dot_dotdot(struct inode *object, struct inode *parent);
29495+
29496+/* this is common implementation of init method of dir plugin
29497+ create "." and ".." entries
29498+*/
29499+int reiser4_dir_init_common(struct inode *object, /* new directory */
29500+ struct inode *parent, /* parent directory */
29501+ reiser4_object_create_data * data /* info passed
29502+ * to us, this
29503+ * is filled by
29504+ * reiser4()
29505+ * syscall in
29506+ * particular */)
29507+{
29508+ reiser4_block_nr reserve;
29509+
29510+ assert("nikita-680", object != NULL);
29511+ assert("nikita-681", S_ISDIR(object->i_mode));
29512+ assert("nikita-682", parent != NULL);
29513+ assert("nikita-684", data != NULL);
29514+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29515+ assert("nikita-687", object->i_mode & S_IFDIR);
29516+
29517+ reserve = estimate_init(parent, object);
29518+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29519+ return RETERR(-ENOSPC);
29520+
29521+ return create_dot_dotdot(object, parent);
29522+}
29523+
29524+/* this is common implementation of done method of dir plugin
29525+ remove "." entry
29526+*/
29527+int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
29528+{
29529+ int result;
29530+ reiser4_block_nr reserve;
29531+ struct dentry goodby_dots;
29532+ reiser4_dir_entry_desc entry;
29533+
29534+ assert("nikita-1449", object != NULL);
29535+
29536+ if (reiser4_inode_get_flag(object, REISER4_NO_SD))
29537+ return 0;
29538+
29539+ /* of course, this can be rewritten to sweep everything in one
29540+ reiser4_cut_tree(). */
29541+ memset(&entry, 0, sizeof entry);
29542+
29543+ /* FIXME: this done method is called from reiser4_delete_dir_common which
29544+ * reserved space already */
29545+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29546+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29547+ return RETERR(-ENOSPC);
29548+
29549+ memset(&goodby_dots, 0, sizeof goodby_dots);
29550+ entry.obj = goodby_dots.d_inode = object;
29551+ goodby_dots.d_name.name = ".";
29552+ goodby_dots.d_name.len = 1;
29553+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29554+ reiser4_free_dentry_fsdata(&goodby_dots);
29555+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29556+ /* only worth a warning
29557+
29558+ "values of \ eB\ f will give rise to dom!\n"
29559+ -- v6src/s2/mv.c:89
29560+ */
29561+ warning("nikita-2252", "Cannot remove dot of %lli: %i",
29562+ (unsigned long long)get_inode_oid(object), result);
29563+ return 0;
29564+}
29565+
29566+/* this is common implementation of attach method of dir plugin
29567+*/
29568+int reiser4_attach_common(struct inode *child UNUSED_ARG,
29569+ struct inode *parent UNUSED_ARG)
29570+{
29571+ assert("nikita-2647", child != NULL);
29572+ assert("nikita-2648", parent != NULL);
29573+
29574+ return 0;
29575+}
29576+
29577+/* this is common implementation of detach method of dir plugin
29578+ remove "..", decrease nlink on parent
29579+*/
29580+int reiser4_detach_common(struct inode *object, struct inode *parent)
29581+{
29582+ int result;
29583+ struct dentry goodby_dots;
29584+ reiser4_dir_entry_desc entry;
29585+
29586+ assert("nikita-2885", object != NULL);
29587+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
29588+
29589+ memset(&entry, 0, sizeof entry);
29590+
29591+ /* NOTE-NIKITA this only works if @parent is -the- parent of
29592+ @object, viz. object whose key is stored in dotdot
29593+ entry. Wouldn't work with hard-links on directories. */
29594+ memset(&goodby_dots, 0, sizeof goodby_dots);
29595+ entry.obj = goodby_dots.d_inode = parent;
29596+ goodby_dots.d_name.name = "..";
29597+ goodby_dots.d_name.len = 2;
29598+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29599+ reiser4_free_dentry_fsdata(&goodby_dots);
29600+ if (result == 0) {
29601+ /* the dot should be the only entry remaining at this time... */
29602+ assert("nikita-3400",
29603+ object->i_size == 1 && object->i_nlink <= 2);
29604+#if 0
29605+ /* and, together with the only name directory can have, they
29606+ * provides for the last 2 remaining references. If we get
29607+ * here as part of error handling during mkdir, @object
29608+ * possibly has no name yet, so its nlink == 1. If we get here
29609+ * from rename (targeting empty directory), it has no name
29610+ * already, so its nlink == 1. */
29611+ assert("nikita-3401",
29612+ object->i_nlink == 2 || object->i_nlink == 1);
29613+#endif
29614+
29615+ /* decrement nlink of directory removed ".." pointed
29616+ to */
29617+ reiser4_del_nlink(parent, NULL, 0);
29618+ }
29619+ return result;
29620+}
29621+
29622+/* this is common implementation of estimate.add_entry method of
29623+ dir plugin
29624+ estimation of adding entry which supposes that entry is inserting a
29625+ unit into item
29626+*/
29627+reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29628+{
29629+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
29630+}
29631+
29632+/* this is common implementation of estimate.rem_entry method of dir
29633+ plugin
29634+*/
29635+reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29636+{
29637+ return estimate_one_item_removal(reiser4_tree_by_inode(inode));
29638+}
29639+
29640+/* this is common implementation of estimate.unlink method of dir
29641+ plugin
29642+*/
29643+reiser4_block_nr
29644+dir_estimate_unlink_common(const struct inode * parent,
29645+ const struct inode * object)
29646+{
29647+ reiser4_block_nr res;
29648+
29649+ /* hashed_rem_entry(object) */
29650+ res = inode_dir_plugin(object)->estimate.rem_entry(object);
29651+ /* del_nlink(parent) */
29652+ res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29653+
29654+ return res;
29655+}
29656+
29657+/*
29658+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29659+ * methods: if @inode is a light-weight file, setup its credentials
29660+ * that are not stored in the stat-data in this case
29661+ */
29662+void check_light_weight(struct inode *inode, struct inode *parent)
29663+{
29664+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29665+ inode->i_uid = parent->i_uid;
29666+ inode->i_gid = parent->i_gid;
29667+ /* clear light-weight flag. If inode would be read by any
29668+ other name, [ug]id wouldn't change. */
29669+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29670+ }
29671+}
29672+
29673+/* looks for name specified in @dentry in directory @parent and if name is
29674+ found - key of object found entry points to is stored in @entry->key */
29675+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
29676+ * name in */
29677+ struct dentry *dentry, /* name to look for */
29678+ reiser4_key * key /* place to store key */ )
29679+{
29680+ int result;
29681+ coord_t *coord;
29682+ lock_handle lh;
29683+ const char *name;
29684+ int len;
29685+ reiser4_dir_entry_desc entry;
29686+ reiser4_dentry_fsdata *fsdata;
29687+
29688+ assert("nikita-1247", parent != NULL);
29689+ assert("nikita-1248", dentry != NULL);
29690+ assert("nikita-1123", dentry->d_name.name != NULL);
29691+ assert("vs-1486",
29692+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29693+
29694+ name = dentry->d_name.name;
29695+ len = dentry->d_name.len;
29696+
29697+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29698+ /* some arbitrary error code to return */
29699+ return RETERR(-ENAMETOOLONG);
29700+
29701+ fsdata = reiser4_get_dentry_fsdata(dentry);
29702+ if (IS_ERR(fsdata))
29703+ return PTR_ERR(fsdata);
29704+
29705+ coord = &fsdata->dec.entry_coord;
29706+ coord_clear_iplug(coord);
29707+ init_lh(&lh);
29708+
29709+ /* find entry in a directory. This is plugin method. */
29710+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
29711+ &entry);
29712+ if (result == 0) {
29713+ /* entry was found, extract object key from it. */
29714+ result =
29715+ WITH_COORD(coord,
29716+ item_plugin_by_coord(coord)->s.dir.
29717+ extract_key(coord, key));
29718+ }
29719+ done_lh(&lh);
29720+ return result;
29721+
29722+}
29723+
29724+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
29725+static reiser4_block_nr
29726+estimate_init(struct inode *parent, struct inode *object)
29727+{
29728+ reiser4_block_nr res = 0;
29729+
29730+ assert("vpf-321", parent != NULL);
29731+ assert("vpf-322", object != NULL);
29732+
29733+ /* hashed_add_entry(object) */
29734+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29735+ /* reiser4_add_nlink(object) */
29736+ res += inode_file_plugin(object)->estimate.update(object);
29737+ /* hashed_add_entry(object) */
29738+ res += inode_dir_plugin(object)->estimate.add_entry(object);
29739+ /* reiser4_add_nlink(parent) */
29740+ res += inode_file_plugin(parent)->estimate.update(parent);
29741+
29742+ return 0;
29743+}
29744+
29745+/* helper function for reiser4_dir_init_common(). Create "." and ".." */
29746+static int create_dot_dotdot(struct inode *object /* object to create dot and
29747+ * dotdot for */ ,
29748+ struct inode *parent /* parent of @object */)
29749+{
29750+ int result;
29751+ struct dentry dots_entry;
29752+ reiser4_dir_entry_desc entry;
29753+
29754+ assert("nikita-688", object != NULL);
29755+ assert("nikita-689", S_ISDIR(object->i_mode));
29756+ assert("nikita-691", parent != NULL);
29757+
29758+ /* We store dot and dotdot as normal directory entries. This is
29759+ not necessary, because almost all information stored in them
29760+ is already in the stat-data of directory, the only thing
29761+ being missed is objectid of grand-parent directory that can
29762+ easily be added there as extension.
29763+
29764+ But it is done the way it is done, because not storing dot
29765+ and dotdot will lead to the following complications:
29766+
29767+ . special case handling in ->lookup().
29768+ . addition of another extension to the sd.
29769+ . dependency on key allocation policy for stat data.
29770+
29771+ */
29772+
29773+ memset(&entry, 0, sizeof entry);
29774+ memset(&dots_entry, 0, sizeof dots_entry);
29775+ entry.obj = dots_entry.d_inode = object;
29776+ dots_entry.d_name.name = ".";
29777+ dots_entry.d_name.len = 1;
29778+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
29779+ reiser4_free_dentry_fsdata(&dots_entry);
29780+
29781+ if (result == 0) {
29782+ result = reiser4_add_nlink(object, object, 0);
29783+ if (result == 0) {
29784+ entry.obj = dots_entry.d_inode = parent;
29785+ dots_entry.d_name.name = "..";
29786+ dots_entry.d_name.len = 2;
29787+ result = reiser4_add_entry_common(object,
29788+ &dots_entry, NULL, &entry);
29789+ reiser4_free_dentry_fsdata(&dots_entry);
29790+ /* if creation of ".." failed, iput() will delete
29791+ object with ".". */
29792+ if (result == 0) {
29793+ result = reiser4_add_nlink(parent, object, 0);
29794+ if (result != 0)
29795+ /*
29796+ * if we failed to bump i_nlink, try
29797+ * to remove ".."
29798+ */
29799+ reiser4_detach_common(object, parent);
29800+ }
29801+ }
29802+ }
29803+
29804+ if (result != 0) {
29805+ /*
29806+ * in the case of error, at least update stat-data so that,
29807+ * ->i_nlink updates are not lingering.
29808+ */
29809+ reiser4_update_sd(object);
29810+ reiser4_update_sd(parent);
29811+ }
29812+
29813+ return result;
29814+}
29815+
29816+/*
29817+ * return 0 iff @coord contains a directory entry for the file with the name
29818+ * @name.
29819+ */
29820+static int
29821+check_item(const struct inode *dir, const coord_t * coord, const char *name)
29822+{
29823+ item_plugin *iplug;
29824+ char buf[DE_NAME_BUF_LEN];
29825+
29826+ iplug = item_plugin_by_coord(coord);
29827+ if (iplug == NULL) {
29828+ warning("nikita-1135", "Cannot get item plugin");
29829+ print_coord("coord", coord, 1);
29830+ return RETERR(-EIO);
29831+ } else if (item_id_by_coord(coord) !=
29832+ item_id_by_plugin(inode_dir_item_plugin(dir))) {
29833+ /* item id of current item does not match to id of items a
29834+ directory is built of */
29835+ warning("nikita-1136", "Wrong item plugin");
29836+ print_coord("coord", coord, 1);
29837+ return RETERR(-EIO);
29838+ }
29839+ assert("nikita-1137", iplug->s.dir.extract_name);
29840+
29841+ /* Compare name stored in this entry with name we are looking for.
29842+
29843+ NOTE-NIKITA Here should go code for support of something like
29844+ unicode, code tables, etc.
29845+ */
29846+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
29847+}
29848+
29849+static int
29850+check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
29851+{
29852+ return WITH_COORD(coord, check_item(dir, coord, name->name));
29853+}
29854+
29855+/*
29856+ * argument package used by entry_actor to scan entries with identical keys.
29857+ */
29858+typedef struct entry_actor_args {
29859+ /* name we are looking for */
29860+ const char *name;
29861+ /* key of directory entry. entry_actor() scans through sequence of
29862+ * items/units having the same key */
29863+ reiser4_key *key;
29864+ /* how many entries with duplicate key was scanned so far. */
29865+ int non_uniq;
29866+#if REISER4_USE_COLLISION_LIMIT
29867+ /* scan limit */
29868+ int max_non_uniq;
29869+#endif
29870+ /* return parameter: set to true, if ->name wasn't found */
29871+ int not_found;
29872+ /* what type of lock to take when moving to the next node during
29873+ * scan */
29874+ znode_lock_mode mode;
29875+
29876+ /* last coord that was visited during scan */
29877+ coord_t last_coord;
29878+ /* last node locked during scan */
29879+ lock_handle last_lh;
29880+ /* inode of directory */
29881+ const struct inode *inode;
29882+} entry_actor_args;
29883+
29884+/* Function called by reiser4_find_entry() to look for given name
29885+ in the directory. */
29886+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
29887+ coord_t * coord /* current coord */ ,
29888+ lock_handle * lh /* current lock handle */ ,
29889+ void *entry_actor_arg /* argument to scan */ )
29890+{
29891+ reiser4_key unit_key;
29892+ entry_actor_args *args;
29893+
29894+ assert("nikita-1131", tree != NULL);
29895+ assert("nikita-1132", coord != NULL);
29896+ assert("nikita-1133", entry_actor_arg != NULL);
29897+
29898+ args = entry_actor_arg;
29899+ ++args->non_uniq;
29900+#if REISER4_USE_COLLISION_LIMIT
29901+ if (args->non_uniq > args->max_non_uniq) {
29902+ args->not_found = 1;
29903+ /* hash collision overflow. */
29904+ return RETERR(-EBUSY);
29905+ }
29906+#endif
29907+
29908+ /*
29909+ * did we just reach the end of the sequence of items/units with
29910+ * identical keys?
29911+ */
29912+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
29913+ assert("nikita-1791",
29914+ keylt(args->key, unit_key_by_coord(coord, &unit_key)));
29915+ args->not_found = 1;
29916+ args->last_coord.between = AFTER_UNIT;
29917+ return 0;
29918+ }
29919+
29920+ coord_dup(&args->last_coord, coord);
29921+ /*
29922+ * did scan just moved to the next node?
29923+ */
29924+ if (args->last_lh.node != lh->node) {
29925+ int lock_result;
29926+
29927+ /*
29928+ * if so, lock new node with the mode requested by the caller
29929+ */
29930+ done_lh(&args->last_lh);
29931+ assert("nikita-1896", znode_is_any_locked(lh->node));
29932+ lock_result = longterm_lock_znode(&args->last_lh, lh->node,
29933+ args->mode, ZNODE_LOCK_HIPRI);
29934+ if (lock_result != 0)
29935+ return lock_result;
29936+ }
29937+ return check_item(args->inode, coord, args->name);
29938+}
29939+
29940+/* Look for given @name within directory @dir.
29941+
29942+ This is called during lookup, creation and removal of directory
29943+ entries and on reiser4_rename_common
29944+
29945+ First calculate key that directory entry for @name would have. Search
29946+ for this key in the tree. If such key is found, scan all items with
29947+ the same key, checking name in each directory entry along the way.
29948+*/
29949+int reiser4_find_entry(struct inode *dir, /* directory to scan */
29950+ struct dentry *de, /* name to search for */
29951+ lock_handle * lh, /* resulting lock handle */
29952+ znode_lock_mode mode, /* required lock mode */
29953+ reiser4_dir_entry_desc * entry /* parameters of found
29954+ directory entry */)
29955+{
29956+ const struct qstr *name;
29957+ seal_t *seal;
29958+ coord_t *coord;
29959+ int result;
29960+ __u32 flags;
29961+ de_location *dec;
29962+ reiser4_dentry_fsdata *fsdata;
29963+
29964+ assert("nikita-1130", lh != NULL);
29965+ assert("nikita-1128", dir != NULL);
29966+
29967+ name = &de->d_name;
29968+ assert("nikita-1129", name != NULL);
29969+
29970+ /* dentry private data don't require lock, because dentry
29971+ manipulations are protected by i_mutex on parent.
29972+
29973+ This is not so for inodes, because there is no -the- parent in
29974+ inode case.
29975+ */
29976+ fsdata = reiser4_get_dentry_fsdata(de);
29977+ if (IS_ERR(fsdata))
29978+ return PTR_ERR(fsdata);
29979+ dec = &fsdata->dec;
29980+
29981+ coord = &dec->entry_coord;
29982+ coord_clear_iplug(coord);
29983+ seal = &dec->entry_seal;
29984+ /* compose key of directory entry for @name */
29985+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
29986+
29987+ if (reiser4_seal_is_set(seal)) {
29988+ /* check seal */
29989+ result = reiser4_seal_validate(seal, coord, &entry->key,
29990+ lh, mode, ZNODE_LOCK_LOPRI);
29991+ if (result == 0) {
29992+ /* key was found. Check that it is really item we are
29993+ looking for. */
29994+ result = check_entry(dir, coord, name);
29995+ if (result == 0)
29996+ return 0;
29997+ }
29998+ }
29999+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
30000+ /*
30001+ * find place in the tree where directory item should be located.
30002+ */
30003+ result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
30004+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
30005+ flags, NULL /*ra_info */ );
30006+ if (result == CBK_COORD_FOUND) {
30007+ entry_actor_args arg;
30008+
30009+ /* fast path: no hash collisions */
30010+ result = check_entry(dir, coord, name);
30011+ if (result == 0) {
30012+ reiser4_seal_init(seal, coord, &entry->key);
30013+ dec->pos = 0;
30014+ } else if (result > 0) {
30015+ /* Iterate through all units with the same keys. */
30016+ arg.name = name->name;
30017+ arg.key = &entry->key;
30018+ arg.not_found = 0;
30019+ arg.non_uniq = 0;
30020+#if REISER4_USE_COLLISION_LIMIT
30021+ arg.max_non_uniq = max_hash_collisions(dir);
30022+ assert("nikita-2851", arg.max_non_uniq > 1);
30023+#endif
30024+ arg.mode = mode;
30025+ arg.inode = dir;
30026+ coord_init_zero(&arg.last_coord);
30027+ init_lh(&arg.last_lh);
30028+
30029+ result = reiser4_iterate_tree
30030+ (reiser4_tree_by_inode(dir),
30031+ coord, lh,
30032+ entry_actor, &arg, mode, 1);
30033+ /* if end of the tree or extent was reached during
30034+ scanning. */
30035+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
30036+ /* step back */
30037+ done_lh(lh);
30038+
30039+ result = zload(arg.last_coord.node);
30040+ if (result == 0) {
30041+ coord_clear_iplug(&arg.last_coord);
30042+ coord_dup(coord, &arg.last_coord);
30043+ move_lh(lh, &arg.last_lh);
30044+ result = RETERR(-ENOENT);
30045+ zrelse(arg.last_coord.node);
30046+ --arg.non_uniq;
30047+ }
30048+ }
30049+
30050+ done_lh(&arg.last_lh);
30051+ if (result == 0)
30052+ reiser4_seal_init(seal, coord, &entry->key);
30053+
30054+ if (result == 0 || result == -ENOENT) {
30055+ assert("nikita-2580", arg.non_uniq > 0);
30056+ dec->pos = arg.non_uniq - 1;
30057+ }
30058+ }
30059+ } else
30060+ dec->pos = -1;
30061+ return result;
30062+}
30063+
30064+/*
30065+ Local variables:
30066+ c-indentation-style: "K&R"
30067+ mode-name: "LC"
30068+ c-basic-offset: 8
30069+ tab-width: 8
30070+ fill-column: 120
30071+ scroll-step: 1
30072+ End:
30073+*/
30074diff --git a/fs/reiser4/plugin/disk_format/Makefile b/fs/reiser4/plugin/disk_format/Makefile
30075new file mode 100644
30076index 0000000..e4e9e54
30077--- /dev/null
30078+++ b/fs/reiser4/plugin/disk_format/Makefile
30079@@ -0,0 +1,5 @@
30080+obj-$(CONFIG_REISER4_FS) += df_plugins.o
30081+
30082+df_plugins-objs := \
30083+ disk_format40.o \
30084+ disk_format.o
30085diff --git a/fs/reiser4/plugin/disk_format/disk_format.c b/fs/reiser4/plugin/disk_format/disk_format.c
30086new file mode 100644
30087index 0000000..d785106
30088--- /dev/null
30089+++ b/fs/reiser4/plugin/disk_format/disk_format.c
30090@@ -0,0 +1,38 @@
30091+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30092+
30093+#include "../../debug.h"
30094+#include "../plugin_header.h"
30095+#include "disk_format40.h"
30096+#include "disk_format.h"
30097+#include "../plugin.h"
30098+
30099+/* initialization of disk layout plugins */
30100+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30101+ [FORMAT40_ID] = {
30102+ .h = {
30103+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30104+ .id = FORMAT40_ID,
30105+ .pops = NULL,
30106+ .label = "reiser40",
30107+ .desc = "standard disk layout for reiser40",
30108+ .linkage = {NULL, NULL}
30109+ },
30110+ .init_format = init_format_format40,
30111+ .root_dir_key = root_dir_key_format40,
30112+ .release = release_format40,
30113+ .log_super = log_super_format40,
30114+ .check_open = check_open_format40,
30115+ .version_update = version_update_format40
30116+ }
30117+};
30118+
30119+/* Make Linus happy.
30120+ Local variables:
30121+ c-indentation-style: "K&R"
30122+ mode-name: "LC"
30123+ c-basic-offset: 8
30124+ tab-width: 8
30125+ fill-column: 120
30126+ scroll-step: 1
30127+ End:
30128+*/
30129diff --git a/fs/reiser4/plugin/disk_format/disk_format.h b/fs/reiser4/plugin/disk_format/disk_format.h
30130new file mode 100644
30131index 0000000..b9c53ac
30132--- /dev/null
30133+++ b/fs/reiser4/plugin/disk_format/disk_format.h
30134@@ -0,0 +1,27 @@
30135+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30136+
30137+/* identifiers for disk layouts, they are also used as indexes in array of disk
30138+ plugins */
30139+
30140+#if !defined( __REISER4_DISK_FORMAT_H__ )
30141+#define __REISER4_DISK_FORMAT_H__
30142+
30143+typedef enum {
30144+ /* standard reiser4 disk layout plugin id */
30145+ FORMAT40_ID,
30146+ LAST_FORMAT_ID
30147+} disk_format_id;
30148+
30149+/* __REISER4_DISK_FORMAT_H__ */
30150+#endif
30151+
30152+/* Make Linus happy.
30153+ Local variables:
30154+ c-indentation-style: "K&R"
30155+ mode-name: "LC"
30156+ c-basic-offset: 8
30157+ tab-width: 8
30158+ fill-column: 120
30159+ scroll-step: 1
30160+ End:
30161+*/
30162diff --git a/fs/reiser4/plugin/disk_format/disk_format40.c b/fs/reiser4/plugin/disk_format/disk_format40.c
30163new file mode 100644
30164index 0000000..17718f0
30165--- /dev/null
30166+++ b/fs/reiser4/plugin/disk_format/disk_format40.c
30167@@ -0,0 +1,655 @@
30168+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30169+
30170+#include "../../debug.h"
30171+#include "../../dformat.h"
30172+#include "../../key.h"
30173+#include "../node/node.h"
30174+#include "../space/space_allocator.h"
30175+#include "disk_format40.h"
30176+#include "../plugin.h"
30177+#include "../../txnmgr.h"
30178+#include "../../jnode.h"
30179+#include "../../tree.h"
30180+#include "../../super.h"
30181+#include "../../wander.h"
30182+#include "../../inode.h"
30183+#include "../../ktxnmgrd.h"
30184+#include "../../status_flags.h"
30185+
30186+#include <linux/types.h> /* for __u?? */
30187+#include <linux/fs.h> /* for struct super_block */
30188+#include <linux/buffer_head.h>
30189+
30190+/* reiser 4.0 default disk layout */
30191+
30192+/* Amount of free blocks needed to perform release_format40 when fs gets
30193+ mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
30194+ & tx record. */
30195+#define RELEASE_RESERVED 4
30196+
30197+/* The greatest supported format40 version number */
30198+#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
30199+
30200+/* This flag indicates that backup should be updated
30201+ (the update is performed by fsck) */
30202+#define FORMAT40_UPDATE_BACKUP (1 << 31)
30203+
30204+/* functions to access fields of format40_disk_super_block */
30205+static __u64 get_format40_block_count(const format40_disk_super_block * sb)
30206+{
30207+ return le64_to_cpu(get_unaligned(&sb->block_count));
30208+}
30209+
30210+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
30211+{
30212+ return le64_to_cpu(get_unaligned(&sb->free_blocks));
30213+}
30214+
30215+static __u64 get_format40_root_block(const format40_disk_super_block * sb)
30216+{
30217+ return le64_to_cpu(get_unaligned(&sb->root_block));
30218+}
30219+
30220+static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
30221+{
30222+ return le16_to_cpu(get_unaligned(&sb->tree_height));
30223+}
30224+
30225+static __u64 get_format40_file_count(const format40_disk_super_block * sb)
30226+{
30227+ return le64_to_cpu(get_unaligned(&sb->file_count));
30228+}
30229+
30230+static __u64 get_format40_oid(const format40_disk_super_block * sb)
30231+{
30232+ return le64_to_cpu(get_unaligned(&sb->oid));
30233+}
30234+
30235+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
30236+{
30237+ return le32_to_cpu(get_unaligned(&sb->mkfs_id));
30238+}
30239+
30240+static __u64 get_format40_flags(const format40_disk_super_block * sb)
30241+{
30242+ return le64_to_cpu(get_unaligned(&sb->flags));
30243+}
30244+
30245+static __u32 get_format40_version(const format40_disk_super_block * sb)
30246+{
30247+ return le32_to_cpu(get_unaligned(&sb->version)) &
30248+ ~FORMAT40_UPDATE_BACKUP;
30249+}
30250+
30251+static int update_backup_version(const format40_disk_super_block * sb)
30252+{
30253+ return (le32_to_cpu(get_unaligned(&sb->version)) &
30254+ FORMAT40_UPDATE_BACKUP);
30255+}
30256+
30257+static int update_disk_version(const format40_disk_super_block * sb)
30258+{
30259+ return (get_format40_version(sb) < FORMAT40_VERSION);
30260+}
30261+
30262+static int incomplete_compatibility(const format40_disk_super_block * sb)
30263+{
30264+ return (get_format40_version(sb) > FORMAT40_VERSION);
30265+}
30266+
30267+static format40_super_info *get_sb_info(struct super_block *super)
30268+{
30269+ return &get_super_private(super)->u.format40;
30270+}
30271+
30272+static int consult_diskmap(struct super_block *s)
30273+{
30274+ format40_super_info *info;
30275+ journal_location *jloc;
30276+
30277+ info = get_sb_info(s);
30278+ jloc = &get_super_private(s)->jloc;
30279+ /* Default format-specific locations, if there is nothing in
30280+ * diskmap */
30281+ jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
30282+ jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
30283+ info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
30284+#ifdef CONFIG_REISER4_BADBLOCKS
30285+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
30286+ &jloc->footer);
30287+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
30288+ &jloc->header);
30289+ reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
30290+ &info->loc.super);
30291+#endif
30292+ return 0;
30293+}
30294+
30295+/* find any valid super block of disk_format40 (even if the first
30296+ super block is destroyed), will change block numbers of actual journal header/footer (jf/jh)
30297+ if needed */
30298+static struct buffer_head *find_a_disk_format40_super_block(struct super_block
30299+ *s)
30300+{
30301+ struct buffer_head *super_bh;
30302+ format40_disk_super_block *disk_sb;
30303+ format40_super_info *info;
30304+
30305+ assert("umka-487", s != NULL);
30306+
30307+ info = get_sb_info(s);
30308+
30309+ super_bh = sb_bread(s, info->loc.super);
30310+ if (super_bh == NULL)
30311+ return ERR_PTR(RETERR(-EIO));
30312+
30313+ disk_sb = (format40_disk_super_block *) super_bh->b_data;
30314+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
30315+ brelse(super_bh);
30316+ return ERR_PTR(RETERR(-EINVAL));
30317+ }
30318+
30319+ reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
30320+ reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
30321+ le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30322+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30323+
30324+ return super_bh;
30325+}
30326+
30327+/* find the most recent version of super block. This is called after journal is
30328+ replayed */
30329+static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
30330+{
30331+ /* Here the most recent superblock copy has to be read. However, as
30332+ journal replay isn't complete, we are using
30333+ find_a_disk_format40_super_block() function. */
30334+ return find_a_disk_format40_super_block(s);
30335+}
30336+
30337+static int get_super_jnode(struct super_block *s)
30338+{
30339+ reiser4_super_info_data *sbinfo = get_super_private(s);
30340+ jnode *sb_jnode;
30341+ int ret;
30342+
30343+ sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
30344+
30345+ ret = jload(sb_jnode);
30346+
30347+ if (ret) {
30348+ reiser4_drop_io_head(sb_jnode);
30349+ return ret;
30350+ }
30351+
30352+ pin_jnode_data(sb_jnode);
30353+ jrelse(sb_jnode);
30354+
30355+ sbinfo->u.format40.sb_jnode = sb_jnode;
30356+
30357+ return 0;
30358+}
30359+
30360+static void done_super_jnode(struct super_block *s)
30361+{
30362+ jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30363+
30364+ if (sb_jnode) {
30365+ unpin_jnode_data(sb_jnode);
30366+ reiser4_drop_io_head(sb_jnode);
30367+ }
30368+}
30369+
30370+typedef enum format40_init_stage {
30371+ NONE_DONE = 0,
30372+ CONSULT_DISKMAP,
30373+ FIND_A_SUPER,
30374+ INIT_JOURNAL_INFO,
30375+ INIT_STATUS,
30376+ JOURNAL_REPLAY,
30377+ READ_SUPER,
30378+ KEY_CHECK,
30379+ INIT_OID,
30380+ INIT_TREE,
30381+ JOURNAL_RECOVER,
30382+ INIT_SA,
30383+ INIT_JNODE,
30384+ ALL_DONE
30385+} format40_init_stage;
30386+
30387+static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
30388+{
30389+ format40_disk_super_block *sb_copy;
30390+
30391+ sb_copy = kmalloc(sizeof(format40_disk_super_block),
30392+ reiser4_ctx_gfp_mask_get());
30393+ if (sb_copy == NULL)
30394+ return ERR_PTR(RETERR(-ENOMEM));
30395+ memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
30396+ sizeof(format40_disk_super_block));
30397+ return sb_copy;
30398+}
30399+
30400+static int check_key_format(const format40_disk_super_block *sb_copy)
30401+{
30402+ if (!equi(REISER4_LARGE_KEY,
30403+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
30404+ warning("nikita-3228", "Key format mismatch. "
30405+ "Only %s keys are supported.",
30406+ REISER4_LARGE_KEY ? "large" : "small");
30407+ return RETERR(-EINVAL);
30408+ }
30409+ return 0;
30410+}
30411+
30412+/**
30413+ * try_init_format40
30414+ * @super:
30415+ * @stage:
30416+ *
30417+ */
30418+static int try_init_format40(struct super_block *super,
30419+ format40_init_stage *stage)
30420+{
30421+ int result;
30422+ struct buffer_head *super_bh;
30423+ reiser4_super_info_data *sbinfo;
30424+ format40_disk_super_block *sb_copy;
30425+ tree_level height;
30426+ reiser4_block_nr root_block;
30427+ node_plugin *nplug;
30428+
30429+ assert("vs-475", super != NULL);
30430+ assert("vs-474", get_super_private(super));
30431+
30432+ *stage = NONE_DONE;
30433+
30434+ result = consult_diskmap(super);
30435+ if (result)
30436+ return result;
30437+ *stage = CONSULT_DISKMAP;
30438+
30439+ super_bh = find_a_disk_format40_super_block(super);
30440+ if (IS_ERR(super_bh))
30441+ return PTR_ERR(super_bh);
30442+ brelse(super_bh);
30443+ *stage = FIND_A_SUPER;
30444+
30445+ /* ok, we are sure that filesystem format is a format40 format */
30446+
30447+ /* map jnodes for journal control blocks (header, footer) to disk */
30448+ result = reiser4_init_journal_info(super);
30449+ if (result)
30450+ return result;
30451+ *stage = INIT_JOURNAL_INFO;
30452+
30453+ /* ok, we are sure that filesystem format is a format40 format */
30454+	/* Now check its state */
30455+ result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
30456+ if (result != 0 && result != -EINVAL)
30457+ /* -EINVAL means there is no magic, so probably just old
30458+ * fs. */
30459+ return result;
30460+ *stage = INIT_STATUS;
30461+
30462+ result = reiser4_status_query(NULL, NULL);
30463+ if (result == REISER4_STATUS_MOUNT_WARN)
30464+ notice("vpf-1363", "Warning: mounting %s with errors.",
30465+ super->s_id);
30466+ if (result == REISER4_STATUS_MOUNT_RO)
30467+ notice("vpf-1364", "Warning: mounting %s with fatal errors,"
30468+ " forcing read-only mount.", super->s_id);
30469+ result = reiser4_journal_replay(super);
30470+ if (result)
30471+ return result;
30472+ *stage = JOURNAL_REPLAY;
30473+
30474+ super_bh = read_super_block(super);
30475+ if (IS_ERR(super_bh))
30476+ return PTR_ERR(super_bh);
30477+ *stage = READ_SUPER;
30478+
30479+ /* allocate and make a copy of format40_disk_super_block */
30480+ sb_copy = copy_sb(super_bh);
30481+ brelse(super_bh);
30482+
30483+ if (IS_ERR(sb_copy))
30484+ return PTR_ERR(sb_copy);
30485+ printk("reiser4: %s: found disk format 4.0.%u.\n",
30486+ super->s_id,
30487+ get_format40_version(sb_copy));
30488+ if (incomplete_compatibility(sb_copy))
30489+ printk("reiser4: Warning: The last completely supported "
30490+ "version of disk format40 is %u. Some objects of "
30491+ "the semantic tree can be unaccessible.\n",
30492+ FORMAT40_VERSION);
30493+ /* make sure that key format of kernel and filesystem match */
30494+ result = check_key_format(sb_copy);
30495+ if (result) {
30496+ kfree(sb_copy);
30497+ return result;
30498+ }
30499+ *stage = KEY_CHECK;
30500+
30501+ result = oid_init_allocator(super, get_format40_file_count(sb_copy),
30502+ get_format40_oid(sb_copy));
30503+ if (result) {
30504+ kfree(sb_copy);
30505+ return result;
30506+ }
30507+ *stage = INIT_OID;
30508+
30509+ /* get things necessary to init reiser4_tree */
30510+ root_block = get_format40_root_block(sb_copy);
30511+ height = get_format40_tree_height(sb_copy);
30512+ nplug = node_plugin_by_id(NODE40_ID);
30513+
30514+ /* initialize reiser4_super_info_data */
30515+ sbinfo = get_super_private(super);
30516+ assert("", sbinfo->tree.super == super);
30517+ /* init reiser4_tree for the filesystem */
30518+ result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
30519+ if (result) {
30520+ kfree(sb_copy);
30521+ return result;
30522+ }
30523+ *stage = INIT_TREE;
30524+
30525+ /*
30526+ * initialize reiser4_super_info_data with data from format40 super
30527+ * block
30528+ */
30529+ sbinfo->default_uid = 0;
30530+ sbinfo->default_gid = 0;
30531+ sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
30532+ /* number of blocks in filesystem and reserved space */
30533+ reiser4_set_block_count(super, get_format40_block_count(sb_copy));
30534+ sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
30535+ sbinfo->version = get_format40_version(sb_copy);
30536+	if (update_backup_version(sb_copy))
30537+		printk("reiser4: Warning: metadata backup is not updated. "
30538+		       "Please run 'fsck.reiser4 --fix' on %s.\n",
30539+		       super->s_id);
30540+	/* free the super block copy only after its last use above
30541+	   (reading flags via update_backup_version) to avoid use-after-free */
30542+
30543+ sbinfo->fsuid = 0;
30544+ sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
30545+ * are not supported */
30546+ sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
30547+ * layout 40 are
30548+ * of one
30549+ * plugin */
30550+ /* sbinfo->tmgr is initialized already */
30551+
30552+ /* recover sb data which were logged separately from sb block */
30553+
30554+ /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
30555+ * oid_init_allocator() and reiser4_set_free_blocks() with new
30556+ * data. What's the reason to call them above? */
30557+ result = reiser4_journal_recover_sb_data(super);
30558+ if (result != 0)
30559+ return result;
30560+ *stage = JOURNAL_RECOVER;
30561+
30562+ /*
30563+ * Set number of used blocks. The number of used blocks is not stored
30564+ * neither in on-disk super block nor in the journal footer blocks. At
30565+ * this moment actual values of total blocks and free block counters
30566+ * are set in the reiser4 super block (in-memory structure) and we can
30567+ * calculate number of used blocks from them.
30568+ */
30569+ reiser4_set_data_blocks(super,
30570+ reiser4_block_count(super) -
30571+ reiser4_free_blocks(super));
30572+
30573+#if REISER4_DEBUG
30574+ sbinfo->min_blocks_used = 16 /* reserved area */ +
30575+ 2 /* super blocks */ +
30576+ 2 /* journal footer and header */ ;
30577+#endif
30578+
30579+ /* init disk space allocator */
30580+ result = sa_init_allocator(reiser4_get_space_allocator(super),
30581+ super, NULL);
30582+ if (result)
30583+ return result;
30584+ *stage = INIT_SA;
30585+
30586+ result = get_super_jnode(super);
30587+ if (result == 0)
30588+ *stage = ALL_DONE;
30589+ return result;
30590+}
30591+
30592+/* plugin->u.format.get_ready */
30593+int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30594+{
30595+ int result;
30596+ format40_init_stage stage;
30597+
30598+ result = try_init_format40(s, &stage);
30599+ switch (stage) {
30600+ case ALL_DONE:
30601+ assert("nikita-3458", result == 0);
30602+ break;
30603+ case INIT_JNODE:
30604+ done_super_jnode(s);
30605+ case INIT_SA:
30606+ sa_destroy_allocator(reiser4_get_space_allocator(s), s);
30607+ case JOURNAL_RECOVER:
30608+ case INIT_TREE:
30609+ reiser4_done_tree(&get_super_private(s)->tree);
30610+ case INIT_OID:
30611+ case KEY_CHECK:
30612+ case READ_SUPER:
30613+ case JOURNAL_REPLAY:
30614+ case INIT_STATUS:
30615+ reiser4_status_finish();
30616+ case INIT_JOURNAL_INFO:
30617+ reiser4_done_journal_info(s);
30618+ case FIND_A_SUPER:
30619+ case CONSULT_DISKMAP:
30620+ case NONE_DONE:
30621+ break;
30622+ default:
30623+ impossible("nikita-3457", "init stage: %i", stage);
30624+ }
30625+
30626+ if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30627+ return RETERR(-ENOSPC);
30628+
30629+ return result;
30630+}
30631+
30632+static void pack_format40_super(const struct super_block *s, char *data)
30633+{
30634+ format40_disk_super_block *super_data =
30635+ (format40_disk_super_block *) data;
30636+
30637+ reiser4_super_info_data *sbinfo = get_super_private(s);
30638+
30639+ assert("zam-591", data != NULL);
30640+
30641+ put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30642+ &super_data->free_blocks);
30643+
30644+ put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
30645+ &super_data->root_block);
30646+
30647+ put_unaligned(cpu_to_le64(oid_next(s)),
30648+ &super_data->oid);
30649+
30650+ put_unaligned(cpu_to_le64(oids_used(s)),
30651+ &super_data->file_count);
30652+
30653+ put_unaligned(cpu_to_le16(sbinfo->tree.height),
30654+ &super_data->tree_height);
30655+
30656+ if (update_disk_version(super_data)) {
30657+ __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
30658+
30659+ put_unaligned(cpu_to_le32(version), &super_data->version);
30660+ }
30661+}
30662+
30663+/* plugin->u.format.log_super
30664+ return a jnode which should be added to transaction when the super block
30665+ gets logged */
30666+jnode *log_super_format40(struct super_block *s)
30667+{
30668+ jnode *sb_jnode;
30669+
30670+ sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30671+
30672+ jload(sb_jnode);
30673+
30674+ pack_format40_super(s, jdata(sb_jnode));
30675+
30676+ jrelse(sb_jnode);
30677+
30678+ return sb_jnode;
30679+}
30680+
30681+/* plugin->u.format.release */
30682+int release_format40(struct super_block *s)
30683+{
30684+ int ret;
30685+ reiser4_super_info_data *sbinfo;
30686+
30687+ sbinfo = get_super_private(s);
30688+ assert("zam-579", sbinfo != NULL);
30689+
30690+ if (!rofs_super(s)) {
30691+ ret = reiser4_capture_super_block(s);
30692+ if (ret != 0)
30693+ warning("vs-898",
30694+ "reiser4_capture_super_block failed: %d",
30695+ ret);
30696+
30697+ ret = txnmgr_force_commit_all(s, 1);
30698+ if (ret != 0)
30699+ warning("jmacd-74438", "txn_force failed: %d", ret);
30700+
30701+ all_grabbed2free();
30702+ }
30703+
30704+ sa_destroy_allocator(&sbinfo->space_allocator, s);
30705+ reiser4_done_journal_info(s);
30706+ done_super_jnode(s);
30707+
30708+ rcu_barrier();
30709+ reiser4_done_tree(&sbinfo->tree);
30710+ /* call finish_rcu(), because some znode were "released" in
30711+ * reiser4_done_tree(). */
30712+ rcu_barrier();
30713+
30714+ return 0;
30715+}
30716+
30717+#define FORMAT40_ROOT_LOCALITY 41
30718+#define FORMAT40_ROOT_OBJECTID 42
30719+
30720+/* plugin->u.format.root_dir_key */
30721+const reiser4_key *root_dir_key_format40(const struct super_block *super
30722+ UNUSED_ARG)
30723+{
30724+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30725+ .el = {
30726+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30727+#if REISER4_LARGE_KEY
30728+ ON_LARGE_KEY(0ull,)
30729+#endif
30730+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30731+ 0ull
30732+ }
30733+ };
30734+
30735+ return &FORMAT40_ROOT_DIR_KEY;
30736+}
30737+
30738+/* plugin->u.format.check_open.
30739+ Check the opened object for validness. For now it checks for the valid oid &
30740+   locality only, can be improved later and its work may depend on the mount
30741+ options. */
30742+int check_open_format40(const struct inode *object)
30743+{
30744+ oid_t max, oid;
30745+
30746+ max = oid_next(object->i_sb) - 1;
30747+
30748+ /* Check the oid. */
30749+ oid = get_inode_oid(object);
30750+ if (oid > max) {
30751+ warning("vpf-1360", "The object with the oid %llu "
30752+ "greater then the max used oid %llu found.",
30753+ (unsigned long long)oid, (unsigned long long)max);
30754+
30755+ return RETERR(-EIO);
30756+ }
30757+
30758+ /* Check the locality. */
30759+ oid = reiser4_inode_data(object)->locality_id;
30760+ if (oid > max) {
30761+ warning("vpf-1361", "The object with the locality %llu "
30762+ "greater then the max used oid %llu found.",
30763+ (unsigned long long)oid, (unsigned long long)max);
30764+
30765+ return RETERR(-EIO);
30766+ }
30767+
30768+ return 0;
30769+}
30770+
30771+/* plugin->u.format.version_update.
30772+ Perform all version update operations from the on-disk
30773+ format40_disk_super_block.version on disk to FORMAT40_VERSION.
30774+ */
30775+int version_update_format40(struct super_block *super) {
30776+ txn_handle * trans;
30777+ lock_handle lh;
30778+ txn_atom *atom;
30779+ int ret;
30780+
30781+ /* Nothing to do if RO mount or the on-disk version is not less. */
30782+ if (super->s_flags & MS_RDONLY)
30783+ return 0;
30784+
30785+ if (get_super_private(super)->version >= FORMAT40_VERSION)
30786+ return 0;
30787+
30788+ printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
30789+ "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
30790+ "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
30791+
30792+ /* Mark the uber znode dirty to call log_super on write_logs. */
30793+ init_lh(&lh);
30794+ ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
30795+ ZNODE_LOCK_HIPRI, &lh);
30796+ if (ret != 0)
30797+ return ret;
30798+
30799+ znode_make_dirty(lh.node);
30800+ done_lh(&lh);
30801+
30802+ /* Update the backup blocks. */
30803+
30804+ /* Force write_logs immediately. */
30805+ trans = get_current_context()->trans;
30806+ atom = get_current_atom_locked();
30807+ assert("vpf-1906", atom != NULL);
30808+
30809+ spin_lock_txnh(trans);
30810+ return force_commit_atom(trans);
30811+}
30812+
30813+/* Make Linus happy.
30814+ Local variables:
30815+ c-indentation-style: "K&R"
30816+ mode-name: "LC"
30817+ c-basic-offset: 8
30818+ tab-width: 8
30819+ fill-column: 120
30820+ scroll-step: 1
30821+ End:
30822+*/
30823diff --git a/fs/reiser4/plugin/disk_format/disk_format40.h b/fs/reiser4/plugin/disk_format/disk_format40.h
30824new file mode 100644
30825index 0000000..7fc1772
30826--- /dev/null
30827+++ b/fs/reiser4/plugin/disk_format/disk_format40.h
30828@@ -0,0 +1,109 @@
30829+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30830+
30831+/* this file contains:
30832+   - definition of ondisk super block of standard disk layout for
30833+ reiser 4.0 (layout 40)
30834+ - definition of layout 40 specific portion of in-core super block
30835+ - declarations of functions implementing methods of layout plugin
30836+ for layout 40
30837+ - declarations of functions used to get/set fields in layout 40 super block
30838+*/
30839+
30840+#ifndef __DISK_FORMAT40_H__
30841+#define __DISK_FORMAT40_H__
30842+
30843+/* magic for default reiser4 layout */
30844+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30845+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30846+
30847+#include "../../dformat.h"
30848+
30849+#include <linux/fs.h> /* for struct super_block */
30850+
30851+typedef enum {
30852+ FORMAT40_LARGE_KEYS
30853+} format40_flags;
30854+
30855+/* ondisk super block for format 40. It is 512 bytes long */
30856+typedef struct format40_disk_super_block {
30857+ /* 0 */ d64 block_count;
30858+ /* number of block in a filesystem */
30859+ /* 8 */ d64 free_blocks;
30860+ /* number of free blocks */
30861+ /* 16 */ d64 root_block;
30862+ /* filesystem tree root block */
30863+ /* 24 */ d64 oid;
30864+ /* smallest free objectid */
30865+ /* 32 */ d64 file_count;
30866+ /* number of files in a filesystem */
30867+ /* 40 */ d64 flushes;
30868+ /* number of times super block was
30869+ flushed. Needed if format 40
30870+	   will have a few super blocks */
30871+ /* 48 */ d32 mkfs_id;
30872+ /* unique identifier of fs */
30873+ /* 52 */ char magic[16];
30874+ /* magic string ReIsEr40FoRmAt */
30875+ /* 68 */ d16 tree_height;
30876+ /* height of filesystem tree */
30877+ /* 70 */ d16 formatting_policy;
30878+ /* not used anymore */
30879+ /* 72 */ d64 flags;
30880+ /* 80 */ d32 version;
30881+ /* on-disk format version number
30882+ initially assigned by mkfs as the greatest format40
30883+ version number supported by reiser4progs and updated
30884+ in mount time in accordance with the greatest format40
30885+ version number supported by kernel.
30886+ Is used by fsck to catch possible corruption and
30887+ for various compatibility issues */
30888+ /* 84 */ char not_used[428];
30889+} format40_disk_super_block;
30890+
30891+/* format 40 specific part of reiser4_super_info_data */
30892+typedef struct format40_super_info {
30893+/* format40_disk_super_block actual_sb; */
30894+ jnode *sb_jnode;
30895+ struct {
30896+ reiser4_block_nr super;
30897+ } loc;
30898+} format40_super_info;
30899+
30900+/* Defines for journal header and footer respectively. */
30901+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
30902+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
30903+
30904+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
30905+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
30906+
30907+#define FORMAT40_STATUS_BLOCKNR \
30908+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
30909+
30910+/* Diskmap declarations */
30911+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
30912+#define FORMAT40_SUPER 1
30913+#define FORMAT40_JH 2
30914+#define FORMAT40_JF 3
30915+
30916+/* declarations of functions implementing methods of layout plugin for
30917+   format 40. The functions themselves are in disk_format40.c */
30918+extern int init_format_format40(struct super_block *, void *data);
30919+extern const reiser4_key *root_dir_key_format40(const struct super_block *);
30920+extern int release_format40(struct super_block *s);
30921+extern jnode *log_super_format40(struct super_block *s);
30922+extern int check_open_format40(const struct inode *object);
30923+extern int version_update_format40(struct super_block *super);
30924+
30925+/* __DISK_FORMAT40_H__ */
30926+#endif
30927+
30928+/* Make Linus happy.
30929+ Local variables:
30930+ c-indentation-style: "K&R"
30931+ mode-name: "LC"
30932+ c-basic-offset: 8
30933+ tab-width: 8
30934+ fill-column: 120
30935+ scroll-step: 1
30936+ End:
30937+*/
30938diff --git a/fs/reiser4/plugin/fibration.c b/fs/reiser4/plugin/fibration.c
30939new file mode 100644
30940index 0000000..690dac4
30941--- /dev/null
30942+++ b/fs/reiser4/plugin/fibration.c
30943@@ -0,0 +1,175 @@
30944+/* Copyright 2004 by Hans Reiser, licensing governed by
30945+ * reiser4/README */
30946+
30947+/* Directory fibrations */
30948+
30949+/*
30950+ * Suppose we have a directory tree with sources of some project. During
30951+ * compilation .o files are created within this tree. This makes access
30952+ * to the original source files less efficient, because source files are
30953+ * now "diluted" by object files: default directory plugin uses prefix
30954+ * of a file name as a part of the key for directory entry (and this
30955+ * part is also inherited by the key of file body). This means that
30956+ * foo.o will be located close to foo.c and foo.h in the tree.
30957+ *
30958+ * To avoid this effect directory plugin fill highest 7 (unused
30959+ * originally) bits of the second component of the directory entry key
30960+ * by bit-pattern depending on the file name (see
30961+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
30962+ * "fibre". Fibre of the file name key is inherited by key of stat data
30963+ * and keys of file body (in the case of REISER4_LARGE_KEY).
30964+ *
30965+ * Fibre for a given file is chosen by per-directory fibration
30966+ * plugin. Names within given fibre are ordered lexicographically.
30967+ */
30968+
30969+#include "../debug.h"
30970+#include "plugin_header.h"
30971+#include "plugin.h"
30972+#include "../super.h"
30973+#include "../inode.h"
30974+
30975+#include <linux/types.h>
30976+
30977+static const int fibre_shift = 57;
30978+
30979+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
30980+
30981+/*
30982+ * Trivial fibration: all files of directory are just ordered
30983+ * lexicographically.
30984+ */
30985+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
30986+{
30987+ return FIBRE_NO(0);
30988+}
30989+
30990+/*
30991+ * dot-o fibration: place .o files after all others.
30992+ */
30993+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
30994+{
30995+ /* special treatment for .*\.o */
30996+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
30997+ return FIBRE_NO(1);
30998+ else
30999+ return FIBRE_NO(0);
31000+}
31001+
31002+/*
31003+ * ext.1 fibration: subdivide directory into 128 fibrations one for each
31004+ * 7bit extension character (file "foo.h" goes into fibre "h"), plus
31005+ * default fibre for the rest.
31006+ */
31007+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
31008+{
31009+ if (len > 2 && name[len - 2] == '.')
31010+ return FIBRE_NO(name[len - 1]);
31011+ else
31012+ return FIBRE_NO(0);
31013+}
31014+
31015+/*
31016+ * ext.3 fibration: try to separate files with different 3-character
31017+ * extensions from each other.
31018+ */
31019+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
31020+{
31021+ if (len > 4 && name[len - 4] == '.')
31022+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
31023+ else
31024+ return FIBRE_NO(0);
31025+}
31026+
31027+static int change_fibration(struct inode *inode,
31028+ reiser4_plugin * plugin,
31029+ pset_member memb)
31030+{
31031+ int result;
31032+
31033+ assert("nikita-3503", inode != NULL);
31034+ assert("nikita-3504", plugin != NULL);
31035+
31036+ assert("nikita-3505", is_reiser4_inode(inode));
31037+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
31038+ assert("nikita-3507",
31039+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
31040+
31041+ result = 0;
31042+ if (inode_fibration_plugin(inode) == NULL ||
31043+ inode_fibration_plugin(inode)->h.id != plugin->h.id) {
31044+ if (is_dir_empty(inode) == 0)
31045+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
31046+ PSET_FIBRATION, plugin);
31047+ else
31048+ result = RETERR(-ENOTEMPTY);
31049+
31050+ }
31051+ return result;
31052+}
31053+
31054+static reiser4_plugin_ops fibration_plugin_ops = {
31055+ .init = NULL,
31056+ .load = NULL,
31057+ .save_len = NULL,
31058+ .save = NULL,
31059+ .change = change_fibration
31060+};
31061+
31062+/* fibration plugins */
31063+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
31064+ [FIBRATION_LEXICOGRAPHIC] = {
31065+ .h = {
31066+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31067+ .id = FIBRATION_LEXICOGRAPHIC,
31068+ .pops = &fibration_plugin_ops,
31069+ .label = "lexicographic",
31070+ .desc = "no fibration",
31071+ .linkage = {NULL, NULL}
31072+ },
31073+ .fibre = fibre_trivial
31074+ },
31075+ [FIBRATION_DOT_O] = {
31076+ .h = {
31077+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31078+ .id = FIBRATION_DOT_O,
31079+ .pops = &fibration_plugin_ops,
31080+ .label = "dot-o",
31081+ .desc = "fibrate .o files separately",
31082+ .linkage = {NULL, NULL}
31083+ },
31084+ .fibre = fibre_dot_o
31085+ },
31086+ [FIBRATION_EXT_1] = {
31087+ .h = {
31088+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31089+ .id = FIBRATION_EXT_1,
31090+ .pops = &fibration_plugin_ops,
31091+ .label = "ext-1",
31092+ .desc = "fibrate file by single character extension",
31093+ .linkage = {NULL, NULL}
31094+ },
31095+ .fibre = fibre_ext_1
31096+ },
31097+ [FIBRATION_EXT_3] = {
31098+ .h = {
31099+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31100+ .id = FIBRATION_EXT_3,
31101+ .pops = &fibration_plugin_ops,
31102+ .label = "ext-3",
31103+ .desc = "fibrate file by three character extension",
31104+ .linkage = {NULL, NULL}
31105+ },
31106+ .fibre = fibre_ext_3
31107+ }
31108+};
31109+
31110+/*
31111+ * Local variables:
31112+ * c-indentation-style: "K&R"
31113+ * mode-name: "LC"
31114+ * c-basic-offset: 8
31115+ * tab-width: 8
31116+ * fill-column: 79
31117+ * End:
31118+ */
31119diff --git a/fs/reiser4/plugin/fibration.h b/fs/reiser4/plugin/fibration.h
31120new file mode 100644
31121index 0000000..0723cad
31122--- /dev/null
31123+++ b/fs/reiser4/plugin/fibration.h
31124@@ -0,0 +1,37 @@
31125+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
31126+
31127+/* Fibration plugin used by hashed directory plugin to segment content
31128+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
31129+
31130+#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
31131+#define __FS_REISER4_PLUGIN_FIBRATION_H__
31132+
31133+#include "plugin_header.h"
31134+
31135+typedef struct fibration_plugin {
31136+ /* generic fields */
31137+ plugin_header h;
31138+
31139+ __u64(*fibre) (const struct inode * dir, const char *name, int len);
31140+} fibration_plugin;
31141+
31142+typedef enum {
31143+ FIBRATION_LEXICOGRAPHIC,
31144+ FIBRATION_DOT_O,
31145+ FIBRATION_EXT_1,
31146+ FIBRATION_EXT_3,
31147+ LAST_FIBRATION_ID
31148+} reiser4_fibration_id;
31149+
31150+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
31151+#endif
31152+
31153+/* Make Linus happy.
31154+ Local variables:
31155+ c-indentation-style: "K&R"
31156+ mode-name: "LC"
31157+ c-basic-offset: 8
31158+ tab-width: 8
31159+ fill-column: 120
31160+ End:
31161+*/
31162diff --git a/fs/reiser4/plugin/file/Makefile b/fs/reiser4/plugin/file/Makefile
31163new file mode 100644
31164index 0000000..134fa7a
31165--- /dev/null
31166+++ b/fs/reiser4/plugin/file/Makefile
31167@@ -0,0 +1,7 @@
31168+obj-$(CONFIG_REISER4_FS) += file_plugins.o
31169+
31170+file_plugins-objs := \
31171+ file.o \
31172+ tail_conversion.o \
31173+ symlink.o \
31174+ cryptcompress.o
31175diff --git a/fs/reiser4/plugin/file/cryptcompress.c b/fs/reiser4/plugin/file/cryptcompress.c
31176new file mode 100644
31177index 0000000..2876e31
31178--- /dev/null
31179+++ b/fs/reiser4/plugin/file/cryptcompress.c
31180@@ -0,0 +1,3760 @@
31181+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
31182+ reiser4/README */
31183+
31184+/* This file contains implementations of inode/file/address_space/file plugin
31185+ * operations specific for cryptcompress file plugin which manages files with
31186+ * compressed and encrypted bodies. "Cryptcompress file" is built of items of
31187+ * CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html for details).
31188+ */
31189+
31190+#include "../../inode.h"
31191+#include "../cluster.h"
31192+#include "../object.h"
31193+#include "../../tree_walk.h"
31194+#include "cryptcompress.h"
31195+
31196+#include <asm/scatterlist.h>
31197+#include <linux/pagevec.h>
31198+#include <asm/uaccess.h>
31199+#include <linux/swap.h>
31200+#include <linux/writeback.h>
31201+#include <linux/random.h>
31202+
31203+/* get cryptcompress specific portion of inode */
31204+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
31205+{
31206+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
31207+}
31208+
31209+/* plugin->u.file.init_inode_data */
31210+void init_inode_data_cryptcompress(struct inode *inode,
31211+ reiser4_object_create_data * crd,
31212+ int create)
31213+{
31214+ cryptcompress_info_t *data;
31215+
31216+ data = cryptcompress_inode_data(inode);
31217+ assert("edward-685", data != NULL);
31218+
31219+ memset(data, 0, sizeof(*data));
31220+
31221+ turn_on_compression(data);
31222+ set_lattice_factor(data, MIN_LATTICE_FACTOR);
31223+ init_inode_ordering(inode, crd, create);
31224+}
31225+
31226+#if REISER4_DEBUG
31227+int cryptcompress_inode_ok(struct inode *inode)
31228+{
31229+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
31230+ return 0;
31231+ if (!cluster_shift_ok(inode_cluster_shift(inode)))
31232+ return 0;
31233+ return 1;
31234+}
31235+#endif
31236+
31237+/* The following is a part of reiser4 cipher key manager
31238+ which is called when opening/creating a cryptcompress file */
31239+
31240+/* get/set cipher key info */
31241+crypto_stat_t * inode_crypto_stat (struct inode * inode)
31242+{
31243+ assert("edward-90", inode != NULL);
31244+ assert("edward-91", reiser4_inode_data(inode) != NULL);
31245+ return cryptcompress_inode_data(inode)->crypt;
31246+}
31247+
31248+static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
31249+{
31250+ cryptcompress_inode_data(inode)->crypt = stat;
31251+}
31252+
31253+/* allocate a cipher key info */
31254+crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode)
31255+{
31256+ crypto_stat_t * info;
31257+ int fipsize;
31258+
31259+ info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
31260+ if (!info)
31261+ return ERR_PTR(-ENOMEM);
31262+ memset(info, 0, sizeof (*info));
31263+ fipsize = inode_digest_plugin(inode)->fipsize;
31264+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
31265+ if (!info->keyid) {
31266+ kfree(info);
31267+ return ERR_PTR(-ENOMEM);
31268+ }
31269+ info->host = inode;
31270+ return info;
31271+}
31272+
31273+#if 0
31274+/* allocate/free low-level info for cipher and digest
31275+ transforms */
31276+static int alloc_crypto_tfms(crypto_stat_t * info)
31277+{
31278+ struct crypto_blkcipher * ctfm = NULL;
31279+ struct crypto_hash * dtfm = NULL;
31280+ cipher_plugin * cplug = inode_cipher_plugin(info->host);
31281+ digest_plugin * dplug = inode_digest_plugin(info->host);
31282+
31283+ if (cplug->alloc) {
31284+ ctfm = cplug->alloc();
31285+ if (IS_ERR(ctfm)) {
31286+ warning("edward-1364",
31287+ "Can not allocate info for %s\n",
31288+ cplug->h.desc);
31289+ return RETERR(PTR_ERR(ctfm));
31290+ }
31291+ }
31292+ info_set_cipher(info, ctfm);
31293+ if (dplug->alloc) {
31294+ dtfm = dplug->alloc();
31295+ if (IS_ERR(dtfm)) {
31296+ warning("edward-1365",
31297+ "Can not allocate info for %s\n",
31298+ dplug->h.desc);
31299+ goto unhappy_with_digest;
31300+ }
31301+ }
31302+ info_set_digest(info, dtfm);
31303+ return 0;
31304+ unhappy_with_digest:
31305+ if (cplug->free) {
31306+ cplug->free(ctfm);
31307+ info_set_cipher(info, NULL);
31308+ }
31309+ return RETERR(PTR_ERR(dtfm));
31310+}
31311+#endif
31312+
31313+static void
31314+free_crypto_tfms(crypto_stat_t * info)
31315+{
31316+ assert("edward-1366", info != NULL);
31317+ if (!info_get_cipher(info)) {
31318+ assert("edward-1601", !info_get_digest(info));
31319+ return;
31320+ }
31321+ inode_cipher_plugin(info->host)->free(info_get_cipher(info));
31322+ info_set_cipher(info, NULL);
31323+ inode_digest_plugin(info->host)->free(info_get_digest(info));
31324+ info_set_digest(info, NULL);
31325+ return;
31326+}
31327+
31328+#if 0
31329+/* create a key fingerprint for disk stat-data */
31330+static int create_keyid (crypto_stat_t * info, crypto_data_t * data)
31331+{
31332+ int ret = -ENOMEM;
31333+ size_t blk, pad;
31334+ __u8 * dmem;
31335+ __u8 * cmem;
31336+ struct hash_desc ddesc;
31337+ struct blkcipher_desc cdesc;
31338+ struct scatterlist sg;
31339+
31340+ assert("edward-1367", info != NULL);
31341+ assert("edward-1368", info->keyid != NULL);
31342+
31343+ ddesc.tfm = info_get_digest(info);
31344+ ddesc.flags = 0;
31345+ cdesc.tfm = info_get_cipher(info);
31346+ cdesc.flags = 0;
31347+
31348+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
31349+ reiser4_ctx_gfp_mask_get());
31350+ if (!dmem)
31351+ goto exit1;
31352+
31353+ blk = crypto_blkcipher_blocksize(cdesc.tfm);
31354+
31355+ pad = data->keyid_size % blk;
31356+ pad = (pad ? blk - pad : 0);
31357+
31358+ cmem = kmalloc((size_t)data->keyid_size + pad,
31359+ reiser4_ctx_gfp_mask_get());
31360+ if (!cmem)
31361+ goto exit2;
31362+ memcpy(cmem, data->keyid, data->keyid_size);
31363+ memset(cmem + data->keyid_size, 0, pad);
31364+
31365+ sg.page = virt_to_page(cmem);
31366+ sg.offset = offset_in_page(cmem);
31367+ sg.length = data->keyid_size + pad;
31368+
31369+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
31370+ data->keyid_size + pad);
31371+ if (ret) {
31372+ warning("edward-1369",
31373+ "encryption failed flags=%x\n", cdesc.flags);
31374+ goto exit3;
31375+ }
31376+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
31377+ if (ret) {
31378+ warning("edward-1602",
31379+ "digest failed flags=%x\n", ddesc.flags);
31380+ goto exit3;
31381+ }
31382+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
31383+ exit3:
31384+ kfree(cmem);
31385+ exit2:
31386+ kfree(dmem);
31387+ exit1:
31388+ return ret;
31389+}
31390+#endif
31391+
31392+static void destroy_keyid(crypto_stat_t * info)
31393+{
31394+ assert("edward-1370", info != NULL);
31395+ assert("edward-1371", info->keyid != NULL);
31396+ kfree(info->keyid);
31397+ return;
31398+}
31399+
31400+static void __free_crypto_stat (struct inode * inode)
31401+{
31402+ crypto_stat_t * info = inode_crypto_stat(inode);
31403+ assert("edward-1372", info != NULL);
31404+
31405+ free_crypto_tfms(info);
31406+ destroy_keyid(info);
31407+ kfree(info);
31408+}
31409+
31410+#if 0
31411+static void instantiate_crypto_stat(crypto_stat_t * info)
31412+{
31413+ assert("edward-1373", info != NULL);
31414+ assert("edward-1374", info->inst == 0);
31415+ info->inst = 1;
31416+}
31417+#endif
31418+
31419+static void uninstantiate_crypto_stat(crypto_stat_t * info)
31420+{
31421+ assert("edward-1375", info != NULL);
31422+ info->inst = 0;
31423+}
31424+
31425+static int crypto_stat_instantiated(crypto_stat_t * info)
31426+{
31427+ return info->inst;
31428+}
31429+
31430+static int inode_has_cipher_key(struct inode * inode)
31431+{
31432+ assert("edward-1376", inode != NULL);
31433+ return inode_crypto_stat(inode) &&
31434+ crypto_stat_instantiated(inode_crypto_stat(inode));
31435+}
31436+
31437+static void free_crypto_stat (struct inode * inode)
31438+{
31439+ uninstantiate_crypto_stat(inode_crypto_stat(inode));
31440+ __free_crypto_stat(inode);
31441+}
31442+
31443+static int need_cipher(struct inode * inode)
31444+{
31445+ return inode_cipher_plugin(inode) !=
31446+ cipher_plugin_by_id(NONE_CIPHER_ID);
31447+}
31448+
31449+/* Create a crypto-stat and attach result to the @object.
31450+ If success is returned, then low-level cipher info contains
31451+ an instantiated key */
31452+#if 0
31453+crypto_stat_t *
31454+create_crypto_stat(struct inode * object,
31455+ crypto_data_t * data /* this contains a (uninstantiated)
31456+ cipher key imported from user
31457+ space */)
31458+{
31459+ int ret;
31460+ crypto_stat_t * info;
31461+
31462+ assert("edward-1377", data != NULL);
31463+ assert("edward-1378", need_cipher(object));
31464+
31465+ if (inode_file_plugin(object) !=
31466+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
31467+ return ERR_PTR(-EINVAL);
31468+
31469+ info = reiser4_alloc_crypto_stat(object);
31470+ if (IS_ERR(info))
31471+ return info;
31472+ ret = alloc_crypto_tfms(info);
31473+ if (ret)
31474+ goto err;
31475+ /* instantiating a key */
31476+ ret = crypto_blkcipher_setkey(info_get_cipher(info),
31477+ data->key,
31478+ data->keysize);
31479+ if (ret) {
31480+ warning("edward-1379",
31481+ "setkey failed flags=%x\n",
31482+ crypto_blkcipher_get_flags(info_get_cipher(info)));
31483+ goto err;
31484+ }
31485+ info->keysize = data->keysize;
31486+ ret = create_keyid(info, data);
31487+ if (ret)
31488+ goto err;
31489+ instantiate_crypto_stat(info);
31490+ return info;
31491+ err:
31492+ __free_crypto_stat(object);
31493+ return ERR_PTR(ret);
31494+}
31495+#endif
31496+
31497+/* increment/decrement a load counter when
31498+ attaching/detaching the crypto-stat to any object */
31499+static void load_crypto_stat(crypto_stat_t * info)
31500+{
31501+ assert("edward-1380", info != NULL);
31502+ inc_keyload_count(info);
31503+}
31504+
31505+static void unload_crypto_stat(struct inode * inode)
31506+{
31507+ crypto_stat_t * info = inode_crypto_stat(inode);
31508+ assert("edward-1381", info->keyload_count > 0);
31509+
31510+ dec_keyload_count(inode_crypto_stat(inode));
31511+ if (info->keyload_count == 0)
31512+ /* final release */
31513+ free_crypto_stat(inode);
31514+}
31515+
31516+/* attach/detach an existing crypto-stat */
31517+void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info)
31518+{
31519+ assert("edward-1382", inode != NULL);
31520+ assert("edward-1383", info != NULL);
31521+ assert("edward-1384", inode_crypto_stat(inode) == NULL);
31522+
31523+ set_inode_crypto_stat(inode, info);
31524+ load_crypto_stat(info);
31525+}
31526+
31527+/* returns true, if crypto stat can be attached to the @host */
31528+#if REISER4_DEBUG
31529+static int host_allows_crypto_stat(struct inode * host)
31530+{
31531+ int ret;
31532+ file_plugin * fplug = inode_file_plugin(host);
31533+
31534+ switch (fplug->h.id) {
31535+ case CRYPTCOMPRESS_FILE_PLUGIN_ID:
31536+ ret = 1;
31537+ break;
31538+ default:
31539+ ret = 0;
31540+ }
31541+ return ret;
31542+}
31543+#endif /* REISER4_DEBUG */
31544+
31545+static void reiser4_detach_crypto_stat(struct inode * inode)
31546+{
31547+ assert("edward-1385", inode != NULL);
31548+ assert("edward-1386", host_allows_crypto_stat(inode));
31549+
31550+ if (inode_crypto_stat(inode))
31551+ unload_crypto_stat(inode);
31552+ set_inode_crypto_stat(inode, NULL);
31553+}
31554+
31555+#if 0
31556+
31557+/* compare fingerprints of @child and @parent */
31558+static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent)
31559+{
31560+ return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize);
31561+}
31562+
31563+/* check if a crypto-stat (which is bound to @parent) can be inherited */
31564+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
31565+{
31566+ if (!need_cipher(child))
31567+ return 0;
31568+ /* the child is created */
31569+ if (!inode_crypto_stat(child))
31570+ return 1;
31571+ /* the child is looked up */
31572+ if (!inode_crypto_stat(parent))
31573+ return 0;
31574+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31575+ inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31576+ inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize &&
31577+ keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent)));
31578+}
31579+#endif
31580+
31581+/* helper functions for ->create() method of the cryptcompress plugin */
31582+static int inode_set_crypto(struct inode * object)
31583+{
31584+ reiser4_inode * info;
31585+ if (!inode_crypto_stat(object)) {
31586+ if (need_cipher(object))
31587+ return RETERR(-EINVAL);
31588+ /* the file is not to be encrypted */
31589+ return 0;
31590+ }
31591+ info = reiser4_inode_data(object);
31592+ info->extmask |= (1 << CRYPTO_STAT);
31593+ return 0;
31594+}
31595+
31596+static int inode_init_compression(struct inode * object)
31597+{
31598+ int result = 0;
31599+ assert("edward-1461", object != NULL);
31600+ if (inode_compression_plugin(object)->init)
31601+ result = inode_compression_plugin(object)->init();
31602+ return result;
31603+}
31604+
31605+static int inode_check_cluster(struct inode * object)
31606+{
31607+ assert("edward-696", object != NULL);
31608+
31609+ if (inode_cluster_size(object) < PAGE_CACHE_SIZE) {
31610+ warning("edward-1320", "Can not support '%s' "
31611+ "logical clusters (less then page size)",
31612+ inode_cluster_plugin(object)->h.label);
31613+ return RETERR(-EINVAL);
31614+ }
31615+ return 0;
31616+}
31617+
31618+/* ->destroy_inode() method of the cryptcompress plugin */
31619+void destroy_inode_cryptcompress(struct inode * inode)
31620+{
31621+ assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0);
31622+ reiser4_detach_crypto_stat(inode);
31623+ return;
31624+}
31625+
31626+/* ->create() method of the cryptcompress plugin
31627+
31628+. install plugins
31629+. attach crypto info if specified
31630+. attach compression info if specified
31631+. attach cluster info
31632+*/
31633+int
31634+create_cryptcompress(struct inode *object, struct inode *parent,
31635+ reiser4_object_create_data * data)
31636+{
31637+ int result;
31638+ reiser4_inode *info;
31639+
31640+ assert("edward-23", object != NULL);
31641+ assert("edward-24", parent != NULL);
31642+ assert("edward-30", data != NULL);
31643+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
31644+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
31645+
31646+ info = reiser4_inode_data(object);
31647+
31648+ assert("edward-29", info != NULL);
31649+
31650+ /* set file bit */
31651+ info->plugin_mask |= (1 << PSET_FILE);
31652+
31653+ /* set crypto */
31654+ result = inode_set_crypto(object);
31655+ if (result)
31656+ goto error;
31657+ /* set compression */
31658+ result = inode_init_compression(object);
31659+ if (result)
31660+ goto error;
31661+ /* set cluster */
31662+ result = inode_check_cluster(object);
31663+ if (result)
31664+ goto error;
31665+
31666+ /* save everything in disk stat-data */
31667+ result = write_sd_by_inode_common(object);
31668+ if (!result)
31669+ return 0;
31670+ error:
31671+ reiser4_detach_crypto_stat(object);
31672+ return result;
31673+}
31674+
31675+/* ->open() method of the cryptcompress plugin */
31676+int open_object_cryptcompress(struct inode * inode, struct file * file)
31677+{
31678+ int result;
31679+ struct inode * parent;
31680+
31681+ assert("edward-1394", inode != NULL);
31682+ assert("edward-1395", file != NULL);
31683+ assert("edward-1396", file != NULL);
31684+ assert("edward-1397", file->f_dentry->d_inode == inode);
31685+ assert("edward-1398", file->f_dentry->d_parent != NULL);
31686+ assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31687+ assert("edward-698",
31688+ inode_file_plugin(inode) ==
31689+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
31690+ result = inode_check_cluster(inode);
31691+ if (result)
31692+ return result;
31693+ result = inode_init_compression(inode);
31694+ if (result)
31695+ return result;
31696+ if (!need_cipher(inode))
31697+ /* the file is not to be ciphered */
31698+ return 0;
31699+ parent = file->f_dentry->d_parent->d_inode;
31700+ if (!inode_has_cipher_key(inode))
31701+ return RETERR(-EINVAL);
31702+ return 0;
31703+}
31704+
31705+/* returns a blocksize, the attribute of a cipher algorithm */
31706+static unsigned int
31707+cipher_blocksize(struct inode * inode)
31708+{
31709+ assert("edward-758", need_cipher(inode));
31710+ assert("edward-1400", inode_crypto_stat(inode) != NULL);
31711+ return crypto_blkcipher_blocksize
31712+ (info_get_cipher(inode_crypto_stat(inode)));
31713+}
31714+
31715+/* returns offset translated by scale factor of the crypto-algorithm */
31716+static loff_t inode_scaled_offset (struct inode * inode,
31717+ const loff_t src_off /* input offset */)
31718+{
31719+ assert("edward-97", inode != NULL);
31720+
31721+ if (!need_cipher(inode) ||
31722+ src_off == get_key_offset(reiser4_min_key()) ||
31723+ src_off == get_key_offset(reiser4_max_key()))
31724+ return src_off;
31725+
31726+ return inode_cipher_plugin(inode)->scale(inode,
31727+ cipher_blocksize(inode),
31728+ src_off);
31729+}
31730+
31731+/* returns disk cluster size */
31732+size_t inode_scaled_cluster_size(struct inode * inode)
31733+{
31734+ assert("edward-110", inode != NULL);
31735+
31736+ return inode_scaled_offset(inode, inode_cluster_size(inode));
31737+}
31738+
31739+static int new_cluster(reiser4_cluster_t * clust, struct inode *inode)
31740+{
31741+ return (clust_to_off(clust->index, inode) >= inode->i_size);
31742+}
31743+
31744+/* set number of cluster pages */
31745+static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode)
31746+{
31747+ reiser4_slide_t *win;
31748+
31749+ assert("edward-180", clust != NULL);
31750+ assert("edward-1040", inode != NULL);
31751+
31752+ win = clust->win;
31753+ if (!win) {
31754+ /* NOTE-EDWARD: i_size should be protected */
31755+ clust->nr_pages =
31756+ count_to_nrpages(fsize_to_count(clust, inode));
31757+ return;
31758+ }
31759+ assert("edward-1176", clust->op != PCL_UNKNOWN);
31760+ assert("edward-1064", win->off + win->count + win->delta != 0);
31761+
31762+ if (win->stat == HOLE_WINDOW &&
31763+ win->off == 0 && win->count == inode_cluster_size(inode)) {
31764+ /* special case: we start write hole from fake cluster */
31765+ clust->nr_pages = 0;
31766+ return;
31767+ }
31768+ clust->nr_pages =
31769+ count_to_nrpages(max_count(win->off + win->count + win->delta,
31770+ fsize_to_count(clust, inode)));
31771+ return;
31772+}
31773+
31774+/* ->key_by_inode() method of the cryptcompress plugin */
31775+/* see plugin/plugin.h for details */
31776+int
31777+key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
31778+{
31779+ loff_t clust_off;
31780+
31781+ assert("edward-64", inode != 0);
31782+ // assert("edward-112", ergo(off != get_key_offset(reiser4_max_key()), !off_to_cloff(off, inode)));
31783+ /* don't come here with other offsets */
31784+
31785+ clust_off =
31786+ (off ==
31787+ get_key_offset(reiser4_max_key())? get_key_offset(reiser4_max_key()) :
31788+ off_to_clust_to_off(off, inode));
31789+
31790+ key_by_inode_and_offset_common(inode, 0, key);
31791+ set_key_offset(key,
31792+ (__u64) (!inode_crypto_stat(inode) ? clust_off :
31793+ inode_scaled_offset(inode, clust_off)));
31794+ return 0;
31795+}
31796+
31797+/* plugin->flow_by_inode */
31798+int
31799+flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
31800+ const char __user *buf /* user level buffer */ ,
31801+ int user /* 1 if @buf is of user space, 0 - if it is
31802+ kernel space */ ,
31803+ loff_t size /* buffer size */ ,
31804+ loff_t off /* offset to start io from */ ,
31805+ rw_op op /* READ or WRITE */ ,
31806+ flow_t * f /* resulting flow */ )
31807+{
31808+ assert("edward-436", f != NULL);
31809+ assert("edward-149", inode != NULL);
31810+ assert("edward-150", inode_file_plugin(inode) != NULL);
31811+
31812+ f->length = size;
31813+ memcpy(&f->data, &buf, sizeof(buf));
31814+ f->user = user;
31815+ f->op = op;
31816+
31817+ if (op == WRITE_OP && user == 1)
31818+ return 0;
31819+ return key_by_inode_cryptcompress(inode, off, &f->key);
31820+}
31821+
31822+static int
31823+cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
31824+ znode_lock_mode lock_mode)
31825+{
31826+ coord_t *coord;
31827+
31828+ assert("edward-704", hint != NULL);
31829+ assert("edward-1089", !hint_is_valid(hint));
31830+ assert("edward-706", hint->lh.owner == NULL);
31831+
31832+ coord = &hint->ext_coord.coord;
31833+
31834+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
31835+ /* hint either not set or set by different operation */
31836+ return RETERR(-E_REPEAT);
31837+
31838+ if (get_key_offset(key) != hint->offset)
31839+ /* hint is set for different key */
31840+ return RETERR(-E_REPEAT);
31841+
31842+ assert("edward-707", reiser4_schedulable());
31843+
31844+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
31845+ key, &hint->lh, lock_mode,
31846+ ZNODE_LOCK_LOPRI);
31847+}
31848+
31849+/* reserve disk space when writing a logical cluster */
31850+static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust)
31851+{
31852+ int result = 0;
31853+
31854+ assert("edward-965", reiser4_schedulable());
31855+ assert("edward-439", inode != NULL);
31856+ assert("edward-440", clust != NULL);
31857+ assert("edward-441", clust->pages != NULL);
31858+
31859+ if (clust->nr_pages == 0) {
31860+ assert("edward-1152", clust->win != NULL);
31861+ assert("edward-1153", clust->win->stat == HOLE_WINDOW);
31862+ /* don't reserve space for fake disk clusteer */
31863+ return 0;
31864+ }
31865+ assert("edward-442", jprivate(clust->pages[0]) != NULL);
31866+
31867+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
31868+ estimate_update_cluster(inode),
31869+ BA_CAN_COMMIT);
31870+ if (result)
31871+ return result;
31872+ clust->reserved = 1;
31873+ grabbed2cluster_reserved(estimate_insert_cluster(inode) +
31874+ estimate_update_cluster(inode));
31875+#if REISER4_DEBUG
31876+ clust->reserved_prepped = estimate_update_cluster(inode);
31877+ clust->reserved_unprepped = estimate_insert_cluster(inode);
31878+#endif
31879+ /* there can be space grabbed by txnmgr_force_commit_all */
31880+ return 0;
31881+}
31882+
31883+/* free reserved disk space if writing a logical cluster fails */
31884+static void
31885+free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count)
31886+{
31887+ assert("edward-967", clust->reserved == 1);
31888+
31889+ cluster_reserved2free(count);
31890+ clust->reserved = 0;
31891+}
31892+
31893+/* The core search procedure of the cryptcompress plugin.
31894+ If returned value is not cbk_errored, then current znode is locked */
31895+static int find_cluster_item(hint_t * hint,
31896+ const reiser4_key * key, /* key of the item we are
31897+ looking for */
31898+ znode_lock_mode lock_mode /* which lock */ ,
31899+ ra_info_t * ra_info, lookup_bias bias, __u32 flags)
31900+{
31901+ int result;
31902+ reiser4_key ikey;
31903+ int went_right = 0;
31904+ coord_t *coord = &hint->ext_coord.coord;
31905+ coord_t orig = *coord;
31906+
31907+ assert("edward-152", hint != NULL);
31908+
31909+ if (!hint_is_valid(hint)) {
31910+ result = cryptcompress_hint_validate(hint, key, lock_mode);
31911+ if (result == -E_REPEAT)
31912+ goto traverse_tree;
31913+ else if (result) {
31914+ assert("edward-1216", 0);
31915+ return result;
31916+ }
31917+ hint_set_valid(hint);
31918+ }
31919+ assert("edward-709", znode_is_any_locked(coord->node));
31920+
31921+ /* In-place lookup is going here, it means we just need to
31922+ check if next item of the @coord match to the @keyhint) */
31923+
31924+ if (equal_to_rdk(coord->node, key)) {
31925+ result = goto_right_neighbor(coord, &hint->lh);
31926+ if (result == -E_NO_NEIGHBOR) {
31927+ assert("edward-1217", 0);
31928+ return RETERR(-EIO);
31929+ }
31930+ if (result)
31931+ return result;
31932+ assert("edward-1218", equal_to_ldk(coord->node, key));
31933+ went_right = 1;
31934+ } else {
31935+ coord->item_pos++;
31936+ coord->unit_pos = 0;
31937+ coord->between = AT_UNIT;
31938+ }
31939+ result = zload(coord->node);
31940+ if (result)
31941+ return result;
31942+ assert("edward-1219", !node_is_empty(coord->node));
31943+
31944+ if (!coord_is_existing_item(coord)) {
31945+ zrelse(coord->node);
31946+ goto not_found;
31947+ }
31948+ item_key_by_coord(coord, &ikey);
31949+ zrelse(coord->node);
31950+ if (!keyeq(key, &ikey))
31951+ goto not_found;
31952+ /* Ok, item is found, update node counts */
31953+ if (went_right)
31954+ dclust_inc_extension_ncount(hint);
31955+ return CBK_COORD_FOUND;
31956+
31957+ not_found:
31958+ assert("edward-1220", coord->item_pos > 0);
31959+ //coord->item_pos--;
31960+ /* roll back */
31961+ *coord = orig;
31962+ ON_DEBUG(coord_update_v(coord));
31963+ return CBK_COORD_NOTFOUND;
31964+
31965+ traverse_tree:
31966+ assert("edward-713", hint->lh.owner == NULL);
31967+ assert("edward-714", reiser4_schedulable());
31968+
31969+ reiser4_unset_hint(hint);
31970+ dclust_init_extension(hint);
31971+ coord_init_zero(coord);
31972+ result = coord_by_key(current_tree, key, coord, &hint->lh,
31973+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
31974+ CBK_UNIQUE | flags, ra_info);
31975+ if (cbk_errored(result))
31976+ return result;
31977+ if(result == CBK_COORD_FOUND)
31978+ dclust_inc_extension_ncount(hint);
31979+ hint_set_valid(hint);
31980+ return result;
31981+}
31982+
31983+/* This function is called by deflate[inflate] manager when
31984+ creating a transformed/plain stream to check if we should
31985+ create/cut some overhead. If this returns true, then @oh
31986+ contains the size of this overhead.
31987+ */
31988+static int
31989+need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust,
31990+ rw_op rw, int * oh)
31991+{
31992+ tfm_cluster_t * tc = &clust->tc;
31993+ switch (rw) {
31994+ case WRITE_OP: /* estimate align */
31995+ *oh = tc->len % cipher_blocksize(inode);
31996+ if (*oh != 0)
31997+ return 1;
31998+ break;
31999+ case READ_OP: /* estimate cut */
32000+ *oh = *(tfm_output_data(clust) + tc->len - 1);
32001+ break;
32002+ default:
32003+ impossible("edward-1401", "bad option");
32004+ }
32005+ return (tc->len != tc->lsize);
32006+}
32007+
32008+/* create/cut an overhead of transformed/plain stream */
32009+static void
32010+align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw)
32011+{
32012+ int oh;
32013+ cipher_plugin * cplug = inode_cipher_plugin(inode);
32014+
32015+ assert("edward-1402", need_cipher(inode));
32016+
32017+ if (!need_cut_or_align(inode, clust, rw, &oh))
32018+ return;
32019+ switch (rw) {
32020+ case WRITE_OP: /* do align */
32021+ clust->tc.len +=
32022+ cplug->align_stream(tfm_input_data(clust) +
32023+ clust->tc.len, clust->tc.len,
32024+ cipher_blocksize(inode));
32025+ *(tfm_input_data(clust) + clust->tc.len - 1) =
32026+ cipher_blocksize(inode) - oh;
32027+ break;
32028+ case READ_OP: /* do cut */
32029+ assert("edward-1403", oh <= cipher_blocksize(inode));
32030+ clust->tc.len -= oh;
32031+ break;
32032+ default:
32033+ impossible("edward-1404", "bad option");
32034+ }
32035+ return;
32036+}
32037+
32038+/* the following two functions are to evaluate results
32039+ of compression transform */
32040+static unsigned
32041+max_cipher_overhead(struct inode * inode)
32042+{
32043+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
32044+ return 0;
32045+ return cipher_blocksize(inode);
32046+}
32047+
32048+static int deflate_overhead(struct inode *inode)
32049+{
32050+ return (inode_compression_plugin(inode)->
32051+ checksum ? DC_CHECKSUM_SIZE : 0);
32052+}
32053+
32054+static unsigned deflate_overrun(struct inode * inode, int ilen)
32055+{
32056+ return coa_overrun(inode_compression_plugin(inode), ilen);
32057+}
32058+
32059+/* Estimating compressibility of a logical cluster by various
32060+ policies represented by compression mode plugin.
32061+ If this returns false, then compressor won't be called for
32062+ the cluster of index @index.
32063+*/
32064+static int should_compress(tfm_cluster_t * tc, cloff_t index,
32065+ struct inode *inode)
32066+{
32067+ compression_plugin *cplug = inode_compression_plugin(inode);
32068+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
32069+
32070+ assert("edward-1321", tc->len != 0);
32071+ assert("edward-1322", cplug != NULL);
32072+ assert("edward-1323", mplug != NULL);
32073+
32074+ return /* estimate by size */
32075+ (cplug->min_size_deflate ?
32076+ tc->len >= cplug->min_size_deflate() :
32077+ 1) &&
32078+ /* estimate by compression mode plugin */
32079+ (mplug->should_deflate ?
32080+ mplug->should_deflate(inode, index) :
32081+ 1);
32082+}
32083+
32084+/* Evaluating results of compression transform.
32085+ Returns true, if we need to accept this results */
32086+static int
32087+save_compressed(int size_before, int size_after, struct inode * inode)
32088+{
32089+ return (size_after + deflate_overhead(inode) +
32090+ max_cipher_overhead(inode) < size_before);
32091+}
32092+
32093+/* Guess result of the evaluation above */
32094+static int
32095+need_inflate(reiser4_cluster_t * clust, struct inode *inode,
32096+ int encrypted /* is cluster encrypted */ )
32097+{
32098+ tfm_cluster_t *tc = &clust->tc;
32099+
32100+ assert("edward-142", tc != 0);
32101+ assert("edward-143", inode != NULL);
32102+
32103+ return tc->len <
32104+ (encrypted ?
32105+ inode_scaled_offset(inode, tc->lsize) :
32106+ tc->lsize);
32107+}
32108+
32109+/* If results of compression were accepted, then we add
32110+ a checksum to catch possible disk cluster corruption.
32111+ The following is a format of the data stored in disk clusters:
32112+
32113+ data This is (transformed) logical cluster.
32114+ cipher_overhead This is created by ->align() method
32115+ of cipher plugin. May be absent.
32116+ checksum (4) This is created by ->checksum method
32117+ of compression plugin to check
32118+ integrity. May be absent.
32119+
32120+ Crypto overhead format:
32121+
32122+ data
32123+ control_byte (1) contains aligned overhead size:
32124+ 1 <= overhead <= cipher_blksize
32125+*/
32126+/* Append a checksum at the end of a transformed stream */
32127+static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32128+{
32129+ __u32 checksum;
32130+
32131+ assert("edward-1309", tc != NULL);
32132+ assert("edward-1310", tc->len > 0);
32133+ assert("edward-1311", cplug->checksum != NULL);
32134+
32135+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
32136+ put_unaligned(cpu_to_le32(checksum),
32137+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
32138+ tc->len += (int)DC_CHECKSUM_SIZE;
32139+}
32140+
32141+/* Check a disk cluster checksum.
32142+ Returns 0 if checksum is correct, otherwise returns 1 */
32143+static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32144+{
32145+ assert("edward-1312", tc != NULL);
32146+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
32147+ assert("edward-1314", cplug->checksum != NULL);
32148+
32149+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
32150+ tc->len - (int)DC_CHECKSUM_SIZE) !=
32151+ le32_to_cpu(get_unaligned((d32 *)
32152+ (tfm_stream_data(tc, INPUT_STREAM)
32153+ + tc->len - (int)DC_CHECKSUM_SIZE)))) {
32154+ warning("edward-156",
32155+ "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
32156+ (int)le32_to_cpu
32157+ (get_unaligned((d32 *)
32158+ (tfm_stream_data(tc, INPUT_STREAM) +
32159+ tc->len - (int)DC_CHECKSUM_SIZE))),
32160+ (int)cplug->checksum
32161+ (tfm_stream_data(tc, INPUT_STREAM),
32162+ tc->len - (int)DC_CHECKSUM_SIZE));
32163+ return 1;
32164+ }
32165+ tc->len -= (int)DC_CHECKSUM_SIZE;
32166+ return 0;
32167+}
32168+
32169+/* get input/output stream for some transform action */
32170+int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc,
32171+ tfm_stream_id id)
32172+{
32173+ size_t size = inode_scaled_cluster_size(inode);
32174+
32175+ assert("edward-901", tc != NULL);
32176+ assert("edward-1027", inode_compression_plugin(inode) != NULL);
32177+
32178+ if (cluster_get_tfm_act(tc) == TFMA_WRITE)
32179+ size += deflate_overrun(inode, inode_cluster_size(inode));
32180+
32181+ if (!tfm_stream(tc, id) && id == INPUT_STREAM)
32182+ alternate_streams(tc);
32183+ if (!tfm_stream(tc, id))
32184+ return alloc_tfm_stream(tc, size, id);
32185+
32186+ assert("edward-902", tfm_stream_is_set(tc, id));
32187+
32188+ if (tfm_stream_size(tc, id) < size)
32189+ return realloc_tfm_stream(tc, size, id);
32190+ return 0;
32191+}
32192+
32193+/* Common deflate manager */
32194+int reiser4_deflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32195+{
32196+ int result = 0;
32197+ int compressed = 0;
32198+ int encrypted = 0;
32199+ tfm_cluster_t * tc = &clust->tc;
32200+ compression_plugin * coplug;
32201+
32202+ assert("edward-401", inode != NULL);
32203+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
32204+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
32205+ assert("edward-498", !tfm_cluster_is_uptodate(tc));
32206+
32207+ coplug = inode_compression_plugin(inode);
32208+ if (should_compress(tc, clust->index, inode)) {
32209+ /* try to compress, discard bad results */
32210+ __u32 dst_len;
32211+ compression_mode_plugin * mplug =
32212+ inode_compression_mode_plugin(inode);
32213+ assert("edward-602", coplug != NULL);
32214+ assert("edward-1423", coplug->compress != NULL);
32215+
32216+ result = grab_coa(tc, coplug);
32217+ if (result) {
32218+ warning("edward-1424",
32219+ "alloc_coa failed with ret=%d, skipped compression",
32220+ result);
32221+ goto cipher;
32222+ }
32223+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32224+ if (result) {
32225+ warning("edward-1425",
32226+ "alloc stream failed with ret=%d, skipped compression",
32227+ result);
32228+ goto cipher;
32229+ }
32230+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
32231+ coplug->compress(get_coa(tc, coplug->h.id, tc->act),
32232+ tfm_input_data(clust), tc->len,
32233+ tfm_output_data(clust), &dst_len);
32234+ /* make sure we didn't overwrite extra bytes */
32235+ assert("edward-603",
32236+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
32237+
32238+ /* evaluate results of compression transform */
32239+ if (save_compressed(tc->len, dst_len, inode)) {
32240+ /* good result, accept */
32241+ tc->len = dst_len;
32242+ if (mplug->accept_hook != NULL) {
32243+ result = mplug->accept_hook(inode, clust->index);
32244+ if (result)
32245+ warning("edward-1426",
32246+ "accept_hook failed with ret=%d",
32247+ result);
32248+ }
32249+ compressed = 1;
32250+ }
32251+ else {
32252+ /* bad result, discard */
32253+#if REISER4_DEBUG
32254+ if (cluster_is_complete(clust, inode))
32255+ warning("edward-1338",
32256+ "incompressible cluster %lu (inode %llu)",
32257+ clust->index,
32258+ (unsigned long long)get_inode_oid(inode));
32259+#endif
32260+ if (mplug->discard_hook != NULL &&
32261+ cluster_is_complete(clust, inode)) {
32262+ result = mplug->discard_hook(inode,
32263+ clust->index);
32264+ if (result)
32265+ warning("edward-1427",
32266+ "discard_hook failed with ret=%d",
32267+ result);
32268+ }
32269+ }
32270+ }
32271+ cipher:
32272+ if (need_cipher(inode)) {
32273+ cipher_plugin * ciplug;
32274+ struct blkcipher_desc desc;
32275+ struct scatterlist src;
32276+ struct scatterlist dst;
32277+
32278+ ciplug = inode_cipher_plugin(inode);
32279+ desc.tfm = info_get_cipher(inode_crypto_stat(inode));
32280+ desc.flags = 0;
32281+ if (compressed)
32282+ alternate_streams(tc);
32283+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32284+ if (result)
32285+ return result;
32286+
32287+ align_or_cut_overhead(inode, clust, WRITE_OP);
32288+ src.page = virt_to_page(tfm_input_data(clust));
32289+ src.offset = offset_in_page(tfm_input_data(clust));
32290+ src.length = tc->len;
32291+
32292+ dst.page = virt_to_page(tfm_output_data(clust));
32293+ dst.offset = offset_in_page(tfm_output_data(clust));
32294+ dst.length = tc->len;
32295+
32296+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
32297+ if (result) {
32298+ warning("edward-1405",
32299+ "encryption failed flags=%x\n", desc.flags);
32300+ return result;
32301+ }
32302+ encrypted = 1;
32303+ }
32304+ if (compressed && coplug->checksum != NULL)
32305+ dc_set_checksum(coplug, tc);
32306+ if (!compressed && !encrypted)
32307+ alternate_streams(tc);
32308+ return result;
32309+}
32310+
32311+/* Common inflate manager. */
32312+int reiser4_inflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32313+{
32314+ int result = 0;
32315+ int transformed = 0;
32316+ tfm_cluster_t * tc = &clust->tc;
32317+ compression_plugin * coplug;
32318+
32319+ assert("edward-905", inode != NULL);
32320+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
32321+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
32322+ assert("edward-1349", tc->act == TFMA_READ);
32323+ assert("edward-907", !tfm_cluster_is_uptodate(tc));
32324+
32325+ /* Handle a checksum (if any) */
32326+ coplug = inode_compression_plugin(inode);
32327+ if (need_inflate(clust, inode, need_cipher(inode)) &&
32328+ coplug->checksum != NULL) {
32329+ result = dc_check_checksum(coplug, tc);
32330+ if (unlikely(result)) {
32331+ warning("edward-1460",
32332+ "Inode %llu: disk cluster %lu looks corrupted",
32333+ (unsigned long long)get_inode_oid(inode),
32334+ clust->index);
32335+ return RETERR(-EIO);
32336+ }
32337+ }
32338+ if (need_cipher(inode)) {
32339+ cipher_plugin * ciplug;
32340+ struct blkcipher_desc desc;
32341+ struct scatterlist src;
32342+ struct scatterlist dst;
32343+
32344+ ciplug = inode_cipher_plugin(inode);
32345+ desc.tfm = info_get_cipher(inode_crypto_stat(inode));
32346+ desc.flags = 0;
32347+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32348+ if (result)
32349+ return result;
32350+ assert("edward-909", tfm_cluster_is_set(tc));
32351+
32352+ src.page = virt_to_page(tfm_input_data(clust));
32353+ src.offset = offset_in_page(tfm_input_data(clust));
32354+ src.length = tc->len;
32355+
32356+ dst.page = virt_to_page(tfm_output_data(clust));
32357+ dst.offset = offset_in_page(tfm_output_data(clust));
32358+ dst.length = tc->len;
32359+
32360+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
32361+ if (result) {
32362+ warning("edward-1600", "decrypt failed flags=%x\n",
32363+ desc.flags);
32364+ return result;
32365+ }
32366+ align_or_cut_overhead(inode, clust, READ_OP);
32367+ transformed = 1;
32368+ }
32369+ if (need_inflate(clust, inode, 0)) {
32370+ unsigned dst_len = inode_cluster_size(inode);
32371+ if(transformed)
32372+ alternate_streams(tc);
32373+
32374+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32375+ if (result)
32376+ return result;
32377+ assert("edward-1305", coplug->decompress != NULL);
32378+ assert("edward-910", tfm_cluster_is_set(tc));
32379+
32380+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
32381+ tfm_input_data(clust), tc->len,
32382+ tfm_output_data(clust), &dst_len);
32383+ /* check length */
32384+ tc->len = dst_len;
32385+ assert("edward-157", dst_len == tc->lsize);
32386+ transformed = 1;
32387+ }
32388+ if (!transformed)
32389+ alternate_streams(tc);
32390+ return result;
32391+}
32392+
32393+/* This is implementation of readpage method of struct
32394+ address_space_operations for cryptcompress plugin. */
32395+int readpage_cryptcompress(struct file *file, struct page *page)
32396+{
32397+ reiser4_context *ctx;
32398+ reiser4_cluster_t clust;
32399+ item_plugin *iplug;
32400+ int result;
32401+
32402+ assert("edward-88", PageLocked(page));
32403+ assert("vs-976", !PageUptodate(page));
32404+ assert("edward-89", page->mapping && page->mapping->host);
32405+
32406+ ctx = reiser4_init_context(page->mapping->host->i_sb);
32407+ if (IS_ERR(ctx)) {
32408+ unlock_page(page);
32409+ return PTR_ERR(ctx);
32410+ }
32411+ assert("edward-113",
32412+ ergo(file != NULL,
32413+ page->mapping == file->f_dentry->d_inode->i_mapping));
32414+
32415+ if (PageUptodate(page)) {
32416+ warning("edward-1338", "page is already uptodate\n");
32417+ unlock_page(page);
32418+ reiser4_exit_context(ctx);
32419+ return 0;
32420+ }
32421+ cluster_init_read(&clust, NULL);
32422+ clust.file = file;
32423+ iplug = item_plugin_by_id(CTAIL_ID);
32424+ if (!iplug->s.file.readpage) {
32425+ unlock_page(page);
32426+ put_cluster_handle(&clust);
32427+ reiser4_exit_context(ctx);
32428+ return -EINVAL;
32429+ }
32430+ result = iplug->s.file.readpage(&clust, page);
32431+
32432+ assert("edward-1459", !PageLocked(page));
32433+ assert("edward-64", ergo(result == 0, PageUptodate(page)));
32434+ put_cluster_handle(&clust);
32435+ reiser4_exit_context(ctx);
32436+ return result;
32437+}
32438+
32439+/* how much pages will be captured */
32440+static int cluster_nrpages_to_capture(reiser4_cluster_t * clust)
32441+{
32442+ switch (clust->op) {
32443+ case PCL_APPEND:
32444+ return clust->nr_pages;
32445+ case PCL_TRUNCATE:
32446+ assert("edward-1179", clust->win != NULL);
32447+ return count_to_nrpages(clust->win->off + clust->win->count);
32448+ default:
32449+ impossible("edward-1180", "bad page cluster option");
32450+ return 0;
32451+ }
32452+}
32453+
32454+static void set_cluster_pages_dirty(reiser4_cluster_t * clust)
32455+{
32456+ int i;
32457+ struct page *pg;
32458+ int nrpages = cluster_nrpages_to_capture(clust);
32459+
32460+ for (i = 0; i < nrpages; i++) {
32461+
32462+ pg = clust->pages[i];
32463+ assert("edward-968", pg != NULL);
32464+ lock_page(pg);
32465+ assert("edward-1065", PageUptodate(pg));
32466+ reiser4_set_page_dirty_internal(pg);
32467+ unlock_page(pg);
32468+ mark_page_accessed(pg);
32469+ }
32470+}
32471+
32472+static void clear_cluster_pages_dirty(reiser4_cluster_t * clust)
32473+{
32474+ int i;
32475+ assert("edward-1275", clust != NULL);
32476+
32477+ for (i = 0; i < clust->nr_pages; i++) {
32478+ assert("edward-1276", clust->pages[i] != NULL);
32479+
32480+ lock_page(clust->pages[i]);
32481+ if (PageDirty(clust->pages[i])) {
32482+ assert("edward-1277", PageUptodate(clust->pages[i]));
32483+ cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
32484+ }
32485+#if REISER4_DEBUG
32486+ else
32487+ /* Race between flush and write:
32488+ some pages became clean when write() (or another
32489+ process which modifies data) capture the cluster. */
32490+ warning("edward-985", "Page of index %lu (inode %llu)"
32491+ " is not dirty\n", clust->pages[i]->index,
32492+ (unsigned long long)get_inode_oid(clust->
32493+ pages[i]->
32494+ mapping->
32495+ host));
32496+#endif
32497+ unlock_page(clust->pages[i]);
32498+ }
32499+}
32500+
32501+/* update i_size by window */
32502+static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode)
32503+{
32504+ loff_t size;
32505+ reiser4_slide_t *win;
32506+
32507+ assert("edward-1181", clust != NULL);
32508+ assert("edward-1182", inode != NULL);
32509+
32510+ win = clust->win;
32511+ assert("edward-1183", win != NULL);
32512+ assert("edward-1183", win->count != 0);
32513+
32514+ size = clust_to_off(clust->index, inode) + win->off;
32515+
32516+ switch (clust->op) {
32517+ case PCL_APPEND:
32518+ if (size + win->count <= inode->i_size)
32519+ /* overwrite only */
32520+ return;
32521+ size += win->count;
32522+ break;
32523+ case PCL_TRUNCATE:
32524+ break;
32525+ default:
32526+ impossible("edward-1184", "bad page cluster option");
32527+ break;
32528+ }
32529+ inode_check_scale_nolock(inode, inode->i_size, size);
32530+ inode->i_size = size;
32531+ return;
32532+}
32533+
32534+/* Check in page cluster modifications.
32535+ . Make jnode dirty, if it wasn't;
32536+ . Reserve space for a disk cluster update by flush algorithm, if needed;
32537+ . Clean up old references (if any).
32538+ . Put pages (grabbed in this thread) which will be truncated
32539+*/
32540+static void
32541+make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node,
32542+ loff_t * old_isize, struct inode *inode)
32543+{
32544+ int i;
32545+ int old_nrpages;
32546+ int new_nrpages = cluster_nrpages_to_capture(clust);
32547+
32548+ assert("edward-973", new_nrpages > 0);
32549+ assert("edward-221", node != NULL);
32550+ assert("edward-971", clust->reserved == 1);
32551+ assert_spin_locked(&(node->guard));
32552+ assert("edward-972", node->page_count <= cluster_nrpages(inode));
32553+ assert("edward-1263",
32554+ clust->reserved_prepped == estimate_update_cluster(inode));
32555+ assert("edward-1264", clust->reserved_unprepped == 0);
32556+
32557+ if (JF_ISSET(node, JNODE_DIRTY)) {
32558+ /* someone has modified this cluster, but
32559+ the modifications are not committed yet */
32560+ old_nrpages =
32561+ count_to_nrpages(cnt_to_clcnt(*old_isize,
32562+ clust->index, inode));
32563+ /* free space which is already reserved */
32564+ free_reserved4cluster(inode, clust,
32565+ estimate_update_cluster(inode));
32566+ /* put old references */
32567+ for (i = 0; i < old_nrpages; i++) {
32568+ assert("edward-975", clust->pages[i]);
32569+ assert("edward-1185", PageUptodate(clust->pages[i]));
32570+
32571+ page_cache_release(clust->pages[i]);
32572+#if REISER4_DEBUG
32573+ cryptcompress_inode_data(inode)->pgcount --;
32574+#endif
32575+ }
32576+ } else {
32577+ /* no captured pages */
32578+ assert("edward-1043", node->page_count == 0);
32579+ jnode_make_dirty_locked(node);
32580+ clust->reserved = 0;
32581+ }
32582+ /* put pages that will be truncated (if any) */
32583+ for (i = new_nrpages; i < clust->nr_pages; i++) {
32584+ assert("edward-1433", clust->pages[i]);
32585+ assert("edward-1434", PageUptodate(clust->pages[i]));
32586+ page_cache_release(clust->pages[i]);
32587+#if REISER4_DEBUG
32588+ cryptcompress_inode_data(inode)->pgcount --;
32589+#endif
32590+ }
32591+#if REISER4_DEBUG
32592+ clust->reserved_prepped -= estimate_update_cluster(inode);
32593+ node->page_count = new_nrpages;
32594+#endif
32595+ return;
32596+}
32597+
32598+/* This function spawns a transaction and
32599+ is called by any thread as a final step in page cluster modification.
32600+*/
32601+static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode)
32602+{
32603+ int result = 0;
32604+ loff_t old_size;
32605+ jnode *node;
32606+
32607+ assert("edward-1029", clust != NULL);
32608+ assert("edward-1030", clust->reserved == 1);
32609+ assert("edward-1031", clust->nr_pages != 0);
32610+ assert("edward-1032", clust->pages != NULL);
32611+ assert("edward-1033", clust->pages[0] != NULL);
32612+
32613+ node = jprivate(clust->pages[0]);
32614+ assert("edward-1035", node != NULL);
32615+ assert("edward-1446", jnode_is_cluster_page(node));
32616+
32617+ spin_lock_jnode(node);
32618+
32619+ old_size = inode->i_size;
32620+ if (clust->win)
32621+ inode_set_new_size(clust, inode);
32622+
32623+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32624+ if (result)
32625+ goto exit;
32626+ make_cluster_jnode_dirty_locked(clust, node, &old_size, inode);
32627+ exit:
32628+ spin_unlock_jnode(node);
32629+ jput(node);
32630+ return result;
32631+}
32632+
32633+/* Collect unlocked cluster pages for any modifications and attach a jnode.
32634+ We allocate only one jnode per cluster, this jnode is binded to the first
32635+ page of this cluster, so we have an extra-reference that will exist with
32636+ this jnode, other references will be cleaned up in flush time.
32637+*/
32638+static int
32639+grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust)
32640+{
32641+ int i;
32642+ int result = 0;
32643+ jnode *node = NULL;
32644+
32645+ assert("edward-182", clust != NULL);
32646+ assert("edward-183", clust->pages != NULL);
32647+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32648+
32649+ if (clust->nr_pages == 0)
32650+ return 0;
32651+
32652+ for (i = 0; i < clust->nr_pages; i++) {
32653+
32654+ assert("edward-1044", clust->pages[i] == NULL);
32655+
32656+ clust->pages[i] =
32657+ find_or_create_page(inode->i_mapping,
32658+ clust_to_pg(clust->index, inode) + i,
32659+ reiser4_ctx_gfp_mask_get());
32660+ if (!clust->pages[i]) {
32661+ result = RETERR(-ENOMEM);
32662+ break;
32663+ }
32664+ if (i == 0) {
32665+ node = jnode_of_page(clust->pages[i]);
32666+ if (IS_ERR(node)) {
32667+ result = PTR_ERR(node);
32668+ unlock_page(clust->pages[i]);
32669+ break;
32670+ }
32671+ JF_SET(node, JNODE_CLUSTER_PAGE);
32672+ unlock_page(clust->pages[i]);
32673+ assert("edward-919", node);
32674+ continue;
32675+ }
32676+ unlock_page(clust->pages[i]);
32677+ }
32678+ if (result) {
32679+ while (i)
32680+ page_cache_release(clust->pages[--i]);
32681+ if (node && !IS_ERR(node))
32682+ jput(node);
32683+ return result;
32684+ }
32685+ assert("edward-920", jprivate(clust->pages[0]));
32686+#if REISER4_DEBUG
32687+ cryptcompress_inode_data(inode)->pgcount += clust->nr_pages;
32688+#endif
32689+ return 0;
32690+}
32691+
32692+/* Collect unlocked cluster pages only for read (not to modify) */
32693+int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32694+{
32695+ int i;
32696+ int result = 0;
32697+
32698+ assert("edward-1428", inode != NULL);
32699+ assert("edward-1429", inode->i_mapping != NULL);
32700+ assert("edward-787", clust != NULL);
32701+ assert("edward-788", clust->pages != NULL);
32702+ assert("edward-789", clust->nr_pages != 0);
32703+ assert("edward-790", clust->nr_pages <= cluster_nrpages(inode));
32704+
32705+ for (i = 0; i < clust->nr_pages; i++) {
32706+ clust->pages[i] =
32707+ find_or_create_page(inode->i_mapping,
32708+ clust_to_pg(clust->index, inode) + i,
32709+ reiser4_ctx_gfp_mask_get());
32710+ if (!clust->pages[i]) {
32711+ result = RETERR(-ENOMEM);
32712+ break;
32713+ }
32714+ unlock_page(clust->pages[i]);
32715+ }
32716+ if (result)
32717+ while (i)
32718+ page_cache_release(clust->pages[--i]);
32719+ return result;
32720+}
32721+
32722+/* @node might be attached by reiser4_writepage(), not by
32723+ cryptcompress plugin code, but emergency flush should
32724+ understand that pages of cryptcompress files are not
32725+ flushable.
32726+*/
32727+#if 0
32728+int jnode_of_cluster(const jnode * node, struct page * page)
32729+{
32730+ assert("edward-1339", node != NULL);
32731+ assert("edward-1340", page != NULL);
32732+ assert("edward-1341", page->mapping != NULL);
32733+ assert("edward-1342", page->mapping->host != NULL);
32734+ assert("edward-1343",
32735+ ergo(jnode_is_unformatted(node),
32736+ get_inode_oid(page->mapping->host) ==
32737+ node->key.j.objectid));
32738+ if (inode_file_plugin(page->mapping->host) ==
32739+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) {
32740+#if REISER4_DEBUG
32741+ if (!jnode_is_cluster_page(node))
32742+ warning("edward-1345",
32743+ "inode %llu: cluster page of index %lu became private",
32744+ (unsigned long long)get_inode_oid(page->mapping->host),
32745+ page->index);
32746+#endif
32747+ return 1;
32748+ }
32749+ return 0;
32750+}
32751+#endif /* 0 */
32752+
32753+/* put cluster pages */
32754+void reiser4_release_cluster_pages(reiser4_cluster_t * clust)
32755+{
32756+ int i;
32757+
32758+ assert("edward-447", clust != NULL);
32759+ for (i = 0; i < clust->nr_pages; i++) {
32760+
32761+ assert("edward-449", clust->pages[i] != NULL);
32762+
32763+ page_cache_release(clust->pages[i]);
32764+ }
32765+}
32766+
32767+/* this is called when something is failed */
32768+static void reiser4_release_cluster_pages_and_jnode(reiser4_cluster_t * clust)
32769+{
32770+ jnode *node;
32771+
32772+ assert("edward-445", clust != NULL);
32773+ assert("edward-922", clust->pages != NULL);
32774+ assert("edward-446", clust->pages[0] != NULL);
32775+
32776+ node = jprivate(clust->pages[0]);
32777+
32778+ assert("edward-447", node != NULL);
32779+
32780+ reiser4_release_cluster_pages(clust);
32781+ jput(node);
32782+}
32783+
32784+#if REISER4_DEBUG
32785+static int window_ok(reiser4_slide_t * win, struct inode *inode)
32786+{
32787+ assert("edward-1115", win != NULL);
32788+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32789+
32790+ return (win->off != inode_cluster_size(inode)) &&
32791+ (win->off + win->count + win->delta <= inode_cluster_size(inode));
32792+}
32793+
32794+static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode)
32795+{
32796+ assert("edward-279", clust != NULL);
32797+
32798+ if (!clust->pages)
32799+ return 0;
32800+ return (clust->win ? window_ok(clust->win, inode) : 1);
32801+}
32802+#endif
32803+
32804+/* guess next window stat */
32805+static inline window_stat next_window_stat(reiser4_slide_t * win)
32806+{
32807+ assert("edward-1130", win != NULL);
32808+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32809+ HOLE_WINDOW : DATA_WINDOW);
32810+}
32811+
32812+/* guess next cluster index and window params */
32813+static void
32814+update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32815+ loff_t to_file)
32816+{
32817+ reiser4_slide_t *win;
32818+
32819+ assert("edward-185", clust != NULL);
32820+ assert("edward-438", clust->pages != NULL);
32821+ assert("edward-281", cluster_ok(clust, inode));
32822+
32823+ win = clust->win;
32824+ if (!win)
32825+ return;
32826+
32827+ switch (win->stat) {
32828+ case DATA_WINDOW:
32829+ /* increment window position */
32830+ clust->index++;
32831+ win->stat = DATA_WINDOW;
32832+ win->off = 0;
32833+ win->count = min_count(inode_cluster_size(inode), to_file);
32834+ break;
32835+ case HOLE_WINDOW:
32836+ switch (next_window_stat(win)) {
32837+ case HOLE_WINDOW:
32838+ /* set window to fit the offset we start write from */
32839+ clust->index = off_to_clust(file_off, inode);
32840+ win->stat = HOLE_WINDOW;
32841+ win->off = 0;
32842+ win->count = off_to_cloff(file_off, inode);
32843+ win->delta =
32844+ min_count(inode_cluster_size(inode) - win->count,
32845+ to_file);
32846+ break;
32847+ case DATA_WINDOW:
32848+ /* do not move the window, just change its state,
32849+ off+count+delta=inv */
32850+ win->stat = DATA_WINDOW;
32851+ win->off = win->off + win->count;
32852+ win->count = win->delta;
32853+ win->delta = 0;
32854+ break;
32855+ default:
32856+ impossible("edward-282", "wrong next window state");
32857+ }
32858+ break;
32859+ default:
32860+ impossible("edward-283", "wrong current window state");
32861+ }
32862+ assert("edward-1068", cluster_ok(clust, inode));
32863+}
32864+
32865+static int update_sd_cryptcompress(struct inode *inode)
32866+{
32867+ int result = 0;
32868+
32869+ assert("edward-978", reiser4_schedulable());
32870+
32871+ result = reiser4_grab_space_force( /* one for stat data update */
32872+ estimate_update_common(inode),
32873+ BA_CAN_COMMIT);
32874+ if (result)
32875+ return result;
32876+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
32877+ result = reiser4_update_sd(inode);
32878+
32879+ return result;
32880+}
32881+
32882+/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */
32883+static void uncapture_cluster_jnode(jnode * node)
32884+{
32885+ txn_atom *atom;
32886+
32887+ assert_spin_locked(&(node->guard));
32888+
32889+ /*jnode_make_clean(node); */
32890+ atom = jnode_get_atom(node);
32891+ if (atom == NULL) {
32892+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
32893+ spin_unlock_jnode(node);
32894+ return;
32895+ }
32896+
32897+ reiser4_uncapture_block(node);
32898+ spin_unlock_atom(atom);
32899+ jput(node);
32900+}
32901+
32902+static void forget_cluster_pages(struct page **pages, int nr)
32903+{
32904+ int i;
32905+ for (i = 0; i < nr; i++) {
32906+
32907+ assert("edward-1045", pages[i] != NULL);
32908+ page_cache_release(pages[i]);
32909+ }
32910+}
32911+
32912+/* Check out last modifications we are about to commit,
32913+ and prepare input stream for transform operations.
32914+*/
32915+int
32916+flush_cluster_pages(reiser4_cluster_t * clust, jnode * node,
32917+ struct inode *inode)
32918+{
32919+ int result = 0;
32920+ int i;
32921+ int nr_pages = 0;
32922+ tfm_cluster_t *tc = &clust->tc;
32923+#if REISER4_DEBUG
32924+ int node_pgcount;
32925+#endif
32926+ assert("edward-980", node != NULL);
32927+ assert("edward-236", inode != NULL);
32928+ assert("edward-237", clust != NULL);
32929+ assert("edward-240", !clust->win);
32930+ assert("edward-241", reiser4_schedulable());
32931+ assert("edward-718", cryptcompress_inode_ok(inode));
32932+
32933+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
32934+ if (result) {
32935+ warning("edward-1430",
32936+ "alloc stream failed with ret=%d", result);
32937+ return result;
32938+ }
32939+ spin_lock_jnode(node);
32940+#if REISER4_DEBUG
32941+ node_pgcount = node->page_count;
32942+#endif
32943+ if (!JF_ISSET(node, JNODE_DIRTY)) {
32944+ /* race with another flush */
32945+#if REISER4_DEBUG
32946+ assert("edward-981", node_pgcount == 0);
32947+ warning("edward-982", "flush_cluster_pages: jnode is not dirty "
32948+ "clust %lu, inode %llu\n",
32949+ clust->index, (unsigned long long)get_inode_oid(inode));
32950+#endif
32951+ spin_unlock_jnode(node);
32952+ return RETERR(-E_REPEAT);
32953+ }
32954+ /* Check out a size of logical cluster and
32955+ set a number of cluster pages to commit. */
32956+ tc->len = tc->lsize = fsize_to_count(clust, inode);
32957+ clust->nr_pages = count_to_nrpages(tc->len);
32958+
32959+#if REISER4_DEBUG
32960+ node->page_count = 0;
32961+#endif
32962+ cluster_reserved2grabbed(estimate_update_cluster(inode));
32963+ uncapture_cluster_jnode(node);
32964+
32965+ assert("edward-1224", reiser4_schedulable());
32966+ /* Check out page cluster for commit */
32967+ nr_pages =
32968+ find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode),
32969+ clust->nr_pages, clust->pages);
32970+ if (nr_pages != clust->nr_pages)
32971+ goto checkout_failed;
32972+
32973+ /* Try to construct input stream from the checked out pages */
32974+ for (i = 0; i < clust->nr_pages; i++) {
32975+ char *data;
32976+
32977+ assert("edward-242", clust->pages[i] != NULL);
32978+ if (clust->pages[i]->index !=
32979+ clust_to_pg(clust->index, inode) + i)
32980+ goto checkout_failed;
32981+ BUG_ON(!PageUptodate(clust->pages[i]));
32982+
32983+ /* flush the page into input transform stream */
32984+ lock_page(clust->pages[i]);
32985+ data = kmap(clust->pages[i]);
32986+
32987+ assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0);
32988+
32989+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
32990+ data, cnt_to_pgcnt(tc->len, i));
32991+ kunmap(clust->pages[i]);
32992+ unlock_page(clust->pages[i]);
32993+ }
32994+ /* page cluster flushed successfully */
32995+
32996+ clear_cluster_pages_dirty(clust);
32997+ reiser4_release_cluster_pages(clust);
32998+#if REISER4_DEBUG
32999+ cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages;
33000+#endif
33001+ goto out;
33002+ checkout_failed:
33003+#if REISER4_DEBUG
33004+ assert("edward-1282", node_pgcount == 0);
33005+ warning("edward-1435", "Inode %llu : checkout page cluster"
33006+ "of index %lu failed\n",
33007+ (unsigned long long)get_inode_oid(inode), clust->index);
33008+#endif /* REISER4_DEBUG */
33009+ result = RETERR(-E_REPEAT);
33010+ out:
33011+ /* put pages that were found here */
33012+ forget_cluster_pages(clust->pages, nr_pages);
33013+ return result;
33014+}
33015+
33016+/* set hint for the cluster of the index @index */
33017+static void set_hint_cluster(struct inode *inode, hint_t * hint,
33018+ cloff_t index, znode_lock_mode mode)
33019+{
33020+ reiser4_key key;
33021+ assert("edward-722", cryptcompress_inode_ok(inode));
33022+ assert("edward-723",
33023+ inode_file_plugin(inode) ==
33024+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33025+
33026+ inode_file_plugin(inode)->key_by_inode(inode,
33027+ clust_to_off(index, inode),
33028+ &key);
33029+
33030+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
33031+ hint->offset = get_key_offset(&key);
33032+ hint->mode = mode;
33033+}
33034+
33035+void invalidate_hint_cluster(reiser4_cluster_t * clust)
33036+{
33037+ assert("edward-1291", clust != NULL);
33038+ assert("edward-1292", clust->hint != NULL);
33039+
33040+ done_lh(&clust->hint->lh);
33041+ hint_clr_valid(clust->hint);
33042+}
33043+
33044+void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
33045+ znode_lock_mode mode)
33046+{
33047+ assert("edward-1286", clust != NULL);
33048+ assert("edward-1287", clust->hint != NULL);
33049+
33050+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
33051+ invalidate_hint_cluster(clust);
33052+}
33053+
33054+static int
33055+balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode,
33056+ loff_t off, loff_t to_file)
33057+{
33058+ int result;
33059+
33060+ assert("edward-724", inode != NULL);
33061+ assert("edward-725", cryptcompress_inode_ok(inode));
33062+
33063+ /* set next window params */
33064+ update_cluster(inode, clust, off, to_file);
33065+
33066+ result = update_sd_cryptcompress(inode);
33067+ if (result)
33068+ return result;
33069+ assert("edward-726", clust->hint->lh.owner == NULL);
33070+
33071+ reiser4_throttle_write(inode);
33072+ return 0;
33073+}
33074+
33075+/* set zeroes to the cluster, update it, and maybe, try to capture its pages */
33076+static int
33077+write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
33078+ loff_t to_file)
33079+{
33080+ char *data;
33081+ int result = 0;
33082+ unsigned cl_off, cl_count = 0;
33083+ unsigned to_pg, pg_off;
33084+ reiser4_slide_t *win;
33085+
33086+ assert("edward-190", clust != NULL);
33087+ assert("edward-1069", clust->win != NULL);
33088+ assert("edward-191", inode != NULL);
33089+ assert("edward-727", cryptcompress_inode_ok(inode));
33090+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
33091+ assert("edward-1154",
33092+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
33093+
33094+ win = clust->win;
33095+
33096+ assert("edward-1070", win != NULL);
33097+ assert("edward-201", win->stat == HOLE_WINDOW);
33098+ assert("edward-192", cluster_ok(clust, inode));
33099+
33100+ if (win->off == 0 && win->count == inode_cluster_size(inode)) {
33101+ /* the hole will be represented by fake disk cluster */
33102+ update_cluster(inode, clust, file_off, to_file);
33103+ return 0;
33104+ }
33105+ cl_count = win->count; /* number of zeroes to write */
33106+ cl_off = win->off;
33107+ pg_off = off_to_pgoff(win->off);
33108+
33109+ while (cl_count) {
33110+ struct page *page;
33111+ page = clust->pages[off_to_pg(cl_off)];
33112+
33113+ assert("edward-284", page != NULL);
33114+
33115+ to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
33116+ lock_page(page);
33117+ data = kmap_atomic(page, KM_USER0);
33118+ memset(data + pg_off, 0, to_pg);
33119+ flush_dcache_page(page);
33120+ kunmap_atomic(data, KM_USER0);
33121+ SetPageUptodate(page);
33122+ unlock_page(page);
33123+
33124+ cl_off += to_pg;
33125+ cl_count -= to_pg;
33126+ pg_off = 0;
33127+ }
33128+ if (!win->delta) {
33129+ /* only zeroes, try to capture */
33130+
33131+ set_cluster_pages_dirty(clust);
33132+ result = try_capture_cluster(clust, inode);
33133+ if (result)
33134+ return result;
33135+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
33136+ result =
33137+ balance_dirty_page_cluster(clust, inode, file_off, to_file);
33138+ } else
33139+ update_cluster(inode, clust, file_off, to_file);
33140+ return result;
33141+}
33142+
33143+/*
33144+ The main disk search procedure for cryptcompress plugins, which
33145+ . scans all items of disk cluster with the lock mode @mode
33146+ . maybe reads each one (if @read)
33147+ . maybe makes its znode dirty (if write lock mode was specified)
33148+
33149+ NOTE-EDWARD: Callers should handle the case when disk cluster
33150+ is incomplete (-EIO)
33151+*/
33152+int find_disk_cluster(reiser4_cluster_t * clust,
33153+ struct inode *inode, int read, znode_lock_mode mode)
33154+{
33155+ flow_t f;
33156+ hint_t *hint;
33157+ int result = 0;
33158+ unsigned long cl_idx;
33159+ ra_info_t ra_info;
33160+ file_plugin *fplug;
33161+ item_plugin *iplug;
33162+ tfm_cluster_t *tc;
33163+ int was_grabbed;
33164+
33165+ assert("edward-138", clust != NULL);
33166+ assert("edward-728", clust->hint != NULL);
33167+ assert("edward-226", reiser4_schedulable());
33168+ assert("edward-137", inode != NULL);
33169+ assert("edward-729", cryptcompress_inode_ok(inode));
33170+
33171+ hint = clust->hint;
33172+ cl_idx = clust->index;
33173+ fplug = inode_file_plugin(inode);
33174+ was_grabbed = get_current_context()->grabbed_blocks;
33175+ tc = &clust->tc;
33176+
33177+ assert("edward-462", !tfm_cluster_is_uptodate(tc));
33178+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
33179+
33180+ dclust_init_extension(hint);
33181+
33182+ /* set key of the first disk cluster item */
33183+ fplug->flow_by_inode(inode,
33184+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
33185+ 0 /* kernel space */ ,
33186+ inode_scaled_cluster_size(inode),
33187+ clust_to_off(cl_idx, inode), READ_OP, &f);
33188+ if (mode == ZNODE_WRITE_LOCK) {
33189+ /* reserve for flush to make dirty all the leaf nodes
33190+ which contain disk cluster */
33191+ result =
33192+ reiser4_grab_space_force(estimate_dirty_cluster(inode),
33193+ BA_CAN_COMMIT);
33194+ if (result)
33195+ goto out;
33196+ }
33197+
33198+ ra_info.key_to_stop = f.key;
33199+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
33200+
33201+ while (f.length) {
33202+ result = find_cluster_item(hint, &f.key, mode,
33203+ NULL, FIND_EXACT,
33204+ (mode == ZNODE_WRITE_LOCK ?
33205+ CBK_FOR_INSERT : 0));
33206+ switch (result) {
33207+ case CBK_COORD_NOTFOUND:
33208+ result = 0;
33209+ if (inode_scaled_offset
33210+ (inode,
33211+ clust_to_off(cl_idx,
33212+ inode)) == get_key_offset(&f.key)) {
33213+ /* first item not found, this is treated
33214+ as disk cluster is absent */
33215+ clust->dstat = FAKE_DISK_CLUSTER;
33216+ goto out;
33217+ }
33218+ /* we are outside the cluster, stop search here */
33219+ assert("edward-146",
33220+ f.length != inode_scaled_cluster_size(inode));
33221+ goto ok;
33222+ case CBK_COORD_FOUND:
33223+ assert("edward-148",
33224+ hint->ext_coord.coord.between == AT_UNIT);
33225+ assert("edward-460",
33226+ hint->ext_coord.coord.unit_pos == 0);
33227+
33228+ coord_clear_iplug(&hint->ext_coord.coord);
33229+ result = zload_ra(hint->ext_coord.coord.node, &ra_info);
33230+ if (unlikely(result))
33231+ goto out;
33232+ iplug = item_plugin_by_coord(&hint->ext_coord.coord);
33233+ assert("edward-147",
33234+ item_id_by_coord(&hint->ext_coord.coord) ==
33235+ CTAIL_ID);
33236+
33237+ result = iplug->s.file.read(NULL, &f, hint);
33238+ if (result) {
33239+ zrelse(hint->ext_coord.coord.node);
33240+ goto out;
33241+ }
33242+ if (mode == ZNODE_WRITE_LOCK) {
33243+ /* Don't make dirty more nodes then it was
33244+ estimated (see comments before
33245+ estimate_dirty_cluster). Missed nodes will be
33246+ read up in flush time if they are evicted from
33247+ memory */
33248+ if (dclust_get_extension_ncount(hint) <=
33249+ estimate_dirty_cluster(inode))
33250+ znode_make_dirty(hint->ext_coord.coord.node);
33251+
33252+ znode_set_convertible(hint->ext_coord.coord.
33253+ node);
33254+ }
33255+ zrelse(hint->ext_coord.coord.node);
33256+ break;
33257+ default:
33258+ goto out;
33259+ }
33260+ }
33261+ ok:
33262+ /* at least one item was found */
33263+ /* NOTE-EDWARD: Callers should handle the case
33264+ when disk cluster is incomplete (-EIO) */
33265+ tc->len = inode_scaled_cluster_size(inode) - f.length;
33266+ tc->lsize = fsize_to_count(clust, inode);
33267+ assert("edward-1196", tc->len > 0);
33268+ assert("edward-1406", tc->lsize > 0);
33269+
33270+ if (hint_is_unprepped_dclust(clust->hint))
33271+ clust->dstat = UNPR_DISK_CLUSTER;
33272+ else {
33273+ dclust_set_extension_dsize(clust->hint, tc->len);
33274+ clust->dstat = PREP_DISK_CLUSTER;
33275+ }
33276+ out:
33277+ assert("edward-1339",
33278+ get_current_context()->grabbed_blocks >= was_grabbed);
33279+ grabbed2free(get_current_context(),
33280+ get_current_super_private(),
33281+ get_current_context()->grabbed_blocks - was_grabbed);
33282+ return result;
33283+}
33284+
33285+int
33286+get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
33287+ znode_lock_mode lock_mode)
33288+{
33289+ reiser4_key key;
33290+ ra_info_t ra_info;
33291+
33292+ assert("edward-730", reiser4_schedulable());
33293+ assert("edward-731", clust != NULL);
33294+ assert("edward-732", inode != NULL);
33295+
33296+ if (hint_is_valid(clust->hint)) {
33297+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
33298+ assert("edward-1294",
33299+ znode_is_write_locked(clust->hint->lh.node));
33300+ /* already have a valid locked position */
33301+ return (clust->dstat ==
33302+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
33303+ CBK_COORD_FOUND);
33304+ }
33305+ key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
33306+ &key);
33307+ ra_info.key_to_stop = key;
33308+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
33309+
33310+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
33311+ CBK_FOR_INSERT);
33312+}
33313+
33314+/* Read needed cluster pages before modifying.
33315+ If success, @clust->hint contains locked position in the tree.
33316+ Also:
33317+ . find and set disk cluster state
33318+ . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
33319+*/
33320+static int
33321+read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
33322+{
33323+ int i;
33324+ int result = 0;
33325+ item_plugin *iplug;
33326+ reiser4_slide_t *win = clust->win;
33327+ znode_lock_mode mode = ZNODE_WRITE_LOCK;
33328+
33329+ iplug = item_plugin_by_id(CTAIL_ID);
33330+
33331+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
33332+
33333+#if REISER4_DEBUG
33334+ if (clust->nr_pages == 0) {
33335+ /* start write hole from fake disk cluster */
33336+ assert("edward-1117", win != NULL);
33337+ assert("edward-1118", win->stat == HOLE_WINDOW);
33338+ assert("edward-1119", new_cluster(clust, inode));
33339+ }
33340+#endif
33341+ if (new_cluster(clust, inode)) {
33342+ /*
33343+ new page cluster is about to be written, nothing to read,
33344+ */
33345+ assert("edward-734", reiser4_schedulable());
33346+ assert("edward-735", clust->hint->lh.owner == NULL);
33347+
33348+ if (clust->nr_pages) {
33349+ int off;
33350+ char *data;
33351+ struct page * pg;
33352+ assert("edward-1419", clust->pages != NULL);
33353+ pg = clust->pages[clust->nr_pages - 1];
33354+ assert("edward-1420", pg != NULL);
33355+ off = off_to_pgoff(win->off+win->count+win->delta);
33356+ if (off) {
33357+ lock_page(pg);
33358+ data = kmap_atomic(pg, KM_USER0);
33359+ memset(data + off, 0, PAGE_CACHE_SIZE - off);
33360+ flush_dcache_page(pg);
33361+ kunmap_atomic(data, KM_USER0);
33362+ unlock_page(pg);
33363+ }
33364+ }
33365+ clust->dstat = FAKE_DISK_CLUSTER;
33366+ return 0;
33367+ }
33368+ /*
33369+ Here we should search for disk cluster to figure out its real state.
33370+ Also there is one more important reason to do disk search: we need
33371+ to make disk cluster _dirty_ if it exists
33372+ */
33373+
33374+ /* if windows is specified, read the only pages
33375+ that will be modified partially */
33376+
33377+ for (i = 0; i < clust->nr_pages; i++) {
33378+ struct page *pg = clust->pages[i];
33379+
33380+ lock_page(pg);
33381+ if (PageUptodate(pg)) {
33382+ unlock_page(pg);
33383+ continue;
33384+ }
33385+ unlock_page(pg);
33386+
33387+ if (win &&
33388+ i >= count_to_nrpages(win->off) &&
33389+ i < off_to_pg(win->off + win->count + win->delta))
33390+ /* page will be completely overwritten */
33391+ continue;
33392+
33393+ if (win && (i == clust->nr_pages - 1) &&
33394+ /* the last page is
33395+ partially modified,
33396+ not uptodate .. */
33397+ (count_to_nrpages(inode->i_size) <= pg->index)) {
33398+ /* .. and appended,
33399+ so set zeroes to the rest */
33400+ char *data;
33401+ int offset;
33402+ lock_page(pg);
33403+ data = kmap_atomic(pg, KM_USER0);
33404+
33405+ assert("edward-1260",
33406+ count_to_nrpages(win->off + win->count +
33407+ win->delta) - 1 == i);
33408+
33409+ offset =
33410+ off_to_pgoff(win->off + win->count + win->delta);
33411+ memset(data + offset, 0, PAGE_CACHE_SIZE - offset);
33412+ flush_dcache_page(pg);
33413+ kunmap_atomic(data, KM_USER0);
33414+ unlock_page(pg);
33415+ /* still not uptodate */
33416+ break;
33417+ }
33418+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
33419+ result = ctail_read_disk_cluster(clust, inode, mode);
33420+ if (result)
33421+ goto out;
33422+ assert("edward-925",
33423+ tfm_cluster_is_uptodate(&clust->tc));
33424+ }
33425+ lock_page(pg);
33426+ result = do_readpage_ctail(inode, clust, pg, mode);
33427+ unlock_page(pg);
33428+ if (result) {
33429+ impossible("edward-219",
33430+ "do_readpage_ctail returned crap");
33431+ goto out;
33432+ }
33433+ }
33434+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
33435+ /* disk cluster unclaimed, but we need to make its znodes dirty
33436+ to make flush update convert its content */
33437+ result = find_disk_cluster(clust, inode, 0 /* do not read items */,
33438+ mode);
33439+ }
33440+ out:
33441+ tfm_cluster_clr_uptodate(&clust->tc);
33442+ return result;
33443+}
33444+
33445+static int
33446+should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33447+{
33448+ assert("edward-737", clust != NULL);
33449+
33450+ switch (clust->dstat) {
33451+ case PREP_DISK_CLUSTER:
33452+ case UNPR_DISK_CLUSTER:
33453+ return 0;
33454+ case FAKE_DISK_CLUSTER:
33455+ if (clust->win &&
33456+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
33457+ assert("edward-1172", new_cluster(clust, inode));
33458+ return 0;
33459+ }
33460+ return 1;
33461+ default:
33462+ impossible("edward-1173", "bad disk cluster state");
33463+ return 0;
33464+ }
33465+}
33466+
33467+static int
33468+cryptcompress_make_unprepped_cluster(reiser4_cluster_t * clust,
33469+ struct inode *inode)
33470+{
33471+ int result;
33472+
33473+ assert("edward-1123", reiser4_schedulable());
33474+ assert("edward-737", clust != NULL);
33475+ assert("edward-738", inode != NULL);
33476+ assert("edward-739", cryptcompress_inode_ok(inode));
33477+ assert("edward-1053", clust->hint != NULL);
33478+
33479+ if (!should_create_unprepped_cluster(clust, inode)) {
33480+ if (clust->reserved) {
33481+ cluster_reserved2free(estimate_insert_cluster(inode));
33482+#if REISER4_DEBUG
33483+ assert("edward-1267",
33484+ clust->reserved_unprepped ==
33485+ estimate_insert_cluster(inode));
33486+ clust->reserved_unprepped -=
33487+ estimate_insert_cluster(inode);
33488+#endif
33489+ }
33490+ return 0;
33491+ }
33492+ assert("edward-1268", clust->reserved);
33493+ cluster_reserved2grabbed(estimate_insert_cluster(inode));
33494+#if REISER4_DEBUG
33495+ assert("edward-1441",
33496+ clust->reserved_unprepped == estimate_insert_cluster(inode));
33497+ clust->reserved_unprepped -= estimate_insert_cluster(inode);
33498+#endif
33499+ result = ctail_insert_unprepped_cluster(clust, inode);
33500+ if (result)
33501+ return result;
33502+
33503+ inode_add_bytes(inode, inode_cluster_size(inode));
33504+
33505+ assert("edward-743", cryptcompress_inode_ok(inode));
33506+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
33507+
33508+ clust->dstat = UNPR_DISK_CLUSTER;
33509+ return 0;
33510+}
33511+
33512+#if REISER4_DEBUG
33513+static int jnode_truncate_ok(struct inode *inode, cloff_t index)
33514+{
33515+ jnode *node;
33516+ node =
33517+ jlookup(current_tree, get_inode_oid(inode),
33518+ clust_to_pg(index, inode));
33519+ if (likely(!node))
33520+ return 1;
33521+ /* someone got this jnode */
33522+ warning("edward-1315", "jnode %p is untruncated\n", node);
33523+ jput(node);
33524+ return (atomic_read(&node->x_count));
33525+}
33526+#endif
33527+
33528+/* Collect unlocked cluster pages and jnode (the last is in the
33529+ case when the page cluster will be modified and captured) */
33530+int
33531+prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
33532+ int capture)
33533+{
33534+ assert("edward-177", inode != NULL);
33535+ assert("edward-741", cryptcompress_inode_ok(inode));
33536+ assert("edward-740", clust->pages != NULL);
33537+
33538+ set_cluster_nrpages(clust, inode);
33539+ reset_cluster_pgset(clust, cluster_nrpages(inode));
33540+ return (capture ?
33541+ grab_cluster_pages_jnode(inode, clust) :
33542+ grab_cluster_pages(inode, clust));
33543+}
33544+
33545+/* Truncate all pages of the cluster of index @index.
33546+ This is called by ->kill_hook() method of item plugin */
33547+void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t index,
33548+ int even_cows)
33549+{
33550+ int i;
33551+ int found = 0;
33552+ int nr_pages;
33553+ jnode *node;
33554+ struct page *pages[MAX_CLUSTER_NRPAGES];
33555+
33556+ node =
33557+ jlookup(current_tree, get_inode_oid(inode),
33558+ clust_to_pg(index, inode));
33559+ /* jnode is absent, just drop pages which can not
33560+ acquire jnode because of exclusive access */
33561+ if (!node)
33562+ goto truncate;
33563+ /* jnode is present and may be dirty */
33564+ nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode));
33565+
33566+ found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode),
33567+ nr_pages, pages);
33568+ spin_lock_jnode(node);
33569+
33570+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
33571+ && index == 0)
33572+ /* converting to unix_file in progress */
33573+ JF_CLR(node, JNODE_CLUSTER_PAGE);
33574+ if (JF_ISSET(node, JNODE_DIRTY)) {
33575+ /* someone has done modifications which are not
33576+ yet committed, so we need to release some resources */
33577+
33578+ /* free disk space grabbed for disk cluster converting */
33579+ cluster_reserved2grabbed(estimate_update_cluster(inode));
33580+ grabbed2free(get_current_context(),
33581+ get_current_super_private(),
33582+ estimate_update_cluster(inode));
33583+
33584+ assert("edward-1198", found == nr_pages);
33585+ assert("edward-1199", node->page_count == nr_pages);
33586+#if REISER4_DEBUG
33587+ node->page_count = 0;
33588+#endif
33589+ /* This will clear dirty bit */
33590+ uncapture_cluster_jnode(node);
33591+
33592+ /* put pages grabbed for last uncommitted modifications */
33593+ for (i = 0; i < nr_pages; i++) {
33594+ assert("edward-1200", PageUptodate(pages[i]));
33595+ page_cache_release(pages[i]);
33596+#if REISER4_DEBUG
33597+ cryptcompress_inode_data(inode)->pgcount --;
33598+#endif
33599+ }
33600+ } else
33601+ spin_unlock_jnode(node);
33602+ /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */
33603+
33604+ jput(node);
33605+ /* put pages found here */
33606+ forget_cluster_pages(pages, found);
33607+ truncate:
33608+ if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
33609+ index == 0)
33610+ return;
33611+ reiser4_invalidate_pages(inode->i_mapping,
33612+ clust_to_pg(index, inode),
33613+ cluster_nrpages(inode),
33614+ even_cows);
33615+ assert("edward-1201",
33616+ ergo(!reiser4_inode_get_flag(inode,
33617+ REISER4_FILE_CONV_IN_PROGRESS),
33618+ jnode_truncate_ok(inode, index)));
33619+ return;
33620+}
33621+
33622+/* Prepare cluster handle before(after) modifications
33623+ which are supposed to be committed.
33624+
33625+ . grab cluster pages;
33626+ . reserve disk space;
33627+ . maybe read pages from disk and set the disk cluster dirty;
33628+ . maybe write hole;
33629+ . maybe create 'unprepped' disk cluster if the last one is fake
33630+ (i.e. is not represenred by any items)
33631+*/
33632+
33633+static int
33634+prepare_cluster(struct inode *inode,
33635+ loff_t file_off /* write position in the file */ ,
33636+ loff_t to_file, /* bytes of users data to write to the file */
33637+ reiser4_cluster_t * clust, page_cluster_op op)
33638+{
33639+ int result = 0;
33640+ reiser4_slide_t *win = clust->win;
33641+
33642+ reset_cluster_params(clust);
33643+ cluster_set_tfm_act(&clust->tc, TFMA_READ);
33644+#if REISER4_DEBUG
33645+ clust->ctx = get_current_context();
33646+#endif
33647+ assert("edward-1190", op != PCL_UNKNOWN);
33648+
33649+ clust->op = op;
33650+
33651+ result = prepare_page_cluster(inode, clust, 1);
33652+ if (result)
33653+ return result;
33654+ assert("edward-1447",
33655+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
33656+ assert("edward-1448",
33657+ ergo(clust->nr_pages != 0,
33658+ jnode_is_cluster_page(jprivate(clust->pages[0]))));
33659+
33660+ result = reserve4cluster(inode, clust);
33661+ if (result)
33662+ goto err1;
33663+ result = read_some_cluster_pages(inode, clust);
33664+ if (result) {
33665+ free_reserved4cluster(inode,
33666+ clust,
33667+ estimate_update_cluster(inode) +
33668+ estimate_insert_cluster(inode));
33669+ goto err1;
33670+ }
33671+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33672+
33673+ result = cryptcompress_make_unprepped_cluster(clust, inode);
33674+ if (result)
33675+ goto err2;
33676+ if (win && win->stat == HOLE_WINDOW) {
33677+ result = write_hole(inode, clust, file_off, to_file);
33678+ if (result)
33679+ goto err2;
33680+ }
33681+ return 0;
33682+ err2:
33683+ free_reserved4cluster(inode, clust,
33684+ estimate_update_cluster(inode));
33685+ err1:
33686+ reiser4_release_cluster_pages_and_jnode(clust);
33687+ assert("edward-1125", result == -ENOSPC);
33688+ return result;
33689+}
33690+
33691+/* set window by two offsets */
33692+static void
33693+set_window(reiser4_cluster_t * clust, reiser4_slide_t * win,
33694+ struct inode *inode, loff_t o1, loff_t o2)
33695+{
33696+ assert("edward-295", clust != NULL);
33697+ assert("edward-296", inode != NULL);
33698+ assert("edward-1071", win != NULL);
33699+ assert("edward-297", o1 <= o2);
33700+
33701+ clust->index = off_to_clust(o1, inode);
33702+
33703+ win->off = off_to_cloff(o1, inode);
33704+ win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1);
33705+ win->delta = 0;
33706+
33707+ clust->win = win;
33708+}
33709+
33710+static int
33711+set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust,
33712+ reiser4_slide_t * win, flow_t * f, loff_t file_off)
33713+{
33714+ int result;
33715+
33716+ assert("edward-197", clust != NULL);
33717+ assert("edward-1072", win != NULL);
33718+ assert("edward-198", inode != NULL);
33719+
33720+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33721+ if (result)
33722+ return result;
33723+
33724+ if (file_off > inode->i_size) {
33725+ /* Uhmm, hole in cryptcompress file... */
33726+ loff_t hole_size;
33727+ hole_size = file_off - inode->i_size;
33728+
33729+ set_window(clust, win, inode, inode->i_size, file_off);
33730+ win->stat = HOLE_WINDOW;
33731+ if (win->off + hole_size < inode_cluster_size(inode))
33732+ /* there is also user's data to append to the hole */
33733+ win->delta =
33734+ min_count(inode_cluster_size(inode) -
33735+ (win->off + win->count), f->length);
33736+ return 0;
33737+ }
33738+ set_window(clust, win, inode, file_off, file_off + f->length);
33739+ win->stat = DATA_WINDOW;
33740+ return 0;
33741+}
33742+
33743+int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
33744+ int count)
33745+{
33746+ int result = 0;
33747+ int (*setting_actor)(reiser4_cluster_t * clust, int count);
33748+
33749+ assert("edward-1358", clust != NULL);
33750+ assert("edward-1359", page != NULL);
33751+ assert("edward-1360", page->mapping != NULL);
33752+ assert("edward-1361", page->mapping->host != NULL);
33753+
33754+ setting_actor = (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33755+ result = setting_actor(clust, count);
33756+ clust->index = pg_to_clust(page->index, page->mapping->host);
33757+ return result;
33758+}
33759+
33760+/* reset all the params that not get updated */
33761+void reset_cluster_params(reiser4_cluster_t * clust)
33762+{
33763+ assert("edward-197", clust != NULL);
33764+
33765+ clust->dstat = INVAL_DISK_CLUSTER;
33766+ clust->tc.uptodate = 0;
33767+ clust->tc.len = 0;
33768+}
33769+
33770+/* Core write procedure of cryptcompress plugin, which slices user's
33771+ flow into logical clusters, maps the last ones to the appropriate
33772+ page clusters, and tries to capture them.
33773+ If @buf != NULL, returns number of successfully written bytes,
33774+ otherwise returns error
33775+*/
33776+static loff_t
33777+write_cryptcompress_flow(struct file *file, struct inode *inode,
33778+ const char __user *buf, size_t count, loff_t pos,
33779+ int *conv_occured)
33780+{
33781+ int i;
33782+ flow_t f;
33783+ hint_t *hint;
33784+ int result = 0;
33785+ size_t to_write = 0;
33786+ loff_t file_off;
33787+ reiser4_slide_t win;
33788+ reiser4_cluster_t clust;
33789+
33790+ assert("edward-161", reiser4_schedulable());
33791+ assert("edward-748", cryptcompress_inode_ok(inode));
33792+ assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33793+ assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33794+
33795+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33796+ if (hint == NULL)
33797+ return RETERR(-ENOMEM);
33798+
33799+ result = load_file_hint(file, hint);
33800+ if (result) {
33801+ kfree(hint);
33802+ return result;
33803+ }
33804+
33805+ result =
33806+ flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ ,
33807+ count, pos, WRITE_OP, &f);
33808+ if (result)
33809+ goto out;
33810+ to_write = f.length;
33811+
33812+ /* current write position in file */
33813+ file_off = pos;
33814+ reiser4_slide_init(&win);
33815+ cluster_init_read(&clust, &win);
33816+ clust.hint = hint;
33817+
33818+ result = set_cluster_by_window(inode, &clust, &win, &f, file_off);
33819+ if (result)
33820+ goto out;
33821+
33822+ if (next_window_stat(&win) == HOLE_WINDOW) {
33823+ result = write_conversion_hook(file, inode, pos, &clust, NULL);
33824+ if (result)
33825+ goto out;
33826+ result =
33827+ prepare_cluster(inode, file_off, f.length, &clust,
33828+ PCL_APPEND);
33829+ if (result)
33830+ goto out;
33831+ }
33832+ do {
33833+ char *src;
33834+ unsigned page_off, page_count;
33835+
33836+ assert("edward-750", reiser4_schedulable());
33837+
33838+ result = write_conversion_hook(file, inode, pos, &clust,
33839+ conv_occured);
33840+ if (result || *conv_occured)
33841+ goto out;
33842+ result =
33843+ prepare_cluster(inode, file_off, f.length, &clust,
33844+ PCL_APPEND);
33845+ if (result)
33846+ goto out;
33847+
33848+ assert("edward-751", cryptcompress_inode_ok(inode));
33849+ assert("edward-204", win.stat == DATA_WINDOW);
33850+ assert("edward-1288", hint_is_valid(clust.hint));
33851+ assert("edward-752",
33852+ znode_is_write_locked(hint->ext_coord.coord.node));
33853+
33854+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33855+
33856+ /* set write position in page */
33857+ page_off = off_to_pgoff(win.off);
33858+
33859+ /* copy user's data to cluster pages */
33860+ for (i = off_to_pg(win.off), src = f.data;
33861+ i < count_to_nrpages(win.off + win.count);
33862+ i++, src += page_count) {
33863+ page_count =
33864+ cnt_to_pgcnt(win.off + win.count, i) - page_off;
33865+
33866+ assert("edward-1039",
33867+ page_off + page_count <= PAGE_CACHE_SIZE);
33868+ assert("edward-287", clust.pages[i] != NULL);
33869+
33870+ lock_page(clust.pages[i]);
33871+ result =
33872+ __copy_from_user((char *)kmap(clust.pages[i]) +
33873+ page_off, (char __user *)src, page_count);
33874+ kunmap(clust.pages[i]);
33875+ if (unlikely(result)) {
33876+ unlock_page(clust.pages[i]);
33877+ result = -EFAULT;
33878+ goto err2;
33879+ }
33880+ SetPageUptodate(clust.pages[i]);
33881+ unlock_page(clust.pages[i]);
33882+ page_off = 0;
33883+ }
33884+ assert("edward-753", cryptcompress_inode_ok(inode));
33885+
33886+ set_cluster_pages_dirty(&clust);
33887+
33888+ result = try_capture_cluster(&clust, inode);
33889+ if (result)
33890+ goto err2;
33891+
33892+ assert("edward-998", f.user == 1);
33893+
33894+ move_flow_forward(&f, win.count);
33895+
33896+ /* disk cluster may be already clean at this point */
33897+
33898+ /* . update cluster
33899+ . set hint for new offset
33900+ . unlock znode
33901+ . update inode
33902+ . balance dirty pages
33903+ */
33904+ result = balance_dirty_page_cluster(&clust, inode, 0, f.length);
33905+ if (result)
33906+ goto err1;
33907+ assert("edward-755", hint->lh.owner == NULL);
33908+ reset_cluster_params(&clust);
33909+ continue;
33910+ err2:
33911+ reiser4_release_cluster_pages_and_jnode(&clust);
33912+ err1:
33913+ if (clust.reserved)
33914+ free_reserved4cluster(inode,
33915+ &clust,
33916+ estimate_update_cluster(inode));
33917+ break;
33918+ } while (f.length);
33919+ out:
33920+ done_lh(&hint->lh);
33921+ if (result == -EEXIST)
33922+ warning("edward-1407", "write returns EEXIST!\n");
33923+
33924+ put_cluster_handle(&clust);
33925+ save_file_hint(file, hint);
33926+ kfree(hint);
33927+ if (buf) {
33928+ /* if nothing were written - there must be an error */
33929+ assert("edward-195", ergo((to_write == f.length),
33930+ (result < 0 || *conv_occured)));
33931+ return (to_write - f.length) ? (to_write - f.length) : result;
33932+ }
33933+ return result;
33934+}
33935+
33936+/**
33937+ * write_cryptcompress - write of struct file_operations
33938+ * @file: file to write to
33939+ * @buf: address of user-space buffer
33940+ * @read_amount: number of bytes to write
33941+ * @off: position in file to write to
33942+ *
33943+ * This is implementation of vfs's write method of struct file_operations for
33944+ * cryptcompress plugin.
33945+ */
33946+ssize_t write_cryptcompress(struct file *file, const char __user *buf,
33947+ size_t count, loff_t *off, int *conv)
33948+{
33949+ ssize_t result;
33950+ struct inode *inode;
33951+ reiser4_context *ctx;
33952+ loff_t pos = *off;
33953+ cryptcompress_info_t *info;
33954+
33955+ assert("edward-1449", *conv == 0);
33956+
33957+ inode = file->f_dentry->d_inode;
33958+ assert("edward-196", cryptcompress_inode_ok(inode));
33959+
33960+ info = cryptcompress_inode_data(inode);
33961+
33962+ ctx = reiser4_init_context(inode->i_sb);
33963+ if (IS_ERR(ctx))
33964+ return PTR_ERR(ctx);
33965+
33966+ mutex_lock(&inode->i_mutex);
33967+
33968+ result = generic_write_checks(file, &pos, &count, 0);
33969+ if (unlikely(result != 0))
33970+ goto out;
33971+ if (unlikely(count == 0))
33972+ goto out;
33973+ result = remove_suid(file->f_dentry);
33974+ if (unlikely(result != 0))
33975+ goto out;
33976+ /* remove_suid might create a transaction */
33977+ reiser4_txn_restart(ctx);
33978+
33979+ result = write_cryptcompress_flow(file, inode, buf, count, pos, conv);
33980+
33981+ if (result < 0)
33982+ goto out;
33983+ /* update position in a file */
33984+ *off = pos + result;
33985+ out:
33986+ mutex_unlock(&inode->i_mutex);
33987+
33988+ context_set_commit_async(ctx);
33989+ reiser4_exit_context(ctx);
33990+ return result;
33991+}
33992+
33993+int readpages_cryptcompress(struct file *file, struct address_space *mapping,
33994+ struct list_head *pages, unsigned nr_pages)
33995+{
33996+ reiser4_context * ctx;
33997+ int ret;
33998+
33999+ ctx = reiser4_init_context(mapping->host->i_sb);
34000+ if (IS_ERR(ctx)) {
34001+ ret = PTR_ERR(ctx);
34002+ goto err;
34003+ }
34004+ /* crc files can be built of ctail items only */
34005+ ret = readpages_ctail(file, mapping, pages);
34006+ reiser4_exit_context(ctx);
34007+ if (ret) {
34008+err:
34009+ put_pages_list(pages);
34010+ }
34011+ return ret;
34012+}
34013+
34014+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
34015+{
34016+ /* reserve one block to update stat data item */
34017+ assert("edward-1193",
34018+ inode_file_plugin(inode)->estimate.update ==
34019+ estimate_update_common);
34020+ return estimate_update_common(inode);
34021+}
34022+
34023+/**
34024+ * read_cryptcompress - read of struct file_operations
34025+ * @file: file to read from
34026+ * @buf: address of user-space buffer
34027+ * @read_amount: number of bytes to read
34028+ * @off: position in file to read from
34029+ *
34030+ * This is implementation of vfs's read method of struct file_operations for
34031+ * cryptcompress plugin.
34032+ */
34033+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
34034+ loff_t * off)
34035+{
34036+ ssize_t result;
34037+ struct inode *inode;
34038+ reiser4_context *ctx;
34039+ cryptcompress_info_t *info;
34040+ reiser4_block_nr needed;
34041+
34042+ inode = file->f_dentry->d_inode;
34043+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34044+
34045+ ctx = reiser4_init_context(inode->i_sb);
34046+ if (IS_ERR(ctx))
34047+ return PTR_ERR(ctx);
34048+
34049+ info = cryptcompress_inode_data(inode);
34050+ needed = cryptcompress_estimate_read(inode);
34051+
34052+ result = reiser4_grab_space(needed, BA_CAN_COMMIT);
34053+ if (result != 0) {
34054+ reiser4_exit_context(ctx);
34055+ return result;
34056+ }
34057+
34058+ LOCK_CNT_INC(inode_sem_r);
34059+
34060+ result = do_sync_read(file, buf, size, off);
34061+
34062+ LOCK_CNT_DEC(inode_sem_r);
34063+
34064+ context_set_commit_async(ctx);
34065+ reiser4_exit_context(ctx);
34066+
34067+ return result;
34068+}
34069+
34070+/* If @index > 0, find real disk cluster of the index (@index - 1),
34071+ If @index == 0 find the real disk cluster of the object of maximal index.
34072+ Keep incremented index of the result in @found.
34073+ It succes was returned:
34074+ (@index == 0 && @found == 0) means that the object doesn't have real disk
34075+ clusters.
34076+ (@index != 0 && @found == 0) means that disk cluster of (@index -1) doesn't
34077+ exist.
34078+*/
34079+static int
34080+find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index)
34081+{
34082+ int result;
34083+ reiser4_key key;
34084+ loff_t offset;
34085+ hint_t *hint;
34086+ lock_handle *lh;
34087+ lookup_bias bias;
34088+ coord_t *coord;
34089+ item_plugin *iplug;
34090+
34091+ assert("edward-1131", inode != NULL);
34092+ assert("edward-95", cryptcompress_inode_ok(inode));
34093+
34094+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34095+ if (hint == NULL)
34096+ return RETERR(-ENOMEM);
34097+ hint_init_zero(hint);
34098+ lh = &hint->lh;
34099+
34100+ bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
34101+ offset =
34102+ (index ? clust_to_off(index, inode) -
34103+ 1 : get_key_offset(reiser4_max_key()));
34104+
34105+ key_by_inode_cryptcompress(inode, offset, &key);
34106+
34107+ /* find the last item of this object */
34108+ result =
34109+ find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
34110+ bias, 0);
34111+ if (cbk_errored(result)) {
34112+ done_lh(lh);
34113+ kfree(hint);
34114+ return result;
34115+ }
34116+ if (result == CBK_COORD_NOTFOUND) {
34117+ /* no real disk clusters */
34118+ done_lh(lh);
34119+ kfree(hint);
34120+ *found = 0;
34121+ return 0;
34122+ }
34123+ /* disk cluster is found */
34124+ coord = &hint->ext_coord.coord;
34125+ coord_clear_iplug(coord);
34126+ result = zload(coord->node);
34127+ if (unlikely(result)) {
34128+ done_lh(lh);
34129+ kfree(hint);
34130+ return result;
34131+ }
34132+ iplug = item_plugin_by_coord(coord);
34133+ assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
34134+ assert("edward-1202", ctail_ok(coord));
34135+
34136+ item_key_by_coord(coord, &key);
34137+ *found = off_to_clust(get_key_offset(&key), inode) + 1;
34138+
34139+ assert("edward-1132", ergo(index, index == *found));
34140+
34141+ zrelse(coord->node);
34142+ done_lh(lh);
34143+ kfree(hint);
34144+ return 0;
34145+}
34146+
34147+static int find_fake_appended(struct inode *inode, cloff_t * index)
34148+{
34149+ return find_real_disk_cluster(inode, index,
34150+ 0 /* find last real one */ );
34151+}
34152+
34153+/* Set left coord when unit is not found after node_lookup()
34154+ This takes into account that there can be holes in a sequence
34155+ of disk clusters */
34156+
34157+static void adjust_left_coord(coord_t * left_coord)
34158+{
34159+ switch (left_coord->between) {
34160+ case AFTER_UNIT:
34161+ left_coord->between = AFTER_ITEM;
34162+ case AFTER_ITEM:
34163+ case BEFORE_UNIT:
34164+ break;
34165+ default:
34166+ impossible("edward-1204", "bad left coord to cut");
34167+ }
34168+ return;
34169+}
34170+
34171+#define CRC_CUT_TREE_MIN_ITERATIONS 64
34172+int
34173+cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
34174+ const reiser4_key * to_key,
34175+ reiser4_key * smallest_removed,
34176+ struct inode *object, int truncate, int *progress)
34177+{
34178+ lock_handle next_node_lock;
34179+ coord_t left_coord;
34180+ int result;
34181+
34182+ assert("edward-1158", tap->coord->node != NULL);
34183+ assert("edward-1159", znode_is_write_locked(tap->coord->node));
34184+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
34185+
34186+ *progress = 0;
34187+ init_lh(&next_node_lock);
34188+
34189+ while (1) {
34190+ znode *node; /* node from which items are cut */
34191+ node_plugin *nplug; /* node plugin for @node */
34192+
34193+ node = tap->coord->node;
34194+
34195+ /* Move next_node_lock to the next node on the left. */
34196+ result =
34197+ reiser4_get_left_neighbor(&next_node_lock, node,
34198+ ZNODE_WRITE_LOCK,
34199+ GN_CAN_USE_UPPER_LEVELS);
34200+ if (result != 0 && result != -E_NO_NEIGHBOR)
34201+ break;
34202+ /* FIXME-EDWARD: Check can we delete the node as a whole. */
34203+ result = reiser4_tap_load(tap);
34204+ if (result)
34205+ return result;
34206+
34207+ /* Prepare the second (right) point for cut_node() */
34208+ if (*progress)
34209+ coord_init_last_unit(tap->coord, node);
34210+
34211+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
34212+ /* set rightmost unit for the items without lookup method */
34213+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
34214+
34215+ nplug = node->nplug;
34216+
34217+ assert("edward-1161", nplug);
34218+ assert("edward-1162", nplug->lookup);
34219+
34220+ /* left_coord is leftmost unit cut from @node */
34221+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
34222+
34223+ if (IS_CBKERR(result))
34224+ break;
34225+
34226+ if (result == CBK_COORD_NOTFOUND)
34227+ adjust_left_coord(&left_coord);
34228+
34229+ /* adjust coordinates so that they are set to existing units */
34230+ if (coord_set_to_right(&left_coord)
34231+ || coord_set_to_left(tap->coord)) {
34232+ result = 0;
34233+ break;
34234+ }
34235+
34236+ if (coord_compare(&left_coord, tap->coord) ==
34237+ COORD_CMP_ON_RIGHT) {
34238+ /* keys from @from_key to @to_key are not in the tree */
34239+ result = 0;
34240+ break;
34241+ }
34242+
34243+ /* cut data from one node */
34244+ *smallest_removed = *reiser4_min_key();
34245+ result = kill_node_content(&left_coord,
34246+ tap->coord,
34247+ from_key,
34248+ to_key,
34249+ smallest_removed,
34250+ next_node_lock.node,
34251+ object, truncate);
34252+#if REISER4_DEBUG
34253+ /*node_check(node, ~0U); */
34254+#endif
34255+ reiser4_tap_relse(tap);
34256+
34257+ if (result)
34258+ break;
34259+
34260+ ++(*progress);
34261+
34262+ /* Check whether all items with keys >= from_key were removed
34263+ * from the tree. */
34264+ if (keyle(smallest_removed, from_key))
34265+ /* result = 0; */
34266+ break;
34267+
34268+ if (next_node_lock.node == NULL)
34269+ break;
34270+
34271+ result = reiser4_tap_move(tap, &next_node_lock);
34272+ done_lh(&next_node_lock);
34273+ if (result)
34274+ break;
34275+
34276+ /* Break long cut_tree operation (deletion of a large file) if
34277+ * atom requires commit. */
34278+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
34279+ && current_atom_should_commit()) {
34280+ result = -E_REPEAT;
34281+ break;
34282+ }
34283+ }
34284+ done_lh(&next_node_lock);
34285+ return result;
34286+}
34287+
34288+/* Append or expand hole in two steps (exclusive access should be aquired!)
34289+ 1) write zeroes to the current real cluster,
34290+ 2) expand hole via fake clusters (just increase i_size) */
34291+static int
34292+cryptcompress_append_hole(struct inode *inode /*contains old i_size */ ,
34293+ loff_t new_size)
34294+{
34295+ int result = 0;
34296+ hint_t *hint;
34297+ lock_handle *lh;
34298+ loff_t hole_size;
34299+ int nr_zeroes;
34300+ reiser4_slide_t win;
34301+ reiser4_cluster_t clust;
34302+
34303+ assert("edward-1133", inode->i_size < new_size);
34304+ assert("edward-1134", reiser4_schedulable());
34305+ assert("edward-1135", cryptcompress_inode_ok(inode));
34306+ assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
34307+ assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
34308+
34309+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34310+ if (hint == NULL)
34311+ return RETERR(-ENOMEM);
34312+ hint_init_zero(hint);
34313+ lh = &hint->lh;
34314+
34315+ reiser4_slide_init(&win);
34316+ cluster_init_read(&clust, &win);
34317+ clust.hint = hint;
34318+
34319+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34320+ if (result)
34321+ goto out;
34322+ if (off_to_cloff(inode->i_size, inode) == 0)
34323+ goto fake_append;
34324+ hole_size = new_size - inode->i_size;
34325+ nr_zeroes =
34326+ inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
34327+ if (hole_size < nr_zeroes)
34328+ nr_zeroes = hole_size;
34329+ set_window(&clust, &win, inode, inode->i_size,
34330+ inode->i_size + nr_zeroes);
34331+ win.stat = HOLE_WINDOW;
34332+
34333+ assert("edward-1137",
34334+ clust.index == off_to_clust(inode->i_size, inode));
34335+
34336+ result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND);
34337+
34338+ assert("edward-1271", !result || result == -ENOSPC);
34339+ if (result)
34340+ goto out;
34341+ assert("edward-1139",
34342+ clust.dstat == PREP_DISK_CLUSTER ||
34343+ clust.dstat == UNPR_DISK_CLUSTER);
34344+
34345+ assert("edward-1431", hole_size >= nr_zeroes);
34346+ if (hole_size == nr_zeroes)
34347+ /* nothing to append anymore */
34348+ goto out;
34349+ fake_append:
34350+ INODE_SET_FIELD(inode, i_size, new_size);
34351+ out:
34352+ done_lh(lh);
34353+ kfree(hint);
34354+ put_cluster_handle(&clust);
34355+ return result;
34356+}
34357+
34358+#if REISER4_DEBUG
34359+static int
34360+pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start)
34361+{
34362+ struct pagevec pvec;
34363+ int i;
34364+ int count;
34365+ int rest;
34366+
34367+ rest = count_to_nrpages(old_size) - start;
34368+
34369+ pagevec_init(&pvec, 0);
34370+ count = min_count(pagevec_space(&pvec), rest);
34371+
34372+ while (rest) {
34373+ count = min_count(pagevec_space(&pvec), rest);
34374+ pvec.nr = find_get_pages(inode->i_mapping, start,
34375+ count, pvec.pages);
34376+ for (i = 0; i < pagevec_count(&pvec); i++) {
34377+ if (PageUptodate(pvec.pages[i])) {
34378+ warning("edward-1205",
34379+ "truncated page of index %lu is uptodate",
34380+ pvec.pages[i]->index);
34381+ return 0;
34382+ }
34383+ }
34384+ start += count;
34385+ rest -= count;
34386+ pagevec_release(&pvec);
34387+ }
34388+ return 1;
34389+}
34390+
34391+static int body_truncate_ok(struct inode *inode, cloff_t aidx)
34392+{
34393+ int result;
34394+ cloff_t raidx;
34395+
34396+ result = find_fake_appended(inode, &raidx);
34397+ return !result && (aidx == raidx);
34398+}
34399+#endif
34400+
34401+static int
34402+update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
34403+{
34404+ return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
34405+ ? 0 : reiser4_update_file_size(inode, key, update_sd));
34406+}
34407+
34408+/* prune cryptcompress file in two steps (exclusive access should be acquired!)
34409+ 1) cut all disk clusters but the last one partially truncated,
34410+ 2) set zeroes and capture last partially truncated page cluster if the last
34411+ one exists, otherwise truncate via prune fake cluster (just decrease i_size)
34412+*/
34413+static int
34414+prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd,
34415+ cloff_t aidx)
34416+{
34417+ int result = 0;
34418+ unsigned nr_zeroes;
34419+ loff_t to_prune;
34420+ loff_t old_size;
34421+ cloff_t ridx;
34422+
34423+ hint_t *hint;
34424+ lock_handle *lh;
34425+ reiser4_slide_t win;
34426+ reiser4_cluster_t clust;
34427+
34428+ assert("edward-1140", inode->i_size >= new_size);
34429+ assert("edward-1141", reiser4_schedulable());
34430+ assert("edward-1142", cryptcompress_inode_ok(inode));
34431+ assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
34432+
34433+ old_size = inode->i_size;
34434+
34435+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34436+ if (hint == NULL)
34437+ return RETERR(-ENOMEM);
34438+ hint_init_zero(hint);
34439+ lh = &hint->lh;
34440+
34441+ reiser4_slide_init(&win);
34442+ cluster_init_read(&clust, &win);
34443+ clust.hint = hint;
34444+
34445+ /* rightmost completely truncated cluster */
34446+ ridx = count_to_nrclust(new_size, inode);
34447+
34448+ assert("edward-1174", ridx <= aidx);
34449+ old_size = inode->i_size;
34450+ if (ridx != aidx) {
34451+ result = cut_file_items(inode,
34452+ clust_to_off(ridx, inode),
34453+ update_sd,
34454+ clust_to_off(aidx, inode),
34455+ update_cryptcompress_size);
34456+ if (result)
34457+ goto out;
34458+ }
34459+ if (!off_to_cloff(new_size, inode)) {
34460+ /* no partially truncated clusters */
34461+ assert("edward-1145", inode->i_size == new_size);
34462+ goto finish;
34463+ }
34464+ assert("edward-1146", new_size < inode->i_size);
34465+
34466+ to_prune = inode->i_size - new_size;
34467+
34468+ /* partial truncate of leftmost cluster,
34469+ first check if it is fake */
34470+ result = find_real_disk_cluster(inode, &aidx, ridx);
34471+ if (result)
34472+ goto out;
34473+ if (!aidx)
34474+ /* yup, this is fake one */
34475+ goto finish;
34476+
34477+ assert("edward-1148", aidx == ridx);
34478+
34479+ /* do partial truncate of the leftmost page cluster,
34480+ then try to capture this one */
34481+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34482+ if (result)
34483+ goto out;
34484+ nr_zeroes = (off_to_pgoff(new_size) ?
34485+ PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
34486+ set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
34487+ win.stat = HOLE_WINDOW;
34488+
34489+ assert("edward-1149", clust.index == ridx - 1);
34490+
34491+ result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE);
34492+ if (result)
34493+ goto out;
34494+ assert("edward-1151",
34495+ clust.dstat == PREP_DISK_CLUSTER ||
34496+ clust.dstat == UNPR_DISK_CLUSTER);
34497+
34498+ assert("edward-1191", inode->i_size == new_size);
34499+ assert("edward-1206", body_truncate_ok(inode, ridx));
34500+ finish:
34501+ /* drop all the pages that don't have jnodes (i.e. pages
34502+ which can not be truncated by cut_file_items() because
34503+ of holes represented by fake disk clusters) including
34504+ the pages of partially truncated cluster which was
34505+ released by prepare_cluster() */
34506+ truncate_inode_pages(inode->i_mapping, new_size);
34507+ INODE_SET_FIELD(inode, i_size, new_size);
34508+ out:
34509+ assert("edward-1334", !result || result == -ENOSPC);
34510+ assert("edward-1209",
34511+ pages_truncate_ok(inode, old_size, count_to_nrpages(new_size)));
34512+ done_lh(lh);
34513+ kfree(hint);
34514+ put_cluster_handle(&clust);
34515+ return result;
34516+}
34517+
34518+/* Prepare cryptcompress file for truncate:
34519+ prune or append rightmost fake logical clusters (if any)
34520+*/
34521+static int
34522+start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size,
34523+ int update_sd)
34524+{
34525+ int result = 0;
34526+ int bytes;
34527+
34528+ if (new_size > inode->i_size) {
34529+ /* append */
34530+ if (inode->i_size < clust_to_off(aidx, inode))
34531+ /* no fake bytes */
34532+ return 0;
34533+ bytes = new_size - inode->i_size;
34534+ INODE_SET_FIELD(inode, i_size, inode->i_size + bytes);
34535+ } else {
34536+ /* prune */
34537+ if (inode->i_size <= clust_to_off(aidx, inode))
34538+ /* no fake bytes */
34539+ return 0;
34540+ bytes =
34541+ inode->i_size - max_count(new_size,
34542+ clust_to_off(aidx, inode));
34543+ if (!bytes)
34544+ return 0;
34545+ INODE_SET_FIELD(inode, i_size, inode->i_size - bytes);
34546+ /* In the case of fake prune we need to drop page cluster.
34547+ There are only 2 cases for partially truncated page:
34548+ 1. If is is dirty, therefore it is anonymous
34549+ (was dirtied via mmap), and will be captured
34550+ later via ->capture().
34551+ 2. If is clean, therefore it is filled by zeroes.
34552+ In both cases we don't need to make it dirty and
34553+ capture here.
34554+ */
34555+ truncate_inode_pages(inode->i_mapping, inode->i_size);
34556+ }
34557+ if (update_sd)
34558+ result = update_sd_cryptcompress(inode);
34559+ return result;
34560+}
34561+
34562+/* This is called in setattr_cryptcompress when it is used to truncate,
34563+ and in delete_cryptcompress */
34564+static int cryptcompress_truncate(struct inode *inode, /* old size */
34565+ loff_t new_size, /* new size */
34566+ int update_sd)
34567+{
34568+ int result;
34569+ cloff_t aidx;
34570+
34571+ result = find_fake_appended(inode, &aidx);
34572+ if (result)
34573+ return result;
34574+ assert("edward-1208",
34575+ ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34576+
34577+ result = start_truncate_fake(inode, aidx, new_size, update_sd);
34578+ if (result)
34579+ return result;
34580+ if (inode->i_size == new_size)
34581+ /* nothing to truncate anymore */
34582+ return 0;
34583+ result = (inode->i_size < new_size ?
34584+ cryptcompress_append_hole(inode, new_size) :
34585+ prune_cryptcompress(inode, new_size, update_sd, aidx));
34586+ if (!result && update_sd)
34587+ result = update_sd_cryptcompress(inode);
34588+ return result;
34589+}
34590+
34591+static void clear_moved_tag_cluster(struct address_space * mapping,
34592+ reiser4_cluster_t * clust)
34593+{
34594+ int i;
34595+ void * ret;
34596+ read_lock_irq(&mapping->tree_lock);
34597+ for (i = 0; i < clust->nr_pages; i++) {
34598+ assert("edward-1438", clust->pages[i] != NULL);
34599+ ret = radix_tree_tag_clear(&mapping->page_tree,
34600+ clust->pages[i]->index,
34601+ PAGECACHE_TAG_REISER4_MOVED);
34602+ assert("edward-1439", ret == clust->pages[i]);
34603+ }
34604+ read_unlock_irq(&mapping->tree_lock);
34605+}
34606+
34607+/* Capture an anonymous pager cluster. (Page cluser is
34608+ anonymous if it contains at least one anonymous page */
34609+static int
34610+capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
34611+{
34612+ int result;
34613+
34614+ assert("edward-1073", clust != NULL);
34615+ assert("edward-1074", inode != NULL);
34616+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34617+
34618+ result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND);
34619+ if (result)
34620+ return result;
34621+ set_cluster_pages_dirty(clust);
34622+ clear_moved_tag_cluster(inode->i_mapping, clust);
34623+
34624+ result = try_capture_cluster(clust, inode);
34625+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34626+ if (unlikely(result)) {
34627+ /* set cleared tag back, so it will be
34628+ possible to capture it again later */
34629+ read_lock_irq(&inode->i_mapping->tree_lock);
34630+ radix_tree_tag_set(&inode->i_mapping->page_tree,
34631+ clust_to_pg(clust->index, inode),
34632+ PAGECACHE_TAG_REISER4_MOVED);
34633+ read_unlock_irq(&inode->i_mapping->tree_lock);
34634+
34635+ reiser4_release_cluster_pages_and_jnode(clust);
34636+ }
34637+ return result;
34638+}
34639+
34640+#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode))
34641+
34642+/* read lock should be acquired */
34643+static int
34644+capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index,
34645+ int to_capture)
34646+{
34647+ int result = 0;
34648+ int found;
34649+ struct page *page = NULL;
34650+ hint_t *hint;
34651+ lock_handle *lh;
34652+ reiser4_cluster_t clust;
34653+
34654+ assert("edward-1127", mapping != NULL);
34655+ assert("edward-1128", mapping->host != NULL);
34656+ assert("edward-1440", mapping->host->i_mapping == mapping);
34657+
34658+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34659+ if (hint == NULL)
34660+ return RETERR(-ENOMEM);
34661+ hint_init_zero(hint);
34662+ lh = &hint->lh;
34663+
34664+ cluster_init_read(&clust, NULL);
34665+ clust.hint = hint;
34666+
34667+ result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host));
34668+ if (result)
34669+ goto out;
34670+
34671+ while (to_capture > 0) {
34672+ found =
34673+ find_get_pages_tag(mapping, index,
34674+ PAGECACHE_TAG_REISER4_MOVED, 1, &page);
34675+ if (!found) {
34676+ *index = (pgoff_t) - 1;
34677+ break;
34678+ }
34679+ assert("edward-1109", page != NULL);
34680+
34681+ move_cluster_forward(&clust, mapping->host, page->index);
34682+ result = capture_page_cluster(&clust, mapping->host);
34683+ page_cache_release(page);
34684+ if (result)
34685+ break;
34686+ to_capture -= clust.nr_pages;
34687+ }
34688+ if (result) {
34689+ warning("edward-1077",
34690+ "Cannot capture anon pages: result=%i (captured=%d)\n",
34691+ result,
34692+ ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) -
34693+ to_capture);
34694+ } else {
34695+ /* something had to be found */
34696+ assert("edward-1078",
34697+ to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host));
34698+ if (to_capture <= 0)
34699+ /* there may be left more pages */
34700+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
34701+ }
34702+ out:
34703+ done_lh(lh);
34704+ kfree(hint);
34705+ put_cluster_handle(&clust);
34706+ return result;
34707+}
34708+
34709+/* Check mapping for existence of not captured dirty pages.
34710+ This returns !0 if either page tree contains pages tagged
34711+ PAGECACHE_TAG_REISER4_MOVED */
34712+static int cryptcompress_inode_has_anon_pages(struct inode *inode)
34713+{
34714+ return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED);
34715+}
34716+
34717+/* this is implementation of vfs's writepages method of struct
34718+ address_space_operations */
34719+int
34720+writepages_cryptcompress(struct address_space *mapping,
34721+ struct writeback_control *wbc)
34722+{
34723+ int result;
34724+ int to_capture;
34725+ pgoff_t nrpages;
34726+ pgoff_t index = 0;
34727+ cryptcompress_info_t *info;
34728+ struct inode *inode;
34729+
34730+ inode = mapping->host;
34731+ if (!cryptcompress_inode_has_anon_pages(inode)) {
34732+ result = 0;
34733+ goto end;
34734+ }
34735+
34736+ info = cryptcompress_inode_data(inode);
34737+ nrpages = count_to_nrpages(i_size_read(inode));
34738+
34739+ if (wbc->sync_mode != WB_SYNC_ALL)
34740+ to_capture =
34741+ min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode));
34742+ else
34743+ to_capture = MAX_CLUSTERS_TO_CAPTURE(inode);
34744+ do {
34745+ reiser4_context *ctx;
34746+
34747+ ctx = reiser4_init_context(inode->i_sb);
34748+ if (IS_ERR(ctx)) {
34749+ result = PTR_ERR(ctx);
34750+ break;
34751+ }
34752+ ctx->nobalance = 1;
34753+
34754+ assert("edward-1079",
34755+ lock_stack_isclean(get_current_lock_stack()));
34756+
34757+ LOCK_CNT_INC(inode_sem_r);
34758+
34759+ result =
34760+ capture_anonymous_clusters(inode->i_mapping, &index,
34761+ to_capture);
34762+
34763+ if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
34764+ reiser4_exit_context(ctx);
34765+ break;
34766+ }
34767+ result = txnmgr_force_commit_all(inode->i_sb, 0);
34768+ reiser4_exit_context(ctx);
34769+ } while (result == 0 && index < nrpages);
34770+
34771+ end:
34772+ if (is_in_reiser4_context()) {
34773+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34774+ /* there are already pages to flush, flush them out, do
34775+ not delay until end of reiser4_sync_inodes */
34776+ reiser4_writeout(inode->i_sb, wbc);
34777+ get_current_context()->nr_captured = 0;
34778+ }
34779+ }
34780+ return result;
34781+}
34782+
34783+/* plugin->u.file.mmap */
34784+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34785+{
34786+ int result;
34787+ struct inode *inode;
34788+ reiser4_context *ctx;
34789+
34790+ inode = file->f_dentry->d_inode;
34791+ ctx = reiser4_init_context(inode->i_sb);
34792+ if (IS_ERR(ctx))
34793+ return PTR_ERR(ctx);
34794+ /*
34795+ * generic_file_mmap will do update_atime. Grab space for stat data
34796+ * update.
34797+ */
34798+ result = reiser4_grab_space_force
34799+ (inode_file_plugin(inode)->estimate.update(inode),
34800+ BA_CAN_COMMIT);
34801+ if (result) {
34802+ reiser4_exit_context(ctx);
34803+ return result;
34804+ }
34805+ result = generic_file_mmap(file, vma);
34806+ reiser4_exit_context(ctx);
34807+ return result;
34808+}
34809+
34810+/* plugin->u.file.release */
34811+/* plugin->u.file.get_block */
34812+
34813+/* this is implementation of delete method of file plugin for
34814+ cryptcompress objects */
34815+int delete_object_cryptcompress(struct inode *inode)
34816+{
34817+ int result;
34818+
34819+ assert("edward-429", inode->i_nlink == 0);
34820+
34821+ reiser4_txn_restart_current();
34822+
34823+ result = cryptcompress_truncate(inode, 0, 0);
34824+ if (result) {
34825+ warning("edward-430",
34826+ "cannot truncate cryptcompress file %lli: %i",
34827+ (unsigned long long)get_inode_oid(inode),
34828+ result);
34829+ }
34830+ truncate_inode_pages(inode->i_mapping, 0);
34831+ /* and remove stat data */
34832+ return reiser4_delete_object_common(inode);
34833+}
34834+
34835+/* plugin->u.file.setattr method
34836+ This implements actual truncate (see comments in reiser4/page_cache.c) */
34837+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
34838+{
34839+ int result;
34840+ struct inode *inode;
34841+
34842+ inode = dentry->d_inode;
34843+ if (attr->ia_valid & ATTR_SIZE) {
34844+ if (inode->i_size != attr->ia_size) {
34845+ reiser4_context *ctx;
34846+ loff_t old_size;
34847+
34848+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
34849+ if (IS_ERR(ctx))
34850+ return PTR_ERR(ctx);
34851+
34852+ inode_check_scale(inode, inode->i_size, attr->ia_size);
34853+
34854+ old_size = inode->i_size;
34855+
34856+ result =
34857+ cryptcompress_truncate(inode, attr->ia_size,
34858+ 1 /* update stat data */ );
34859+ if (result) {
34860+ warning("edward-1192",
34861+ "truncate_cryptcompress failed: oid %lli, "
34862+ "old size %lld, new size %lld, retval %d",
34863+ (unsigned long long)
34864+ get_inode_oid(inode), old_size,
34865+ attr->ia_size, result);
34866+ }
34867+ context_set_commit_async(ctx);
34868+ reiser4_exit_context(ctx);
34869+ } else
34870+ result = 0;
34871+ } else
34872+ result = reiser4_setattr_common(dentry, attr);
34873+ return result;
34874+}
34875+
34876+/* sendfile_cryptcompress - sendfile of struct file_operations */
34877+ssize_t
34878+sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
34879+ read_actor_t actor, void *target)
34880+{
34881+ reiser4_context *ctx;
34882+ ssize_t result;
34883+ struct inode *inode;
34884+ cryptcompress_info_t *info;
34885+
34886+ inode = file->f_dentry->d_inode;
34887+ ctx = reiser4_init_context(inode->i_sb);
34888+ if (IS_ERR(ctx))
34889+ return PTR_ERR(ctx);
34890+ /*
34891+ * generic_file_sndfile may want to call update_atime. Grab space for
34892+ * stat data update
34893+ */
34894+ result = reiser4_grab_space(estimate_update_common(inode),
34895+ BA_CAN_COMMIT);
34896+ if (result)
34897+ goto exit;
34898+ info = cryptcompress_inode_data(inode);
34899+
34900+ result = generic_file_sendfile(file, ppos, count, actor, target);
34901+ exit:
34902+ reiser4_exit_context(ctx);
34903+ return result;
34904+}
34905+
34906+/*
34907+ * release_cryptcompress - release of struct file_operations
34908+ * @inode: inode of released file
34909+ * @file: file to release
34910+ */
34911+int release_cryptcompress(struct inode *inode, struct file *file)
34912+{
34913+ reiser4_context *ctx = reiser4_init_context(inode->i_sb);
34914+
34915+ if (IS_ERR(ctx))
34916+ return PTR_ERR(ctx);
34917+ reiser4_free_file_fsdata(file);
34918+ reiser4_exit_context(ctx);
34919+ return 0;
34920+}
34921+
34922+#if 0
34923+int prepare_write_cryptcompress(struct file *file, struct page *page,
34924+ unsigned from, unsigned to)
34925+{
34926+ return prepare_write_common(file, page, from, to);
34927+}
34928+#endif /* 0 */
34929+
34930+
34931+/*
34932+ Local variables:
34933+ c-indentation-style: "K&R"
34934+ mode-name: "LC"
34935+ c-basic-offset: 8
34936+ tab-width: 8
34937+ fill-column: 80
34938+ scroll-step: 1
34939+ End:
34940+*/
34941diff --git a/fs/reiser4/plugin/file/cryptcompress.h b/fs/reiser4/plugin/file/cryptcompress.h
34942new file mode 100644
34943index 0000000..5f2d7fb
34944--- /dev/null
34945+++ b/fs/reiser4/plugin/file/cryptcompress.h
34946@@ -0,0 +1,554 @@
34947+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
34948+/* See http://www.namesys.com/cryptcompress_design.html */
34949+
34950+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
34951+#define __FS_REISER4_CRYPTCOMPRESS_H__
34952+
34953+#include "../../page_cache.h"
34954+#include "../compress/compress.h"
34955+#include "../crypto/cipher.h"
34956+
34957+#include <linux/pagemap.h>
34958+
34959+#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
34960+#define MAX_CLUSTER_SHIFT 16
34961+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
34962+#define DC_CHECKSUM_SIZE 4
34963+
34964+#define MIN_LATTICE_FACTOR 1
34965+#define MAX_LATTICE_FACTOR 32
34966+
34967+/* this mask contains all non-standard plugins that might
34968+ be present in reiser4-specific part of inode managed by
34969+ cryptcompress file plugin */
34970+#define cryptcompress_mask \
34971+ ((1 << PSET_FILE) | \
34972+ (1 << PSET_CLUSTER) | \
34973+ (1 << PSET_CIPHER) | \
34974+ (1 << PSET_DIGEST) | \
34975+ (1 << PSET_COMPRESSION) | \
34976+ (1 << PSET_COMPRESSION_MODE))
34977+
34978+static inline loff_t min_count(loff_t a, loff_t b)
34979+{
34980+ return (a < b ? a : b);
34981+}
34982+
34983+static inline loff_t max_count(loff_t a, loff_t b)
34984+{
34985+ return (a > b ? a : b);
34986+}
34987+
34988+#if REISER4_DEBUG
34989+static inline int cluster_shift_ok(int shift)
34990+{
34991+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
34992+}
34993+#endif
34994+
34995+typedef struct tfm_stream {
34996+ __u8 *data;
34997+ size_t size;
34998+} tfm_stream_t;
34999+
35000+typedef enum {
35001+ INPUT_STREAM,
35002+ OUTPUT_STREAM,
35003+ LAST_STREAM
35004+} tfm_stream_id;
35005+
35006+typedef tfm_stream_t *tfm_unit[LAST_STREAM];
35007+
35008+static inline __u8 *ts_data(tfm_stream_t * stm)
35009+{
35010+ assert("edward-928", stm != NULL);
35011+ return stm->data;
35012+}
35013+
35014+static inline size_t ts_size(tfm_stream_t * stm)
35015+{
35016+ assert("edward-929", stm != NULL);
35017+ return stm->size;
35018+}
35019+
35020+static inline void set_ts_size(tfm_stream_t * stm, size_t size)
35021+{
35022+ assert("edward-930", stm != NULL);
35023+
35024+ stm->size = size;
35025+}
35026+
35027+static inline int alloc_ts(tfm_stream_t ** stm)
35028+{
35029+ assert("edward-931", stm);
35030+ assert("edward-932", *stm == NULL);
35031+
35032+ *stm = kmalloc(sizeof **stm, reiser4_ctx_gfp_mask_get());
35033+ if (*stm == NULL)
35034+ return -ENOMEM;
35035+ memset(*stm, 0, sizeof **stm);
35036+ return 0;
35037+}
35038+
35039+static inline void free_ts(tfm_stream_t * stm)
35040+{
35041+ assert("edward-933", !ts_data(stm));
35042+ assert("edward-934", !ts_size(stm));
35043+
35044+ kfree(stm);
35045+}
35046+
35047+static inline int alloc_ts_data(tfm_stream_t * stm, size_t size)
35048+{
35049+ assert("edward-935", !ts_data(stm));
35050+ assert("edward-936", !ts_size(stm));
35051+ assert("edward-937", size != 0);
35052+
35053+ stm->data = reiser4_vmalloc(size);
35054+ if (!stm->data)
35055+ return -ENOMEM;
35056+ set_ts_size(stm, size);
35057+ return 0;
35058+}
35059+
35060+static inline void free_ts_data(tfm_stream_t * stm)
35061+{
35062+ assert("edward-938", equi(ts_data(stm), ts_size(stm)));
35063+
35064+ if (ts_data(stm))
35065+ vfree(ts_data(stm));
35066+ memset(stm, 0, sizeof *stm);
35067+}
35068+
35069+/* Write modes for item conversion in flush convert phase */
35070+typedef enum {
35071+ CRC_APPEND_ITEM = 1,
35072+ CRC_OVERWRITE_ITEM = 2,
35073+ CRC_CUT_ITEM = 3
35074+} cryptcompress_write_mode_t;
35075+
35076+typedef enum {
35077+ PCL_UNKNOWN = 0, /* invalid option */
35078+ PCL_APPEND = 1, /* append and/or overwrite */
35079+ PCL_TRUNCATE = 2 /* truncate */
35080+} page_cluster_op;
35081+
35082+/* Reiser4 file write/read transforms page cluster into disk cluster (and back)
35083+ using crypto/compression transforms implemented by reiser4 transform plugins.
35084+ Before each transform we allocate a pair of streams (tfm_unit) and assemble
35085+ page cluster into the input one. After transform we split output stream into
35086+ a set of items (disk cluster).
35087+*/
35088+typedef struct tfm_cluster {
35089+ coa_set coa;
35090+ tfm_unit tun;
35091+ tfm_action act;
35092+ int uptodate;
35093+ int lsize; /* size of the logical cluster */
35094+ int len; /* length of the transform stream */
35095+} tfm_cluster_t;
35096+
35097+static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act)
35098+{
35099+ return tc->coa[id][act];
35100+}
35101+
35102+static inline void
35103+set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa)
35104+{
35105+ tc->coa[id][act] = coa;
35106+}
35107+
35108+static inline int
35109+alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35110+{
35111+ coa_t coa;
35112+
35113+ coa = cplug->alloc(tc->act);
35114+ if (IS_ERR(coa))
35115+ return PTR_ERR(coa);
35116+ set_coa(tc, cplug->h.id, tc->act, coa);
35117+ return 0;
35118+}
35119+
35120+static inline int
35121+grab_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35122+{
35123+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
35124+ alloc_coa(tc, cplug) : 0);
35125+}
35126+
35127+static inline void free_coa_set(tfm_cluster_t * tc)
35128+{
35129+ tfm_action j;
35130+ reiser4_compression_id i;
35131+ compression_plugin *cplug;
35132+
35133+ assert("edward-810", tc != NULL);
35134+
35135+ for (j = 0; j < TFMA_LAST; j++)
35136+ for (i = 0; i < LAST_COMPRESSION_ID; i++) {
35137+ if (!get_coa(tc, i, j))
35138+ continue;
35139+ cplug = compression_plugin_by_id(i);
35140+ assert("edward-812", cplug->free != NULL);
35141+ cplug->free(get_coa(tc, i, j), j);
35142+ set_coa(tc, i, j, 0);
35143+ }
35144+ return;
35145+}
35146+
35147+static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35148+{
35149+ return tc->tun[id];
35150+}
35151+
35152+static inline void
35153+set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts)
35154+{
35155+ tc->tun[id] = ts;
35156+}
35157+
35158+static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id)
35159+{
35160+ return ts_data(tfm_stream(tc, id));
35161+}
35162+
35163+static inline void
35164+set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data)
35165+{
35166+ tfm_stream(tc, id)->data = data;
35167+}
35168+
35169+static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id)
35170+{
35171+ return ts_size(tfm_stream(tc, id));
35172+}
35173+
35174+static inline void
35175+set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size)
35176+{
35177+ tfm_stream(tc, id)->size = size;
35178+}
35179+
35180+static inline int
35181+alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35182+{
35183+ assert("edward-939", tc != NULL);
35184+ assert("edward-940", !tfm_stream(tc, id));
35185+
35186+ tc->tun[id] = kmalloc(sizeof(tfm_stream_t), reiser4_ctx_gfp_mask_get());
35187+ if (!tc->tun[id])
35188+ return -ENOMEM;
35189+ memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t));
35190+ return alloc_ts_data(tfm_stream(tc, id), size);
35191+}
35192+
35193+static inline int
35194+realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35195+{
35196+ assert("edward-941", tfm_stream_size(tc, id) < size);
35197+ free_ts_data(tfm_stream(tc, id));
35198+ return alloc_ts_data(tfm_stream(tc, id), size);
35199+}
35200+
35201+static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35202+{
35203+ free_ts_data(tfm_stream(tc, id));
35204+ free_ts(tfm_stream(tc, id));
35205+ set_tfm_stream(tc, id, 0);
35206+}
35207+
35208+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
35209+{
35210+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
35211+}
35212+
35213+static inline void free_tfm_unit(tfm_cluster_t * tc)
35214+{
35215+ tfm_stream_id id;
35216+ for (id = 0; id < LAST_STREAM; id++) {
35217+ if (!tfm_stream(tc, id))
35218+ continue;
35219+ free_tfm_stream(tc, id);
35220+ }
35221+}
35222+
35223+static inline void put_tfm_cluster(tfm_cluster_t * tc)
35224+{
35225+ assert("edward-942", tc != NULL);
35226+ free_coa_set(tc);
35227+ free_tfm_unit(tc);
35228+}
35229+
35230+static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc)
35231+{
35232+ assert("edward-943", tc != NULL);
35233+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
35234+ return (tc->uptodate == 1);
35235+}
35236+
35237+static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc)
35238+{
35239+ assert("edward-945", tc != NULL);
35240+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
35241+ tc->uptodate = 1;
35242+ return;
35243+}
35244+
35245+static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc)
35246+{
35247+ assert("edward-947", tc != NULL);
35248+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
35249+ tc->uptodate = 0;
35250+ return;
35251+}
35252+
35253+static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id)
35254+{
35255+ return (tfm_stream(tc, id) &&
35256+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
35257+}
35258+
35259+static inline int tfm_cluster_is_set(tfm_cluster_t * tc)
35260+{
35261+ int i;
35262+ for (i = 0; i < LAST_STREAM; i++)
35263+ if (!tfm_stream_is_set(tc, i))
35264+ return 0;
35265+ return 1;
35266+}
35267+
35268+static inline void alternate_streams(tfm_cluster_t * tc)
35269+{
35270+ tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM);
35271+
35272+ set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM));
35273+ set_tfm_stream(tc, OUTPUT_STREAM, tmp);
35274+}
35275+
35276+/* a kind of data that we can write to the window */
35277+typedef enum {
35278+ DATA_WINDOW, /* the data we copy from user space */
35279+ HOLE_WINDOW /* zeroes if we write hole */
35280+} window_stat;
35281+
35282+/* Sliding window of cluster size which should be set to the appropriate position
35283+ (defined by cluster index) in a file before page cluster modification by
35284+ file_write. Then we translate file size, offset to write from, number of
35285+ bytes to write, etc.. to the following configuration needed to estimate
35286+ number of pages to read before write, etc...
35287+*/
35288+typedef struct reiser4_slide {
35289+ unsigned off; /* offset we start to write/truncate from */
35290+ unsigned count; /* number of bytes (zeroes) to write/truncate */
35291+ unsigned delta; /* number of bytes to append to the hole */
35292+ window_stat stat; /* a kind of data to write to the window */
35293+} reiser4_slide_t;
35294+
35295+/* The following is a set of possible disk cluster states */
35296+typedef enum {
35297+ INVAL_DISK_CLUSTER, /* unknown state */
35298+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush
35299+ at least 1 time */
35300+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be
35301+ converted by flush */
35302+ FAKE_DISK_CLUSTER /* disk cluster doesn't exist neither in memory
35303+ nor on disk */
35304+} disk_cluster_stat;
35305+
35306+/*
35307+ While implementing all transforms (from page to disk cluster, and back)
35308+ reiser4 cluster manager fills the following structure encapsulating pointers
35309+ to all the clusters for the same index including the sliding window above
35310+*/
35311+typedef struct reiser4_cluster {
35312+ tfm_cluster_t tc; /* transform cluster */
35313+ int nr_pages; /* number of pages */
35314+ struct page **pages; /* page cluster */
35315+ page_cluster_op op; /* page cluster operation */
35316+ struct file *file;
35317+ hint_t *hint; /* disk cluster item for traversal */
35318+ disk_cluster_stat dstat; /* state of the current disk cluster */
35319+ cloff_t index; /* offset in the units of cluster size */
35320+ int index_valid; /* to validate the index above, if needed */
35321+ reiser4_slide_t *win; /* sliding window of cluster size */
35322+ int reserved; /* this indicates that space for disk
35323+ cluster modification is reserved */
35324+#if REISER4_DEBUG
35325+ reiser4_context *ctx;
35326+ int reserved_prepped;
35327+ int reserved_unprepped;
35328+#endif
35329+
35330+} reiser4_cluster_t;
35331+
35332+static inline __u8 * tfm_input_data (reiser4_cluster_t * clust)
35333+{
35334+ return tfm_stream_data(&clust->tc, INPUT_STREAM);
35335+}
35336+
35337+static inline __u8 * tfm_output_data (reiser4_cluster_t * clust)
35338+{
35339+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
35340+}
35341+
35342+static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35343+{
35344+ assert("edward-1057", clust->pages != NULL);
35345+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
35346+ return 0;
35347+}
35348+
35349+static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35350+{
35351+ assert("edward-949", clust != NULL);
35352+ assert("edward-1362", clust->pages == NULL);
35353+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
35354+
35355+ clust->pages =
35356+ kmalloc(sizeof(*clust->pages) * nrpages,
35357+ reiser4_ctx_gfp_mask_get());
35358+ if (!clust->pages)
35359+ return RETERR(-ENOMEM);
35360+ reset_cluster_pgset(clust, nrpages);
35361+ return 0;
35362+}
35363+
35364+static inline void free_cluster_pgset(reiser4_cluster_t * clust)
35365+{
35366+ assert("edward-951", clust->pages != NULL);
35367+ kfree(clust->pages);
35368+ clust->pages = NULL;
35369+}
35370+
35371+static inline void put_cluster_handle(reiser4_cluster_t * clust)
35372+{
35373+ assert("edward-435", clust != NULL);
35374+
35375+ put_tfm_cluster(&clust->tc);
35376+ if (clust->pages)
35377+ free_cluster_pgset(clust);
35378+ memset(clust, 0, sizeof *clust);
35379+}
35380+
35381+static inline void inc_keyload_count(crypto_stat_t * data)
35382+{
35383+ assert("edward-1410", data != NULL);
35384+ data->keyload_count++;
35385+}
35386+
35387+static inline void dec_keyload_count(crypto_stat_t * data)
35388+{
35389+ assert("edward-1411", data != NULL);
35390+ assert("edward-1412", data->keyload_count > 0);
35391+ data->keyload_count--;
35392+}
35393+
35394+/* cryptcompress specific part of reiser4_inode */
35395+typedef struct cryptcompress_info {
35396+ crypto_stat_t *crypt;
35397+ /* the following 2 fields are controlled by compression mode plugin */
35398+ int compress_toggle; /* current status of compressibility */
35399+ int lattice_factor; /* factor of dynamic lattice. FIXME: Have a
35400+ compression_toggle to keep the factor */
35401+#if REISER4_DEBUG
35402+ int pgcount; /* number of captured pages */
35403+#endif
35404+} cryptcompress_info_t;
35405+
35406+static inline void set_compression_toggle (cryptcompress_info_t * info, int val)
35407+{
35408+ info->compress_toggle = val;
35409+}
35410+
35411+static inline int get_compression_toggle (cryptcompress_info_t * info)
35412+{
35413+ return info->compress_toggle;
35414+}
35415+
35416+static inline int compression_is_on(cryptcompress_info_t * info)
35417+{
35418+ return get_compression_toggle(info) == 1;
35419+}
35420+
35421+static inline void turn_on_compression(cryptcompress_info_t * info)
35422+{
35423+ set_compression_toggle(info, 1);
35424+}
35425+
35426+static inline void turn_off_compression(cryptcompress_info_t * info)
35427+{
35428+ set_compression_toggle(info, 0);
35429+}
35430+
35431+static inline void set_lattice_factor(cryptcompress_info_t * info, int val)
35432+{
35433+ info->lattice_factor = val;
35434+}
35435+
35436+static inline int get_lattice_factor(cryptcompress_info_t * info)
35437+{
35438+ return info->lattice_factor;
35439+}
35440+
35441+cryptcompress_info_t *cryptcompress_inode_data(const struct inode *);
35442+int equal_to_rdk(znode *, const reiser4_key *);
35443+int goto_right_neighbor(coord_t *, lock_handle *);
35444+int cryptcompress_inode_ok(struct inode *inode);
35445+int coord_is_unprepped_ctail(const coord_t * coord);
35446+extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *,
35447+ znode_lock_mode mode);
35448+extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *,
35449+ struct page * page, znode_lock_mode mode);
35450+extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust,
35451+ struct inode * inode);
35452+extern int readpages_cryptcompress(struct file*, struct address_space*,
35453+ struct list_head*, unsigned);
35454+int bind_cryptcompress(struct inode *child, struct inode *parent);
35455+void destroy_inode_cryptcompress(struct inode * inode);
35456+int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust);
35457+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
35458+ reiser4_cluster_t * clust, int * progress);
35459+crypto_stat_t * inode_crypto_stat (struct inode * inode);
35460+void inherit_crypto_stat_common(struct inode * parent, struct inode * object,
35461+ int (*can_inherit)(struct inode * child,
35462+ struct inode * parent));
35463+void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info);
35464+void change_crypto_stat(struct inode * inode, crypto_stat_t * new);
35465+crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode);
35466+
35467+static inline struct crypto_blkcipher * info_get_cipher(crypto_stat_t * info)
35468+{
35469+ return info->cipher;
35470+}
35471+
35472+static inline void info_set_cipher(crypto_stat_t * info,
35473+ struct crypto_blkcipher * tfm)
35474+{
35475+ info->cipher = tfm;
35476+}
35477+
35478+static inline struct crypto_hash * info_get_digest(crypto_stat_t * info)
35479+{
35480+ return info->digest;
35481+}
35482+
35483+static inline void info_set_digest(crypto_stat_t * info,
35484+ struct crypto_hash * tfm)
35485+{
35486+ info->digest = tfm;
35487+}
35488+
35489+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
35490+
35491+/* Make Linus happy.
35492+ Local variables:
35493+ c-indentation-style: "K&R"
35494+ mode-name: "LC"
35495+ c-basic-offset: 8
35496+ tab-width: 8
35497+ fill-column: 120
35498+ scroll-step: 1
35499+ End:
35500+*/
35501diff --git a/fs/reiser4/plugin/file/file.c b/fs/reiser4/plugin/file/file.c
35502new file mode 100644
35503index 0000000..67501aa
35504--- /dev/null
35505+++ b/fs/reiser4/plugin/file/file.c
35506@@ -0,0 +1,2820 @@
35507+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
35508+ * reiser4/README */
35509+
35510+/*
35511+ * this file contains implementations of inode/file/address_space/file plugin
35512+ * operations specific for "unix file plugin" (plugin id is
35513+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
35514+ * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
35515+ * no items but stat data)
35516+ */
35517+
35518+#include "../../inode.h"
35519+#include "../../super.h"
35520+#include "../../tree_walk.h"
35521+#include "../../carry.h"
35522+#include "../../page_cache.h"
35523+#include "../../ioctl.h"
35524+#include "../object.h"
35525+#include "../../safe_link.h"
35526+
35527+#include <linux/writeback.h>
35528+#include <linux/pagevec.h>
35529+#include <linux/syscalls.h>
35530+
35531+
35532+static int unpack(struct file *file, struct inode *inode, int forever);
35533+static void drop_access(unix_file_info_t *);
35534+static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35535+ znode_lock_mode lock_mode);
35536+
35537+/* get unix file plugin specific portion of inode */
35538+unix_file_info_t *unix_file_inode_data(const struct inode *inode)
35539+{
35540+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35541+}
35542+
35543+/**
35544+ * equal_to_rdk - compare key and znode's right delimiting key
35545+ * @node: node whose right delimiting key to compare with @key
35546+ * @key: key to compare with @node's right delimiting key
35547+ *
35548+ * Returns true if @key is equal to right delimiting key of @node.
35549+ */
35550+int equal_to_rdk(znode *node, const reiser4_key *key)
35551+{
35552+ int result;
35553+
35554+ read_lock_dk(znode_get_tree(node));
35555+ result = keyeq(key, znode_get_rd_key(node));
35556+ read_unlock_dk(znode_get_tree(node));
35557+ return result;
35558+}
35559+
35560+#if REISER4_DEBUG
35561+
35562+/**
35563+ * equal_to_ldk - compare key and znode's left delimiting key
35564+ * @node: node whose left delimiting key to compare with @key
35565+ * @key: key to compare with @node's left delimiting key
35566+ *
35567+ * Returns true if @key is equal to left delimiting key of @node.
35568+ */
35569+int equal_to_ldk(znode *node, const reiser4_key *key)
35570+{
35571+ int result;
35572+
35573+ read_lock_dk(znode_get_tree(node));
35574+ result = keyeq(key, znode_get_ld_key(node));
35575+ read_unlock_dk(znode_get_tree(node));
35576+ return result;
35577+}
35578+
35579+/**
35580+ * check_coord - check whether coord corresponds to key
35581+ * @coord: coord to check
35582+ * @key: key @coord has to correspond to
35583+ *
35584+ * Returns true if @coord is set as if it was set as result of lookup with @key
35585+ * in coord->node.
35586+ */
35587+static int check_coord(const coord_t *coord, const reiser4_key *key)
35588+{
35589+ coord_t twin;
35590+
35591+ node_plugin_by_node(coord->node)->lookup(coord->node, key,
35592+ FIND_MAX_NOT_MORE_THAN, &twin);
35593+ return coords_equal(coord, &twin);
35594+}
35595+
35596+#endif /* REISER4_DEBUG */
35597+
35598+/**
35599+ * init_uf_coord - initialize extended coord
35600+ * @uf_coord:
35601+ * @lh:
35602+ *
35603+ *
35604+ */
35605+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35606+{
35607+ coord_init_zero(&uf_coord->coord);
35608+ coord_clear_iplug(&uf_coord->coord);
35609+ uf_coord->lh = lh;
35610+ init_lh(lh);
35611+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35612+ uf_coord->valid = 0;
35613+}
35614+
35615+static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35616+{
35617+ assert("vs-1333", uf_coord->valid == 0);
35618+
35619+ if (coord_is_between_items(&uf_coord->coord))
35620+ return;
35621+
35622+ assert("vs-1348",
35623+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35624+ init_coord_extension);
35625+
35626+ item_body_by_coord(&uf_coord->coord);
35627+ item_plugin_by_coord(&uf_coord->coord)->s.file.
35628+ init_coord_extension(uf_coord, offset);
35629+}
35630+
35631+/**
35632+ * goto_right_neighbor - lock right neighbor, drop current node lock
35633+ * @coord:
35634+ * @lh:
35635+ *
35636+ * Obtain lock on right neighbor and drop lock on current node.
35637+ */
35638+int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35639+{
35640+ int result;
35641+ lock_handle lh_right;
35642+
35643+ assert("vs-1100", znode_is_locked(coord->node));
35644+
35645+ init_lh(&lh_right);
35646+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
35647+ znode_is_wlocked(coord->node) ?
35648+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35649+ GN_CAN_USE_UPPER_LEVELS);
35650+ if (result) {
35651+ done_lh(&lh_right);
35652+ return result;
35653+ }
35654+
35655+ /*
35656+ * we hold two longterm locks on neighboring nodes. Unlock left of
35657+ * them
35658+ */
35659+ done_lh(lh);
35660+
35661+ coord_init_first_unit_nocheck(coord, lh_right.node);
35662+ move_lh(lh, &lh_right);
35663+
35664+ return 0;
35665+
35666+}
35667+
35668+/**
35669+ * set_file_state
35670+ * @uf_info:
35671+ * @cbk_result:
35672+ * @level:
35673+ *
35674+ * This is to be used by find_file_item and in find_file_state to
35675+ * determine real state of file
35676+ */
35677+static void set_file_state(unix_file_info_t *uf_info, int cbk_result,
35678+ tree_level level)
35679+{
35680+ if (cbk_errored(cbk_result))
35681+ /* error happened in find_file_item */
35682+ return;
35683+
35684+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35685+
35686+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35687+ /*
35688+ * container is unknown, therefore conversion can not be in
35689+ * progress
35690+ */
35691+ assert("",
35692+ !reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35693+ REISER4_PART_IN_CONV));
35694+ if (cbk_result == CBK_COORD_NOTFOUND)
35695+ uf_info->container = UF_CONTAINER_EMPTY;
35696+ else if (level == LEAF_LEVEL)
35697+ uf_info->container = UF_CONTAINER_TAILS;
35698+ else
35699+ uf_info->container = UF_CONTAINER_EXTENTS;
35700+ } else {
35701+ /*
35702+ * file state is known, check whether it is set correctly if
35703+ * file is not being tail converted
35704+ */
35705+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35706+ REISER4_PART_IN_CONV)) {
35707+ assert("vs-1162",
35708+ ergo(level == LEAF_LEVEL &&
35709+ cbk_result == CBK_COORD_FOUND,
35710+ uf_info->container == UF_CONTAINER_TAILS));
35711+ assert("vs-1165",
35712+ ergo(level == TWIG_LEVEL &&
35713+ cbk_result == CBK_COORD_FOUND,
35714+ uf_info->container == UF_CONTAINER_EXTENTS));
35715+ }
35716+ }
35717+}
35718+
35719+int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35720+ const reiser4_key *key, znode_lock_mode lock_mode,
35721+ struct inode *inode)
35722+{
35723+ return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
35724+ FIND_MAX_NOT_MORE_THAN,
35725+ TWIG_LEVEL, LEAF_LEVEL,
35726+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35727+ (CBK_UNIQUE | CBK_FOR_INSERT),
35728+ NULL /* ra_info */ );
35729+}
35730+
35731+/**
35732+ * find_file_item - look for file item in the tree
35733+ * @hint: provides coordinate, lock handle, seal
35734+ * @key: key for search
35735+ * @mode: mode of lock to put on returned node
35736+ * @ra_info:
35737+ * @inode:
35738+ *
35739+ * This finds position in the tree corresponding to @key. It first tries to use
35740+ * @hint's seal if it is set.
35741+ */
35742+int find_file_item(hint_t *hint, const reiser4_key *key,
35743+ znode_lock_mode lock_mode,
35744+ struct inode *inode)
35745+{
35746+ int result;
35747+ coord_t *coord;
35748+ lock_handle *lh;
35749+
35750+ assert("nikita-3030", reiser4_schedulable());
35751+ assert("vs-1707", hint != NULL);
35752+ assert("vs-47", inode != NULL);
35753+
35754+ coord = &hint->ext_coord.coord;
35755+ lh = hint->ext_coord.lh;
35756+ init_lh(lh);
35757+
35758+ result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35759+ if (!result) {
35760+ if (coord->between == AFTER_UNIT &&
35761+ equal_to_rdk(coord->node, key)) {
35762+ result = goto_right_neighbor(coord, lh);
35763+ if (result == -E_NO_NEIGHBOR)
35764+ return RETERR(-EIO);
35765+ if (result)
35766+ return result;
35767+ assert("vs-1152", equal_to_ldk(coord->node, key));
35768+ /*
35769+ * we moved to different node. Invalidate coord
35770+ * extension, zload is necessary to init it again
35771+ */
35772+ hint->ext_coord.valid = 0;
35773+ }
35774+
35775+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35776+ znode_get_level(coord->node));
35777+
35778+ return CBK_COORD_FOUND;
35779+ }
35780+
35781+ coord_init_zero(coord);
35782+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35783+ set_file_state(unix_file_inode_data(inode), result,
35784+ znode_get_level(coord->node));
35785+
35786+ /* FIXME: we might already have coord extension initialized */
35787+ hint->ext_coord.valid = 0;
35788+ return result;
35789+}
35790+
35791+/* plugin->u.file.write_flow = NULL
35792+ plugin->u.file.read_flow = NULL */
35793+
35794+void hint_init_zero(hint_t * hint)
35795+{
35796+ memset(hint, 0, sizeof(*hint));
35797+ init_lh(&hint->lh);
35798+ hint->ext_coord.lh = &hint->lh;
35799+}
35800+
35801+static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
35802+{
35803+ int result;
35804+ reiser4_key key;
35805+ coord_t coord;
35806+ lock_handle lh;
35807+
35808+ assert("vs-1628", ea_obtained(uf_info));
35809+
35810+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35811+ key_by_inode_and_offset_common(inode, 0, &key);
35812+ init_lh(&lh);
35813+ result = find_file_item_nohint(&coord, &lh, &key,
35814+ ZNODE_READ_LOCK, inode);
35815+ set_file_state(uf_info, result, znode_get_level(coord.node));
35816+ done_lh(&lh);
35817+ if (!cbk_errored(result))
35818+ result = 0;
35819+ } else
35820+ result = 0;
35821+ assert("vs-1074",
35822+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35823+ reiser4_txn_restart_current();
35824+ return result;
35825+}
35826+
35827+/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat
35828+ data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen
35829+ if page corresponds to hole extent and unallocated one will have to be created */
35830+static int reserve_partial_page(reiser4_tree * tree)
35831+{
35832+ grab_space_enable();
35833+ return reiser4_grab_reserved(reiser4_get_current_sb(),
35834+ 1 +
35835+ 2 * estimate_one_insert_into_item(tree),
35836+ BA_CAN_COMMIT);
35837+}
35838+
35839+/* estimate and reserve space needed to cut one item and update one stat data */
35840+static int reserve_cut_iteration(reiser4_tree * tree)
35841+{
35842+ __u64 estimate = estimate_one_item_removal(tree)
35843+ + estimate_one_insert_into_item(tree);
35844+
35845+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
35846+
35847+ grab_space_enable();
35848+ /* We need to double our estimate now that we can delete more than one
35849+ node. */
35850+ return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
35851+ BA_CAN_COMMIT);
35852+}
35853+
35854+int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
35855+ int update_sd)
35856+{
35857+ int result = 0;
35858+
35859+ INODE_SET_FIELD(inode, i_size, get_key_offset(key));
35860+ if (update_sd) {
35861+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
35862+ result = reiser4_update_sd(inode);
35863+ }
35864+ return result;
35865+}
35866+
35867+/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
35868+ and update file stat data on every single cut from the tree */
35869+int
35870+cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
35871+ loff_t cur_size, int (*update_actor) (struct inode *,
35872+ reiser4_key *, int))
35873+{
35874+ reiser4_key from_key, to_key;
35875+ reiser4_key smallest_removed;
35876+ file_plugin *fplug = inode_file_plugin(inode);
35877+ int result;
35878+ int progress = 0;
35879+
35880+ assert("vs-1248",
35881+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
35882+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
35883+
35884+ fplug->key_by_inode(inode, new_size, &from_key);
35885+ to_key = from_key;
35886+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
35887+ /* this loop normally runs just once */
35888+ while (1) {
35889+ result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
35890+ if (result)
35891+ break;
35892+
35893+ result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
35894+ &smallest_removed, inode, 1,
35895+ &progress);
35896+ if (result == -E_REPEAT) {
35897+ /* -E_REPEAT is a signal to interrupt a long file truncation process */
35898+ if (progress) {
35899+ result =
35900+ update_actor(inode, &smallest_removed,
35901+ update_sd);
35902+ if (result)
35903+ break;
35904+ }
35905+
35906+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35907+ reiser4_release_reserved(inode->i_sb);
35908+
35909+ /* reiser4_cut_tree_object() was interrupted probably because
35910+ * current atom requires commit, we have to release
35911+ * transaction handle to allow atom commit. */
35912+ reiser4_txn_restart_current();
35913+ continue;
35914+ }
35915+ if (result
35916+ && !(result == CBK_COORD_NOTFOUND && new_size == 0
35917+ && inode->i_size == 0))
35918+ break;
35919+
35920+ set_key_offset(&smallest_removed, new_size);
35921+ /* Final sd update after the file gets its correct size */
35922+ result = update_actor(inode, &smallest_removed, update_sd);
35923+ break;
35924+ }
35925+
35926+ /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35927+ reiser4_release_reserved(inode->i_sb);
35928+
35929+ return result;
35930+}
35931+
35932+int find_or_create_extent(struct page *page);
35933+
35934+/* part of truncate_file_body: it is called when truncate is used to make file
35935+ shorter */
35936+static int shorten_file(struct inode *inode, loff_t new_size)
35937+{
35938+ int result;
35939+ struct page *page;
35940+ int padd_from;
35941+ unsigned long index;
35942+ char *kaddr;
35943+ unix_file_info_t *uf_info;
35944+
35945+ /*
35946+ * all items of ordinary reiser4 file are grouped together. That is why
35947+ * we can use reiser4_cut_tree. Plan B files (for instance) can not be
35948+ * truncated that simply
35949+ */
35950+ result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
35951+ get_key_offset(reiser4_max_key()),
35952+ reiser4_update_file_size);
35953+ if (result)
35954+ return result;
35955+
35956+ uf_info = unix_file_inode_data(inode);
35957+ assert("vs-1105", new_size == inode->i_size);
35958+ if (new_size == 0) {
35959+ uf_info->container = UF_CONTAINER_EMPTY;
35960+ return 0;
35961+ }
35962+
35963+ result = find_file_state(inode, uf_info);
35964+ if (result)
35965+ return result;
35966+ if (uf_info->container == UF_CONTAINER_TAILS)
35967+ /*
35968+ * No need to worry about zeroing last page after new file
35969+ * end
35970+ */
35971+ return 0;
35972+
35973+ padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
35974+ if (!padd_from)
35975+ /* file is truncated to page boundary */
35976+ return 0;
35977+
35978+ result = reserve_partial_page(reiser4_tree_by_inode(inode));
35979+ if (result) {
35980+ reiser4_release_reserved(inode->i_sb);
35981+ return result;
35982+ }
35983+
35984+ /* last page is partially truncated - zero its content */
35985+ index = (inode->i_size >> PAGE_CACHE_SHIFT);
35986+ page = read_mapping_page(inode->i_mapping, index, NULL);
35987+ if (IS_ERR(page)) {
35988+ /*
35989+ * the below does up(sbinfo->delete_mutex). Do not get
35990+ * confused
35991+ */
35992+ reiser4_release_reserved(inode->i_sb);
35993+ if (likely(PTR_ERR(page) == -EINVAL)) {
35994+ /* looks like file is built of tail items */
35995+ return 0;
35996+ }
35997+ return PTR_ERR(page);
35998+ }
35999+ wait_on_page_locked(page);
36000+ if (!PageUptodate(page)) {
36001+ page_cache_release(page);
36002+ /*
36003+ * the below does up(sbinfo->delete_mutex). Do not get
36004+ * confused
36005+ */
36006+ reiser4_release_reserved(inode->i_sb);
36007+ return RETERR(-EIO);
36008+ }
36009+
36010+ /*
36011+ * if page corresponds to hole extent unit - unallocated one will be
36012+ * created here. This is not necessary
36013+ */
36014+ result = find_or_create_extent(page);
36015+
36016+ /*
36017+ * FIXME: cut_file_items has already updated inode. Probably it would
36018+ * be better to update it here when file is really truncated
36019+ */
36020+ if (result) {
36021+ page_cache_release(page);
36022+ /*
36023+ * the below does up(sbinfo->delete_mutex). Do not get
36024+ * confused
36025+ */
36026+ reiser4_release_reserved(inode->i_sb);
36027+ return result;
36028+ }
36029+
36030+ lock_page(page);
36031+ assert("vs-1066", PageLocked(page));
36032+ kaddr = kmap_atomic(page, KM_USER0);
36033+ memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);
36034+ flush_dcache_page(page);
36035+ kunmap_atomic(kaddr, KM_USER0);
36036+ unlock_page(page);
36037+ page_cache_release(page);
36038+ /* the below does up(sbinfo->delete_mutex). Do not get confused */
36039+ reiser4_release_reserved(inode->i_sb);
36040+ return 0;
36041+}
36042+
36043+/**
36044+ * should_have_notail
36045+ * @uf_info:
36046+ * @new_size:
36047+ *
36048+ * Calls formatting plugin to see whether file of size @new_size has to be
36049+ * stored in unformatted nodes or in tail items. 0 is returned for the latter case.
36050+ */
36051+static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
36052+{
36053+ if (!uf_info->tplug)
36054+ return 1;
36055+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
36056+ new_size);
36057+
36058+}
36059+
36060+/**
36061+ * truncate_file_body - change length of file
36062+ * @inode: inode of file
36063+ * @new_size: new file length
36064+ *
36065+ * Adjusts items file @inode is built of to match @new_size. It may either cut
36066+ * items or add them to represent a hole at the end of file. The caller has to
36067+ * obtain exclusive access to the file.
36068+ */
36069+static int truncate_file_body(struct inode *inode, loff_t new_size)
36070+{
36071+ int result;
36072+
36073+ if (inode->i_size < new_size) {
36074+ /* expanding truncate */
36075+ struct dentry dentry;
36076+ struct file file;
36077+ unix_file_info_t *uf_info;
36078+
36079+ dentry.d_inode = inode;
36080+ file.f_dentry = &dentry;
36081+ file.private_data = NULL;
36082+ file.f_pos = new_size;
36083+ file.private_data = NULL;
36084+ uf_info = unix_file_inode_data(inode);
36085+ result = find_file_state(inode, uf_info);
36086+ if (result)
36087+ return result;
36088+
36089+ if (should_have_notail(uf_info, new_size)) {
36090+ /*
36091+ * file of size @new_size has to be built of
36092+ * extents. If it is built of tails - convert to
36093+ * extents
36094+ */
36095+ if (uf_info->container == UF_CONTAINER_TAILS) {
36096+ /*
36097+ * if file is being converted by another process
36098+ * - wait until it completes
36099+ */
36100+ while (1) {
36101+ if (reiser4_inode_get_flag(inode,
36102+ REISER4_PART_IN_CONV)) {
36103+ drop_exclusive_access(uf_info);
36104+ schedule();
36105+ get_exclusive_access(uf_info);
36106+ continue;
36107+ }
36108+ break;
36109+ }
36110+
36111+ if (uf_info->container == UF_CONTAINER_TAILS) {
36112+ result = tail2extent(uf_info);
36113+ if (result)
36114+ return result;
36115+ }
36116+ }
36117+ result = reiser4_write_extent(&file, NULL, 0,
36118+ &new_size);
36119+ if (result)
36120+ return result;
36121+ uf_info->container = UF_CONTAINER_EXTENTS;
36122+ } else {
36123+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
36124+ result = reiser4_write_extent(&file, NULL, 0,
36125+ &new_size);
36126+ if (result)
36127+ return result;
36128+ } else {
36129+ result = reiser4_write_tail(&file, NULL, 0,
36130+ &new_size);
36131+ if (result)
36132+ return result;
36133+ uf_info->container = UF_CONTAINER_TAILS;
36134+ }
36135+ }
36136+ BUG_ON(result > 0);
36137+ INODE_SET_FIELD(inode, i_size, new_size);
36138+ file_update_time(&file);
36139+ result = reiser4_update_sd(inode);
36140+ BUG_ON(result != 0);
36141+ reiser4_free_file_fsdata(&file);
36142+ } else
36143+ result = shorten_file(inode, new_size);
36144+ return result;
36145+}
36146+
36147+/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
36148+
36149+/**
36150+ * load_file_hint - copy hint from struct file to local variable
36151+ * @file: file to get hint from
36152+ * @hint: structure to fill
36153+ *
36154+ * Reiser4 specific portion of struct file may contain information (hint)
36155+ * stored on exiting from previous read or write. That information includes
36156+ * seal of znode and coord within that znode where previous read or write
36157+ * stopped. This function copies that information to @hint if it was stored or
36158+ * initializes @hint by 0s otherwise.
36159+ */
36160+int load_file_hint(struct file *file, hint_t *hint)
36161+{
36162+ reiser4_file_fsdata *fsdata;
36163+
36164+ if (file) {
36165+ fsdata = reiser4_get_file_fsdata(file);
36166+ if (IS_ERR(fsdata))
36167+ return PTR_ERR(fsdata);
36168+
36169+ spin_lock_inode(file->f_dentry->d_inode);
36170+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
36171+ *hint = fsdata->reg.hint;
36172+ init_lh(&hint->lh);
36173+ hint->ext_coord.lh = &hint->lh;
36174+ spin_unlock_inode(file->f_dentry->d_inode);
36175+ /*
36176+ * force re-validation of the coord on the first
36177+ * iteration of the read/write loop.
36178+ */
36179+ hint->ext_coord.valid = 0;
36180+ assert("nikita-19892", coords_equal(&hint->seal.coord1,
36181+ &hint->ext_coord.
36182+ coord));
36183+ return 0;
36184+ }
36185+ memset(&fsdata->reg.hint, 0, sizeof(hint_t));
36186+ spin_unlock_inode(file->f_dentry->d_inode);
36187+ }
36188+ hint_init_zero(hint);
36189+ return 0;
36190+}
36191+
36192+/**
36193+ * save_file_hint - copy hint to reiser4 private struct file's part
36194+ * @file: file to save hint in
36195+ * @hint: hint to save
36196+ *
36197+ * This copies @hint to reiser4 private part of struct file. It can help
36198+ * speedup future accesses to the file.
36199+ */
36200+void save_file_hint(struct file *file, const hint_t *hint)
36201+{
36202+ reiser4_file_fsdata *fsdata;
36203+
36204+ assert("edward-1337", hint != NULL);
36205+
36206+ if (!file || !reiser4_seal_is_set(&hint->seal))
36207+ return;
36208+ fsdata = reiser4_get_file_fsdata(file);
36209+ assert("vs-965", !IS_ERR(fsdata));
36210+ assert("nikita-19891",
36211+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
36212+ assert("vs-30", hint->lh.owner == NULL);
36213+ spin_lock_inode(file->f_dentry->d_inode);
36214+ fsdata->reg.hint = *hint;
36215+ spin_unlock_inode(file->f_dentry->d_inode);
36216+ return;
36217+}
36218+
36219+void reiser4_unset_hint(hint_t * hint)
36220+{
36221+ assert("vs-1315", hint);
36222+ hint->ext_coord.valid = 0;
36223+ reiser4_seal_done(&hint->seal);
36224+ done_lh(&hint->lh);
36225+}
36226+
36227+/* coord must be set properly. So, that reiser4_set_hint
36228+ has nothing to do */
36229+void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
36230+ znode_lock_mode mode)
36231+{
36232+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
36233+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
36234+
36235+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
36236+ hint->offset = get_key_offset(key);
36237+ hint->mode = mode;
36238+ done_lh(&hint->lh);
36239+}
36240+
36241+int hint_is_set(const hint_t * hint)
36242+{
36243+ return reiser4_seal_is_set(&hint->seal);
36244+}
36245+
36246+#if REISER4_DEBUG
36247+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
36248+{
36249+ return (get_key_locality(k1) == get_key_locality(k2) &&
36250+ get_key_type(k1) == get_key_type(k2) &&
36251+ get_key_band(k1) == get_key_band(k2) &&
36252+ get_key_ordering(k1) == get_key_ordering(k2) &&
36253+ get_key_objectid(k1) == get_key_objectid(k2));
36254+}
36255+#endif
36256+
36257+static int
36258+hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
36259+ znode_lock_mode lock_mode)
36260+{
36261+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
36262+ /* hint either not set or set by different operation */
36263+ return RETERR(-E_REPEAT);
36264+
36265+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
36266+
36267+ if (check_key && get_key_offset(key) != hint->offset)
36268+ /* hint is set for different key */
36269+ return RETERR(-E_REPEAT);
36270+
36271+ assert("vs-31", hint->ext_coord.lh == &hint->lh);
36272+ return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
36273+ hint->ext_coord.lh, lock_mode,
36274+ ZNODE_LOCK_LOPRI);
36275+}
36276+
36277+/**
36278+ * find_or_create_extent -
36279+ * @page:
36280+ *
36281+ *
36282+ */
36283+/* look for place at twig level for extent corresponding to page, call extent's writepage method to create
36284+ unallocated extent if it does not exist yet, initialize jnode, capture page */
36285+int find_or_create_extent(struct page *page)
36286+{
36287+ int result;
36288+ struct inode *inode;
36289+ int plugged_hole;
36290+
36291+ jnode *node;
36292+
36293+ assert("vs-1065", page->mapping && page->mapping->host);
36294+ inode = page->mapping->host;
36295+
36296+ lock_page(page);
36297+ node = jnode_of_page(page);
36298+ if (IS_ERR(node)) {
36299+ unlock_page(page);
36300+ return PTR_ERR(node);
36301+ }
36302+ JF_SET(node, JNODE_WRITE_PREPARED);
36303+ unlock_page(page);
36304+
36305+ if (node->blocknr == 0) {
36306+ plugged_hole = 0;
36307+ result = reiser4_update_extent(inode, node, page_offset(page),
36308+ &plugged_hole);
36309+ if (result) {
36310+ JF_CLR(node, JNODE_WRITE_PREPARED);
36311+ jput(node);
36312+ warning("", "reiser4_update_extent failed: %d", result);
36313+ return result;
36314+ }
36315+ if (plugged_hole)
36316+ reiser4_update_sd(inode);
36317+ } else {
36318+ spin_lock_jnode(node);
36319+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
36320+ BUG_ON(result != 0);
36321+ jnode_make_dirty_locked(node);
36322+ spin_unlock_jnode(node);
36323+ }
36324+
36325+ BUG_ON(node->atom == NULL);
36326+ JF_CLR(node, JNODE_WRITE_PREPARED);
36327+ jput(node);
36328+
36329+ if (get_current_context()->entd) {
36330+ entd_context *ent = get_entd_context(node->tree->super);
36331+
36332+ if (ent->cur_request->page == page)
36333+ ent->cur_request->node = node;
36334+ }
36335+ return 0;
36336+}
36337+
36338+/**
36339+ * has_anonymous_pages - check whether inode has pages dirtied via mmap
36340+ * @inode: inode to check
36341+ *
36342+ * Returns true if inode's mapping has dirty pages which do not belong to any
36343+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
36344+ * tree or were eflushed and can be found via jnodes tagged
36345+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
36346+ */
36347+static int has_anonymous_pages(struct inode *inode)
36348+{
36349+ int result;
36350+
36351+ read_lock_irq(&inode->i_mapping->tree_lock);
36352+ result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
36353+ read_unlock_irq(&inode->i_mapping->tree_lock);
36354+ return result;
36355+}
36356+
36357+/**
36358+ * capture_page_and_create_extent -
36359+ * @page: page to be captured
36360+ *
36361+ * Grabs space for extent creation and stat data update and calls function to
36362+ * do actual work.
36363+ */
36364+static int capture_page_and_create_extent(struct page *page)
36365+{
36366+ int result;
36367+ struct inode *inode;
36368+
36369+ assert("vs-1084", page->mapping && page->mapping->host);
36370+ inode = page->mapping->host;
36371+ assert("vs-1139",
36372+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
36373+ /* page belongs to file */
36374+ assert("vs-1393",
36375+ inode->i_size > page_offset(page));
36376+
36377+ /* page capture may require extent creation (if it does not exist yet)
36378+ and stat data's update (number of blocks changes on extent
36379+ creation) */
36380+ grab_space_enable();
36381+ result = reiser4_grab_space(2 * estimate_one_insert_into_item
36382+ (reiser4_tree_by_inode(inode)),
36383+ BA_CAN_COMMIT);
36384+ if (likely(!result))
36385+ result = find_or_create_extent(page);
36386+
36387+ if (result != 0)
36388+ SetPageError(page);
36389+ return result;
36390+}
36391+
36392+/* this is implementation of method commit_write of struct
36393+ address_space_operations for unix file plugin */
36394+int
36395+commit_write_unix_file(struct file *file, struct page *page,
36396+ unsigned from, unsigned to)
36397+{
36398+ reiser4_context *ctx;
36399+ struct inode *inode;
36400+ int result;
36401+
36402+ assert("umka-3101", file != NULL);
36403+ assert("umka-3102", page != NULL);
36404+ assert("umka-3093", PageLocked(page));
36405+
36406+ SetPageUptodate(page);
36407+
36408+ inode = page->mapping->host;
36409+ ctx = reiser4_init_context(page->mapping->host->i_sb);
36410+ if (IS_ERR(ctx))
36411+ return PTR_ERR(ctx);
36412+ page_cache_get(page);
36413+ unlock_page(page);
36414+ result = capture_page_and_create_extent(page);
36415+ lock_page(page);
36416+ page_cache_release(page);
36417+
36418+ /* don't commit transaction under inode semaphore */
36419+ context_set_commit_async(ctx);
36420+ reiser4_exit_context(ctx);
36421+ return result;
36422+}
36423+
36424+/*
36425+ * Support for "anonymous" pages and jnodes.
36426+ *
36427+ * When file is write-accessed through mmap pages can be dirtied from the user
36428+ * level. In this case kernel is not notified until one of following happens:
36429+ *
36430+ * (1) msync()
36431+ *
36432+ * (2) truncate() (either explicit or through unlink)
36433+ *
36434+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before
36435+ * starting write-back.
36436+ *
36437+ * As a result of (3) ->writepage may be called on a dirty page without
36438+ * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
36439+ * (iozone) generate huge number of anonymous pages. Emergency flush handles
36440+ * this situation by creating jnode for anonymous page, starting IO on the
36441+ * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
36442+ * memory. Such jnode is also called anonymous.
36443+ *
36444+ * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
36445+ * tree. This is done by capture_anonymous_*() functions below.
36446+ */
36447+
36448+/**
36449+ * capture_anonymous_page - involve page into transaction
36450+ * @pg: page to deal with
36451+ *
36452+ * Takes care that @page has corresponding metadata in the tree, creates jnode
36453+ * for @page and captures it. On success 1 is returned.
36454+ */
36455+static int capture_anonymous_page(struct page *page)
36456+{
36457+ int result;
36458+
36459+ if (PageWriteback(page))
36460+ /* FIXME: do nothing? */
36461+ return 0;
36462+
36463+ result = capture_page_and_create_extent(page);
36464+ if (result == 0) {
36465+ result = 1;
36466+ } else
36467+ warning("nikita-3329",
36468+ "Cannot capture anon page: %i", result);
36469+
36470+ return result;
36471+}
36472+
36473+/**
36474+ * capture_anonymous_pages - find and capture pages dirtied via mmap
36475+ * @mapping: address space where to look for pages
36476+ * @index: start index
36477+ * @to_capture: maximum number of pages to capture
36478+ *
36479+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
36480+ * captures (involves into atom) them, returns number of captured pages,
36481+ * updates @index to next page after the last captured one.
36482+ */
36483+static int
36484+capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
36485+ unsigned int to_capture)
36486+{
36487+ int result;
36488+ struct pagevec pvec;
36489+ unsigned int i, count;
36490+ int nr;
36491+
36492+ pagevec_init(&pvec, 0);
36493+ count = min(pagevec_space(&pvec), to_capture);
36494+ nr = 0;
36495+
36496+ /* find pages tagged MOVED */
36497+ write_lock_irq(&mapping->tree_lock);
36498+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
36499+ (void **)pvec.pages, *index, count,
36500+ PAGECACHE_TAG_REISER4_MOVED);
36501+ if (pagevec_count(&pvec) == 0) {
36502+ /*
36503+ * there are no pages tagged MOVED in mapping->page_tree
36504+ * starting from *index
36505+ */
36506+ write_unlock_irq(&mapping->tree_lock);
36507+ *index = (pgoff_t)-1;
36508+ return 0;
36509+ }
36510+
36511+ /* clear MOVED tag for all found pages */
36512+ for (i = 0; i < pagevec_count(&pvec); i++) {
36513+ void *p;
36514+
36515+ page_cache_get(pvec.pages[i]);
36516+ p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36517+ PAGECACHE_TAG_REISER4_MOVED);
36518+ assert("vs-49", p == pvec.pages[i]);
36519+ }
36520+ write_unlock_irq(&mapping->tree_lock);
36521+
36522+
36523+ *index = pvec.pages[i - 1]->index + 1;
36524+
36525+ for (i = 0; i < pagevec_count(&pvec); i++) {
36526+ /*
36527+ * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36528+ * reiser4_set_page_dirty_internal which is called when jnode is
36529+ * captured
36530+ */
36531+ result = capture_anonymous_page(pvec.pages[i]);
36532+ if (result == 1)
36533+ nr++;
36534+ else {
36535+ if (result < 0) {
36536+ warning("vs-1454",
36537+ "failed to capture page: "
36538+ "result=%d, captured=%d)\n",
36539+ result, i);
36540+
36541+ /*
36542+ * set MOVED tag to all pages which left not
36543+ * captured
36544+ */
36545+ write_lock_irq(&mapping->tree_lock);
36546+ for (; i < pagevec_count(&pvec); i ++) {
36547+ radix_tree_tag_set(&mapping->page_tree,
36548+ pvec.pages[i]->index,
36549+ PAGECACHE_TAG_REISER4_MOVED);
36550+ }
36551+ write_unlock_irq(&mapping->tree_lock);
36552+
36553+ pagevec_release(&pvec);
36554+ return result;
36555+ } else {
36556+ /*
36557+ * result == 0. capture_anonymous_page returns
36558+ * 0 for Writeback-ed page. Set MOVED tag on
36559+ * that page
36560+ */
36561+ write_lock_irq(&mapping->tree_lock);
36562+ radix_tree_tag_set(&mapping->page_tree,
36563+ pvec.pages[i]->index,
36564+ PAGECACHE_TAG_REISER4_MOVED);
36565+ write_unlock_irq(&mapping->tree_lock);
36566+ if (i == 0)
36567+ *index = pvec.pages[0]->index;
36568+ else
36569+ *index = pvec.pages[i - 1]->index + 1;
36570+ }
36571+ }
36572+ }
36573+ pagevec_release(&pvec);
36574+ return nr;
36575+}
36576+
36577+/**
36578+ * capture_anonymous_jnodes - find and capture anonymous jnodes
36579+ * @mapping: address space where to look for jnodes
36580+ * @from: start index
36581+ * @to: end index
36582+ * @to_capture: maximum number of jnodes to capture
36583+ *
36584+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36585+ * the range of indexes @from-@to and captures them, returns number of captured
36586+ * jnodes, updates @from to next jnode after the last captured one.
36587+ */
36588+static int
36589+capture_anonymous_jnodes(struct address_space *mapping,
36590+ pgoff_t *from, pgoff_t to, int to_capture)
36591+{
36592+ *from = to;
36593+ return 0;
36594+}
36595+
36596+/*
36597+ * Commit atom of the jnode of a page.
36598+ */
36599+static int sync_page(struct page *page)
36600+{
36601+ int result;
36602+ do {
36603+ jnode *node;
36604+ txn_atom *atom;
36605+
36606+ lock_page(page);
36607+ node = jprivate(page);
36608+ if (node != NULL) {
36609+ spin_lock_jnode(node);
36610+ atom = jnode_get_atom(node);
36611+ spin_unlock_jnode(node);
36612+ } else
36613+ atom = NULL;
36614+ unlock_page(page);
36615+ result = reiser4_sync_atom(atom);
36616+ } while (result == -E_REPEAT);
36617+ /*
36618+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36619+ * handle the case where more pages get added to the atom while we are
36620+ * syncing it?
36621+ */
36622+ assert("nikita-3485", ergo(result == 0,
36623+ get_current_context()->trans->atom == NULL));
36624+ return result;
36625+}
36626+
36627+/*
36628+ * Commit atoms of pages on @pages list.
36629+ * call sync_page for each page from mapping's page tree
36630+ */
36631+static int sync_page_list(struct inode *inode)
36632+{
36633+ int result;
36634+ struct address_space *mapping;
36635+ unsigned long from; /* start index for radix_tree_gang_lookup */
36636+ unsigned int found; /* return value for radix_tree_gang_lookup */
36637+
36638+ mapping = inode->i_mapping;
36639+ from = 0;
36640+ result = 0;
36641+ read_lock_irq(&mapping->tree_lock);
36642+ while (result == 0) {
36643+ struct page *page;
36644+
36645+ found =
36646+ radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36647+ from, 1);
36648+ assert("", found < 2);
36649+ if (found == 0)
36650+ break;
36651+
36652+ /* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by
36653+ sys_fsync */
36654+ page_cache_get(page);
36655+ read_unlock_irq(&mapping->tree_lock);
36656+
36657+ from = page->index + 1;
36658+
36659+ result = sync_page(page);
36660+
36661+ page_cache_release(page);
36662+ read_lock_irq(&mapping->tree_lock);
36663+ }
36664+
36665+ read_unlock_irq(&mapping->tree_lock);
36666+ return result;
36667+}
36668+
36669+static int commit_file_atoms(struct inode *inode)
36670+{
36671+ int result;
36672+ unix_file_info_t *uf_info;
36673+
36674+ uf_info = unix_file_inode_data(inode);
36675+
36676+ get_exclusive_access(uf_info);
36677+ /*
36678+ * find what items file is made from
36679+ */
36680+ result = find_file_state(inode, uf_info);
36681+ drop_exclusive_access(uf_info);
36682+ if (result != 0)
36683+ return result;
36684+
36685+ /*
36686+ * file state cannot change because we are under ->i_mutex
36687+ */
36688+ switch (uf_info->container) {
36689+ case UF_CONTAINER_EXTENTS:
36690+ /* find_file_state might open join an atom */
36691+ reiser4_txn_restart_current();
36692+ result =
36693+ /*
36694+ * when we are called by
36695+ * filemap_fdatawrite->
36696+ * do_writepages()->
36697+ * reiser4_writepages()
36698+ *
36699+ * inode->i_mapping->dirty_pages are spices into
36700+ * ->io_pages, leaving ->dirty_pages dirty.
36701+ *
36702+ * When we are called from
36703+ * reiser4_fsync()->sync_unix_file(), we have to
36704+ * commit atoms of all pages on the ->dirty_list.
36705+ *
36706+ * So for simplicity we just commit ->io_pages and
36707+ * ->dirty_pages.
36708+ */
36709+ sync_page_list(inode);
36710+ break;
36711+ case UF_CONTAINER_TAILS:
36712+ /*
36713+ * NOTE-NIKITA probably we can be smarter for tails. For now
36714+ * just commit all existing atoms.
36715+ */
36716+ result = txnmgr_force_commit_all(inode->i_sb, 0);
36717+ break;
36718+ case UF_CONTAINER_EMPTY:
36719+ result = 0;
36720+ break;
36721+ case UF_CONTAINER_UNKNOWN:
36722+ default:
36723+ result = -EIO;
36724+ break;
36725+ }
36726+
36727+ /*
36728+ * commit current transaction: there can be captured nodes from
36729+ * find_file_state() and finish_conversion().
36730+ */
36731+ reiser4_txn_restart_current();
36732+ return result;
36733+}
36734+
36735+/**
36736+ * writepages_unix_file - writepages of struct address_space_operations
36737+ * @mapping:
36738+ * @wbc:
36739+ *
36740+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36741+ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
36742+ * created by reiser4_writepage.
36743+ */
36744+int writepages_unix_file(struct address_space *mapping,
36745+ struct writeback_control *wbc)
36746+{
36747+ int result;
36748+ unix_file_info_t *uf_info;
36749+ pgoff_t pindex, jindex, nr_pages;
36750+ long to_capture;
36751+ struct inode *inode;
36752+
36753+ inode = mapping->host;
36754+ if (!has_anonymous_pages(inode)) {
36755+ result = 0;
36756+ goto end;
36757+ }
36758+ jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
36759+ result = 0;
36760+ nr_pages =
36761+ (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
36762+ uf_info = unix_file_inode_data(inode);
36763+
36764+ do {
36765+ reiser4_context *ctx;
36766+
36767+ if (wbc->sync_mode != WB_SYNC_ALL)
36768+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36769+ else
36770+ to_capture = CAPTURE_APAGE_BURST;
36771+
36772+ ctx = reiser4_init_context(inode->i_sb);
36773+ if (IS_ERR(ctx)) {
36774+ result = PTR_ERR(ctx);
36775+ break;
36776+ }
36777+ /* avoid recursive calls to ->sync_inodes */
36778+ ctx->nobalance = 1;
36779+ assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36780+ assert("", LOCK_CNT_NIL(inode_sem_w));
36781+ assert("", LOCK_CNT_NIL(inode_sem_r));
36782+
36783+ reiser4_txn_restart_current();
36784+
36785+ /* we have to get nonexclusive access to the file */
36786+ if (get_current_context()->entd) {
36787+ /*
36788+ * use nonblocking version of nonexclusive_access to
36789+ * avoid deadlock which might look like the following:
36790+ * process P1 holds NEA on file F1 and called entd to
36791+ * reclaim some memory. Entd works for P1 and is going
36792+ * to capture pages of file F2. To do that entd has to
36793+ * get NEA to F2. F2 is held by process P2 which also
36794+ * called entd. But entd is serving P1 at the moment
36795+ * and P2 has to wait. Process P3 trying to get EA to
36796+ * file F2. Existence of pending EA request to file F2
36797+ * makes impossible for entd to get NEA to file
36798+ * F2. Neither of these process can continue. Using
36799+ * nonblocking version of gettign NEA is supposed to
36800+ * avoid this deadlock.
36801+ */
36802+ if (try_to_get_nonexclusive_access(uf_info) == 0) {
36803+ result = RETERR(-EBUSY);
36804+ reiser4_exit_context(ctx);
36805+ break;
36806+ }
36807+ } else
36808+ get_nonexclusive_access(uf_info);
36809+
36810+ while (to_capture > 0) {
36811+ pgoff_t start;
36812+
36813+ assert("vs-1727", jindex <= pindex);
36814+ if (pindex == jindex) {
36815+ start = pindex;
36816+ result =
36817+ capture_anonymous_pages(inode->i_mapping,
36818+ &pindex,
36819+ to_capture);
36820+ if (result <= 0)
36821+ break;
36822+ to_capture -= result;
36823+ wbc->nr_to_write -= result;
36824+ if (start + result == pindex) {
36825+ jindex = pindex;
36826+ continue;
36827+ }
36828+ if (to_capture <= 0)
36829+ break;
36830+ }
36831+ /* deal with anonymous jnodes between jindex and pindex */
36832+ result =
36833+ capture_anonymous_jnodes(inode->i_mapping, &jindex,
36834+ pindex, to_capture);
36835+ if (result < 0)
36836+ break;
36837+ to_capture -= result;
36838+ get_current_context()->nr_captured += result;
36839+
36840+ if (jindex == (pgoff_t) - 1) {
36841+ assert("vs-1728", pindex == (pgoff_t) - 1);
36842+ break;
36843+ }
36844+ }
36845+ if (to_capture <= 0)
36846+ /* there may be left more pages */
36847+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
36848+
36849+ drop_nonexclusive_access(uf_info);
36850+ if (result < 0) {
36851+ /* error happened */
36852+ reiser4_exit_context(ctx);
36853+ return result;
36854+ }
36855+ if (wbc->sync_mode != WB_SYNC_ALL) {
36856+ reiser4_exit_context(ctx);
36857+ return 0;
36858+ }
36859+ result = commit_file_atoms(inode);
36860+ reiser4_exit_context(ctx);
36861+ if (pindex >= nr_pages && jindex == pindex)
36862+ break;
36863+ } while (1);
36864+
36865+ end:
36866+ if (is_in_reiser4_context()) {
36867+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
36868+ /*
36869+ * there are already pages to flush, flush them out, do
36870+ * not delay until end of reiser4_sync_inodes
36871+ */
36872+ reiser4_writeout(inode->i_sb, wbc);
36873+ get_current_context()->nr_captured = 0;
36874+ }
36875+ }
36876+ return result;
36877+}
36878+
36879+/*
36880+ * ->sync() method for unix file.
36881+ *
36882+ * We are trying to be smart here. Instead of committing all atoms (original
36883+ * solution), we scan dirty pages of this file and commit all atoms they are
36884+ * part of.
36885+ *
36886+ * Situation is complicated by anonymous pages: i.e., extent-less pages
36887+ * dirtied through mmap. Fortunately sys_fsync() first calls
36888+ * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
36889+ * all missing extents and capture anonymous pages.
36890+ */
36891+int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
36892+{
36893+ reiser4_context *ctx;
36894+ txn_atom *atom;
36895+ reiser4_block_nr reserve;
36896+
36897+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
36898+ if (IS_ERR(ctx))
36899+ return PTR_ERR(ctx);
36900+
36901+ reserve = estimate_update_common(dentry->d_inode);
36902+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
36903+ reiser4_exit_context(ctx);
36904+ return RETERR(-ENOSPC);
36905+ }
36906+ write_sd_by_inode_common(dentry->d_inode);
36907+
36908+ atom = get_current_atom_locked();
36909+ spin_lock_txnh(ctx->trans);
36910+ force_commit_atom(ctx->trans);
36911+ reiser4_exit_context(ctx);
36912+ return 0;
36913+}
36914+
36915+/**
36916+ * readpage_unix_file_nolock - readpage of struct address_space_operations
36917+ * @file:
36918+ * @page:
36919+ *
36920+ * Compose a key and search for item containing information about @page
36921+ * data. If item is found - its readpage method is called.
36922+ */
36923+int readpage_unix_file(struct file *file, struct page *page)
36924+{
36925+ reiser4_context *ctx;
36926+ int result;
36927+ struct inode *inode;
36928+ reiser4_key key;
36929+ item_plugin *iplug;
36930+ hint_t *hint;
36931+ lock_handle *lh;
36932+ coord_t *coord;
36933+
36934+ assert("vs-1062", PageLocked(page));
36935+ assert("vs-976", !PageUptodate(page));
36936+ assert("vs-1061", page->mapping && page->mapping->host);
36937+
36938+ if (page->mapping->host->i_size <= page_offset(page)) {
36939+ /* page is out of file already */
36940+ unlock_page(page);
36941+ return -EINVAL;
36942+ }
36943+
36944+ inode = page->mapping->host;
36945+ ctx = reiser4_init_context(inode->i_sb);
36946+ if (IS_ERR(ctx)) {
36947+ unlock_page(page);
36948+ return PTR_ERR(ctx);
36949+ }
36950+
36951+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36952+ if (hint == NULL) {
36953+ unlock_page(page);
36954+ reiser4_exit_context(ctx);
36955+ return RETERR(-ENOMEM);
36956+ }
36957+
36958+ result = load_file_hint(file, hint);
36959+ if (result) {
36960+ kfree(hint);
36961+ unlock_page(page);
36962+ reiser4_exit_context(ctx);
36963+ return result;
36964+ }
36965+ lh = &hint->lh;
36966+
36967+ /* get key of first byte of the page */
36968+ key_by_inode_and_offset_common(inode, page_offset(page), &key);
36969+
36970+ /* look for file metadata corresponding to first byte of page */
36971+ page_cache_get(page);
36972+ unlock_page(page);
36973+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
36974+ lock_page(page);
36975+ page_cache_release(page);
36976+
36977+ if (page->mapping == NULL) {
36978+ /*
36979+ * readpage allows truncate to run concurrently. Page was
36980+ * truncated while it was not locked
36981+ */
36982+ done_lh(lh);
36983+ kfree(hint);
36984+ unlock_page(page);
36985+ reiser4_txn_restart(ctx);
36986+ reiser4_exit_context(ctx);
36987+ return -EINVAL;
36988+ }
36989+
36990+ if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
36991+ if (result == CBK_COORD_FOUND &&
36992+ hint->ext_coord.coord.between != AT_UNIT)
36993+ /* file is truncated */
36994+ result = -EINVAL;
36995+ done_lh(lh);
36996+ kfree(hint);
36997+ unlock_page(page);
36998+ reiser4_txn_restart(ctx);
36999+ reiser4_exit_context(ctx);
37000+ return result;
37001+ }
37002+
37003+ /*
37004+ * item corresponding to page is found. It can not be removed because
37005+ * znode lock is held
37006+ */
37007+ if (PageUptodate(page)) {
37008+ done_lh(lh);
37009+ kfree(hint);
37010+ unlock_page(page);
37011+ reiser4_txn_restart(ctx);
37012+ reiser4_exit_context(ctx);
37013+ return 0;
37014+ }
37015+
37016+ coord = &hint->ext_coord.coord;
37017+ result = zload(coord->node);
37018+ if (result) {
37019+ done_lh(lh);
37020+ kfree(hint);
37021+ unlock_page(page);
37022+ reiser4_txn_restart(ctx);
37023+ reiser4_exit_context(ctx);
37024+ return result;
37025+ }
37026+
37027+ validate_extended_coord(&hint->ext_coord, page_offset(page));
37028+
37029+ if (!coord_is_existing_unit(coord)) {
37030+ /* this indicates corruption */
37031+ warning("vs-280",
37032+ "Looking for page %lu of file %llu (size %lli). "
37033+ "No file items found (%d). File is corrupted?\n",
37034+ page->index, (unsigned long long)get_inode_oid(inode),
37035+ inode->i_size, result);
37036+ zrelse(coord->node);
37037+ done_lh(lh);
37038+ kfree(hint);
37039+ unlock_page(page);
37040+ reiser4_txn_restart(ctx);
37041+ reiser4_exit_context(ctx);
37042+ return RETERR(-EIO);
37043+ }
37044+
37045+ /*
37046+ * get plugin of found item or use plugin if extent if there are no
37047+ * one
37048+ */
37049+ iplug = item_plugin_by_coord(coord);
37050+ if (iplug->s.file.readpage)
37051+ result = iplug->s.file.readpage(coord, page);
37052+ else
37053+ result = RETERR(-EINVAL);
37054+
37055+ if (!result) {
37056+ set_key_offset(&key,
37057+ (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
37058+ /* FIXME should call reiser4_set_hint() */
37059+ reiser4_unset_hint(hint);
37060+ } else {
37061+ unlock_page(page);
37062+ reiser4_unset_hint(hint);
37063+ }
37064+ assert("vs-979",
37065+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
37066+ assert("vs-9791", ergo(result != 0, !PageLocked(page)));
37067+
37068+ zrelse(coord->node);
37069+ done_lh(lh);
37070+
37071+ save_file_hint(file, hint);
37072+ kfree(hint);
37073+
37074+ /*
37075+ * FIXME: explain why it is needed. HINT: page allocation in write can
37076+ * not be done when atom is not NULL because reiser4_writepage can not
37077+ * kick entd and have to eflush
37078+ */
37079+ reiser4_txn_restart(ctx);
37080+ reiser4_exit_context(ctx);
37081+ return result;
37082+}
37083+
37084+struct uf_readpages_context {
37085+ lock_handle lh;
37086+ coord_t coord;
37087+};
37088+
37089+/* A callback function for readpages_unix_file/read_cache_pages.
37090+ * If the file is build of tails, then return error (-ENOENT).
37091+ *
37092+ * @data -- a pointer to reiser4_readpages_context object,
37093+ * to save the twig lock and the coord between
37094+ * read_cache_page iterations.
37095+ * @page -- page to start read.
37096+ */
37097+static int uf_readpages_filler(void * data, struct page * page)
37098+{
37099+ struct uf_readpages_context *rc = data;
37100+ jnode * node;
37101+ int ret = 0;
37102+ reiser4_extent *ext;
37103+ __u64 ext_index;
37104+ int cbk_done = 0;
37105+ struct address_space * mapping = page->mapping;
37106+
37107+ if (PageUptodate(page)) {
37108+ unlock_page(page);
37109+ return 0;
37110+ }
37111+ if (rc->lh.node == 0) {
37112+ /* no twig lock - have to do tree search. */
37113+ reiser4_key key;
37114+ repeat:
37115+ unlock_page(page);
37116+ key_by_inode_and_offset_common(
37117+ mapping->host, page_offset(page), &key);
37118+ ret = coord_by_key(
37119+ &get_super_private(mapping->host->i_sb)->tree,
37120+ &key, &rc->coord, &rc->lh,
37121+ ZNODE_READ_LOCK, FIND_EXACT,
37122+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
37123+ if (ret)
37124+ return ret;
37125+ lock_page(page);
37126+ cbk_done = 1;
37127+ }
37128+ ret = zload(rc->coord.node);
37129+ if (ret) {
37130+ unlock_page(page);
37131+ return ret;
37132+ }
37133+ if (!coord_is_existing_item(&rc->coord) ||
37134+ !item_is_extent(&rc->coord)) {
37135+ zrelse(rc->coord.node);
37136+ unlock_page(page);
37137+ return RETERR(-EIO);
37138+ }
37139+ ext = extent_by_coord(&rc->coord);
37140+ ext_index = extent_unit_index(&rc->coord);
37141+ if (page->index < ext_index ||
37142+ page->index >= ext_index + extent_get_width(ext)) {
37143+ /* the page index doesn't belong to the extent unit
37144+ which the coord points to - release the lock and
37145+ repeat with tree search. */
37146+ zrelse(rc->coord.node);
37147+ done_lh(&rc->lh);
37148+ /* we can be here after a CBK call only in case of
37149+ corruption of the tree or the tree lookup algorithm bug. */
37150+ if (unlikely(cbk_done)) {
37151+ unlock_page(page);
37152+ return RETERR(-EIO);
37153+ }
37154+ goto repeat;
37155+ }
37156+ node = jnode_of_page(page);
37157+ if (unlikely(IS_ERR(node))) {
37158+ zrelse(rc->coord.node);
37159+ unlock_page(page);
37160+ return PTR_ERR(node);
37161+ }
37162+ ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
37163+ jput(node);
37164+ zrelse(rc->coord.node);
37165+ if (ret)
37166+ unlock_page(page);
37167+ return ret;
37168+}
37169+
37170+/**
37171+ * readpages_unix_file - called by the readahead code, starts reading for each
37172+ * page of given list of pages
37173+ */
37174+int readpages_unix_file(
37175+ struct file *file, struct address_space *mapping,
37176+ struct list_head *pages, unsigned nr_pages)
37177+{
37178+ reiser4_context *ctx;
37179+ struct uf_readpages_context rc;
37180+ int ret;
37181+
37182+ ctx = reiser4_init_context(mapping->host->i_sb);
37183+ if (IS_ERR(ctx)) {
37184+ put_pages_list(pages);
37185+ return PTR_ERR(ctx);
37186+ }
37187+ init_lh(&rc.lh);
37188+ ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
37189+ done_lh(&rc.lh);
37190+ context_set_commit_async(ctx);
37191+ /* close the transaction to protect further page allocation from deadlocks */
37192+ reiser4_txn_restart(ctx);
37193+ reiser4_exit_context(ctx);
37194+ return ret;
37195+}
37196+
37197+static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
37198+ loff_t count UNUSED_ARG)
37199+{
37200+ /* We should reserve one block, because of updating of the stat data
37201+ item */
37202+ assert("vs-1249",
37203+ inode_file_plugin(inode)->estimate.update ==
37204+ estimate_update_common);
37205+ return estimate_update_common(inode);
37206+}
37207+
37208+/* this is called with nonexclusive access obtained, file's container can not change */
37209+static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from to */
37210+ char __user *buf, /* address of user-space buffer */
37211+ size_t count, /* number of bytes to read */
37212+ loff_t *off)
37213+{
37214+ int result;
37215+ struct inode *inode;
37216+ flow_t flow;
37217+ int (*read_f) (struct file *, flow_t *, hint_t *);
37218+ coord_t *coord;
37219+ znode *loaded;
37220+
37221+ inode = file->f_dentry->d_inode;
37222+
37223+ /* build flow */
37224+ assert("vs-1250",
37225+ inode_file_plugin(inode)->flow_by_inode ==
37226+ flow_by_inode_unix_file);
37227+ result =
37228+ flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
37229+ *off, READ_OP, &flow);
37230+ if (unlikely(result))
37231+ return result;
37232+
37233+ /* get seal and coord sealed with it from reiser4 private data
37234+ of struct file. The coord will tell us where our last read
37235+ of this file finished, and the seal will help to determine
37236+ if that location is still valid.
37237+ */
37238+ coord = &hint->ext_coord.coord;
37239+ while (flow.length && result == 0) {
37240+ result =
37241+ find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
37242+ if (cbk_errored(result))
37243+ /* error happened */
37244+ break;
37245+
37246+ if (coord->between != AT_UNIT) {
37247+ /* there were no items corresponding to given offset */
37248+ done_lh(hint->ext_coord.lh);
37249+ break;
37250+ }
37251+
37252+ loaded = coord->node;
37253+ result = zload(loaded);
37254+ if (unlikely(result)) {
37255+ done_lh(hint->ext_coord.lh);
37256+ break;
37257+ }
37258+
37259+ if (hint->ext_coord.valid == 0)
37260+ validate_extended_coord(&hint->ext_coord,
37261+ get_key_offset(&flow.key));
37262+
37263+ assert("vs-4", hint->ext_coord.valid == 1);
37264+ assert("vs-33", hint->ext_coord.lh == &hint->lh);
37265+ /* call item's read method */
37266+ read_f = item_plugin_by_coord(coord)->s.file.read;
37267+ result = read_f(file, &flow, hint);
37268+ zrelse(loaded);
37269+ done_lh(hint->ext_coord.lh);
37270+ }
37271+
37272+ return (count - flow.length) ? (count - flow.length) : result;
37273+}
37274+
37275+static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
37276+
37277+/**
37278+ * read_unix_file - read of struct file_operations
37279+ * @file: file to read from
37280+ * @buf: address of user-space buffer
37281+ * @read_amount: number of bytes to read
37282+ * @off: position in file to read from
37283+ *
37284+ * This is implementation of vfs's read method of struct file_operations for
37285+ * unix file plugin.
37286+ */
37287+ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
37288+ loff_t *off)
37289+{
37290+ reiser4_context *ctx;
37291+ ssize_t result;
37292+ struct inode *inode;
37293+ unix_file_info_t *uf_info;
37294+
37295+ if (unlikely(read_amount == 0))
37296+ return 0;
37297+
37298+ assert("umka-072", file != NULL);
37299+ assert("umka-074", off != NULL);
37300+ inode = file->f_dentry->d_inode;
37301+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37302+
37303+ ctx = reiser4_init_context(inode->i_sb);
37304+ if (IS_ERR(ctx))
37305+ return PTR_ERR(ctx);
37306+ uf_info = unix_file_inode_data(inode);
37307+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37308+ get_exclusive_access(uf_info);
37309+ result = find_file_state(inode, uf_info);
37310+ if (unlikely(result != 0))
37311+ goto out;
37312+ } else
37313+ get_nonexclusive_access(uf_info);
37314+ result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
37315+ BA_CAN_COMMIT);
37316+ if (unlikely(result != 0))
37317+ goto out;
37318+ if (uf_info->container == UF_CONTAINER_EXTENTS){
37319+ result = do_sync_read(file, buf, read_amount, off);
37320+ } else if (uf_info->container == UF_CONTAINER_TAILS ||
37321+ reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
37322+ reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37323+ result = read_unix_file_container_tails(file, buf, read_amount, off);
37324+ } else {
37325+ assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
37326+ result = 0;
37327+ }
37328+out:
37329+ drop_access(uf_info);
37330+ context_set_commit_async(ctx);
37331+ reiser4_exit_context(ctx);
37332+ return result;
37333+}
37334+
37335+static ssize_t read_unix_file_container_tails(
37336+ struct file *file, char __user *buf, size_t read_amount, loff_t *off)
37337+{
37338+ int result;
37339+ struct inode *inode;
37340+ hint_t *hint;
37341+ unix_file_info_t *uf_info;
37342+ size_t count, read, left;
37343+ loff_t size;
37344+
37345+ assert("umka-072", file != NULL);
37346+ assert("umka-074", off != NULL);
37347+ inode = file->f_dentry->d_inode;
37348+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37349+
37350+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
37351+ if (hint == NULL)
37352+ return RETERR(-ENOMEM);
37353+
37354+ result = load_file_hint(file, hint);
37355+ if (result) {
37356+ kfree(hint);
37357+ return result;
37358+ }
37359+
37360+ left = read_amount;
37361+ count = 0;
37362+ uf_info = unix_file_inode_data(inode);
37363+ while (left > 0) {
37364+ reiser4_txn_restart_current();
37365+ size = i_size_read(inode);
37366+ if (*off >= size)
37367+ /* position to read from is past the end of file */
37368+ break;
37369+ if (*off + left > size)
37370+ left = size - *off;
37371+ /* faultin user page */
37372+ result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
37373+ if (result)
37374+ return RETERR(-EFAULT);
37375+
37376+ read = read_file(hint, file, buf,
37377+ left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
37378+ off);
37379+ if (read < 0) {
37380+ result = read;
37381+ break;
37382+ }
37383+ left -= read;
37384+ buf += read;
37385+
37386+ /* update position in a file */
37387+ *off += read;
37388+ /* total number of read bytes */
37389+ count += read;
37390+ }
37391+ done_lh(&hint->lh);
37392+ save_file_hint(file, hint);
37393+ kfree(hint);
37394+ if (count)
37395+ file_accessed(file);
37396+ /* return number of read bytes or error code if nothing is read */
37397+ return count ? count : result;
37398+}
37399+
37400+/* This function takes care about @file's pages. First of all it checks if
37401+ filesystems readonly and if so gets out. Otherwise, it throws out all
37402+ pages of file if it was mapped for read and going to be mapped for write
37403+ and consists of tails. This is done in order to not manage few copies
37404+ of the data (first in page cache and second one in tails them selves)
37405+ for the case of mapping files consisting tails.
37406+
37407+ Here also tail2extent conversion is performed if it is allowed and file
37408+ is going to be written or mapped for write. This functions may be called
37409+ from write_unix_file() or mmap_unix_file(). */
37410+static int check_pages_unix_file(struct file *file, struct inode *inode)
37411+{
37412+ reiser4_invalidate_pages(inode->i_mapping, 0,
37413+ (inode->i_size + PAGE_CACHE_SIZE -
37414+ 1) >> PAGE_CACHE_SHIFT, 0);
37415+ return unpack(file, inode, 0 /* not forever */ );
37416+}
37417+
37418+/**
37419+ * mmap_unix_file - mmap of struct file_operations
37420+ * @file: file to mmap
37421+ * @vma:
37422+ *
37423+ * This is implementation of vfs's mmap method of struct file_operations for
37424+ * unix file plugin. It converts file to extent if necessary. Sets
37425+ * reiser4_inode's flag - REISER4_HAS_MMAP.
37426+ */
37427+int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
37428+{
37429+ reiser4_context *ctx;
37430+ int result;
37431+ struct inode *inode;
37432+ unix_file_info_t *uf_info;
37433+ reiser4_block_nr needed;
37434+
37435+ inode = file->f_dentry->d_inode;
37436+ ctx = reiser4_init_context(inode->i_sb);
37437+ if (IS_ERR(ctx))
37438+ return PTR_ERR(ctx);
37439+
37440+ uf_info = unix_file_inode_data(inode);
37441+
37442+ get_exclusive_access(uf_info);
37443+
37444+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
37445+ /*
37446+ * we need file built of extent items. If it is still built of
37447+ * tail items we have to convert it. Find what items the file
37448+ * is built of
37449+ */
37450+ result = find_file_state(inode, uf_info);
37451+ if (result != 0) {
37452+ drop_exclusive_access(uf_info);
37453+ reiser4_exit_context(ctx);
37454+ return result;
37455+ }
37456+
37457+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
37458+ uf_info->container == UF_CONTAINER_EXTENTS ||
37459+ uf_info->container == UF_CONTAINER_EMPTY));
37460+ if (uf_info->container == UF_CONTAINER_TAILS) {
37461+ /*
37462+ * invalidate all pages and convert file from tails to
37463+ * extents
37464+ */
37465+ result = check_pages_unix_file(file, inode);
37466+ if (result) {
37467+ drop_exclusive_access(uf_info);
37468+ reiser4_exit_context(ctx);
37469+ return result;
37470+ }
37471+ }
37472+ }
37473+
37474+ /*
37475+ * generic_file_mmap will do update_atime. Grab space for stat data
37476+ * update.
37477+ */
37478+ needed = inode_file_plugin(inode)->estimate.update(inode);
37479+ result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37480+ if (result) {
37481+ drop_exclusive_access(uf_info);
37482+ reiser4_exit_context(ctx);
37483+ return result;
37484+ }
37485+
37486+ result = generic_file_mmap(file, vma);
37487+ if (result == 0) {
37488+ /* mark file as having mapping. */
37489+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
37490+ }
37491+
37492+ drop_exclusive_access(uf_info);
37493+ reiser4_exit_context(ctx);
37494+ return result;
37495+}
37496+
37497+/**
37498+ * find_first_item
37499+ * @inode:
37500+ *
37501+ * Finds file item which is responsible for first byte in the file.
37502+ */
37503+static int find_first_item(struct inode *inode)
37504+{
37505+ coord_t coord;
37506+ lock_handle lh;
37507+ reiser4_key key;
37508+ int result;
37509+
37510+ coord_init_zero(&coord);
37511+ init_lh(&lh);
37512+ inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37513+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37514+ inode);
37515+ if (result == CBK_COORD_FOUND) {
37516+ if (coord.between == AT_UNIT) {
37517+ result = zload(coord.node);
37518+ if (result == 0) {
37519+ result = item_id_by_coord(&coord);
37520+ zrelse(coord.node);
37521+ if (result != EXTENT_POINTER_ID &&
37522+ result != FORMATTING_ID)
37523+ result = RETERR(-EIO);
37524+ }
37525+ } else
37526+ result = RETERR(-EIO);
37527+ }
37528+ done_lh(&lh);
37529+ return result;
37530+}
37531+
37532+/**
37533+ * open_unix_file
37534+ * @inode:
37535+ * @file:
37536+ *
37537+ * If filesystem is not readonly - complete uncompleted tail conversion if
37538+ * there was one
37539+ */
37540+int open_unix_file(struct inode *inode, struct file *file)
37541+{
37542+ int result;
37543+ reiser4_context *ctx;
37544+ unix_file_info_t *uf_info;
37545+
37546+ if (IS_RDONLY(inode))
37547+ return 0;
37548+
37549+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
37550+ return 0;
37551+
37552+ ctx = reiser4_init_context(inode->i_sb);
37553+ if (IS_ERR(ctx))
37554+ return PTR_ERR(ctx);
37555+
37556+ uf_info = unix_file_inode_data(inode);
37557+ get_exclusive_access(uf_info);
37558+
37559+ /*
37560+ * it may happen that another process is doing tail conversion. Wait
37561+ * until it completes
37562+ */
37563+ while (1) {
37564+ if (reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) {
37565+ drop_exclusive_access(uf_info);
37566+ schedule();
37567+ get_exclusive_access(uf_info);
37568+ continue;
37569+ }
37570+ break;
37571+ }
37572+
37573+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37574+ /*
37575+ * other process completed the conversion
37576+ */
37577+ drop_exclusive_access(uf_info);
37578+ reiser4_exit_context(ctx);
37579+ return 0;
37580+ }
37581+
37582+ /*
37583+ * file left in semi converted state after unclean shutdown or another
37584+ * thread is doing conversion and dropped exclusive access which doing
37585+ * balance dirty pages. Complete the conversion
37586+ */
37587+ result = find_first_item(inode);
37588+ if (result == EXTENT_POINTER_ID)
37589+ /*
37590+ * first item is extent, therefore there was incomplete
37591+ * tail2extent conversion. Complete it
37592+ */
37593+ result = tail2extent(unix_file_inode_data(inode));
37594+ else if (result == FORMATTING_ID)
37595+ /*
37596+ * first item is formatting item, therefore there was
37597+ * incomplete extent2tail conversion. Complete it
37598+ */
37599+ result = extent2tail(unix_file_inode_data(inode));
37600+ else
37601+ result = -EIO;
37602+
37603+ assert("vs-1712",
37604+ ergo(result == 0,
37605+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
37606+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
37607+ drop_exclusive_access(uf_info);
37608+ reiser4_exit_context(ctx);
37609+ return result;
37610+}
37611+
37612+#define NEITHER_OBTAINED 0
37613+#define EA_OBTAINED 1
37614+#define NEA_OBTAINED 2
37615+
37616+static void drop_access(unix_file_info_t *uf_info)
37617+{
37618+ if (uf_info->exclusive_use)
37619+ drop_exclusive_access(uf_info);
37620+ else
37621+ drop_nonexclusive_access(uf_info);
37622+}
37623+
37624+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37625+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
37626+
37627+/**
37628+ * write_unix_file - write of struct file_operations
37629+ * @file: file to write to
37630+ * @buf: address of user-space buffer
37631+ * @write_amount: number of bytes to write
37632+ * @off: position in file to write to
37633+ *
37634+ * This is implementation of vfs's write method of struct file_operations for
37635+ * unix file plugin.
37636+ */
37637+ssize_t write_unix_file(struct file *file, const char __user *buf,
37638+ size_t count, loff_t *pos)
37639+{
37640+ int result;
37641+ reiser4_context *ctx;
37642+ struct inode *inode;
37643+ unix_file_info_t *uf_info;
37644+ ssize_t written;
37645+ int try_free_space;
37646+ int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37647+ size_t left;
37648+ ssize_t (*write_op)(struct file *, const char __user *, size_t,
37649+ loff_t *pos);
37650+ int ea;
37651+ loff_t new_size;
37652+
37653+ inode = file->f_dentry->d_inode;
37654+ ctx = reiser4_init_context(inode->i_sb);
37655+ if (IS_ERR(ctx))
37656+ return PTR_ERR(ctx);
37657+
37658+ mutex_lock(&inode->i_mutex);
37659+
37660+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37661+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
37662+
37663+ /* check amount of bytes to write and writing position */
37664+ result = generic_write_checks(file, pos, &count, 0);
37665+ if (result) {
37666+ mutex_unlock(&inode->i_mutex);
37667+ context_set_commit_async(ctx);
37668+ reiser4_exit_context(ctx);
37669+ return result;
37670+ }
37671+
37672+ result = remove_suid(file->f_dentry);
37673+ if (result) {
37674+ mutex_unlock(&inode->i_mutex);
37675+ context_set_commit_async(ctx);
37676+ reiser4_exit_context(ctx);
37677+ return result;
37678+ }
37679+ /* remove_suid might create a transaction */
37680+ reiser4_txn_restart(ctx);
37681+
37682+ uf_info = unix_file_inode_data(inode);
37683+
37684+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
37685+ written = 0;
37686+ try_free_space = 0;
37687+ left = count;
37688+ ea = NEITHER_OBTAINED;
37689+
37690+ new_size = i_size_read(inode);
37691+ if (*pos + count > new_size)
37692+ new_size = *pos + count;
37693+
37694+ while (left) {
37695+ if (left < to_write)
37696+ to_write = left;
37697+
37698+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37699+ get_exclusive_access(uf_info);
37700+ ea = EA_OBTAINED;
37701+ if (uf_info->container != UF_CONTAINER_EMPTY) {
37702+ /* file is made not empty by another process */
37703+ drop_exclusive_access(uf_info);
37704+ ea = NEITHER_OBTAINED;
37705+ continue;
37706+ }
37707+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37708+ /*
37709+ * get exclusive access directly just to not have to
37710+ * re-obtain it if file will appear empty
37711+ */
37712+ get_exclusive_access(uf_info);
37713+ ea = EA_OBTAINED;
37714+ result = find_file_state(inode, uf_info);
37715+ if (result) {
37716+ drop_exclusive_access(uf_info);
37717+ ea = NEITHER_OBTAINED;
37718+ break;
37719+ }
37720+ } else {
37721+ get_nonexclusive_access(uf_info);
37722+ ea = NEA_OBTAINED;
37723+ }
37724+
37725+ /* either EA or NEA is obtained. Choose item write method */
37726+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
37727+ /* file is built of extent items */
37728+ write_op = reiser4_write_extent;
37729+ } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37730+ /* file is empty */
37731+ if (should_have_notail(uf_info, new_size))
37732+ write_op = reiser4_write_extent;
37733+ else
37734+ write_op = reiser4_write_tail;
37735+ } else {
37736+ /* file is built of tail items */
37737+ if (should_have_notail(uf_info, new_size)) {
37738+ if (ea == NEA_OBTAINED) {
37739+ drop_nonexclusive_access(uf_info);
37740+ get_exclusive_access(uf_info);
37741+ ea = EA_OBTAINED;
37742+ }
37743+ if (uf_info->container == UF_CONTAINER_TAILS) {
37744+ /*
37745+ * if file is being convered by another
37746+ * process - wait until it completes
37747+ */
37748+ while (1) {
37749+ if (reiser4_inode_get_flag(inode,
37750+ REISER4_PART_IN_CONV)) {
37751+ drop_exclusive_access(uf_info);
37752+ schedule();
37753+ get_exclusive_access(uf_info);
37754+ continue;
37755+ }
37756+ break;
37757+ }
37758+ if (uf_info->container == UF_CONTAINER_TAILS) {
37759+ result = tail2extent(uf_info);
37760+ if (result)
37761+ break;
37762+ }
37763+ }
37764+ drop_exclusive_access(uf_info);
37765+ ea = NEITHER_OBTAINED;
37766+ continue;
37767+ }
37768+ write_op = reiser4_write_tail;
37769+ }
37770+
37771+ written = write_op(file, buf, to_write, pos);
37772+ if (written == -ENOSPC && try_free_space) {
37773+ drop_access(uf_info);
37774+ txnmgr_force_commit_all(inode->i_sb, 0);
37775+ try_free_space = 0;
37776+ continue;
37777+ }
37778+ if (written < 0) {
37779+ drop_access(uf_info);
37780+ result = written;
37781+ break;
37782+ }
37783+ /* something is written. */
37784+ if (uf_info->container == UF_CONTAINER_EMPTY) {
37785+ assert("", ea == EA_OBTAINED);
37786+ uf_info->container =
37787+ (write_op == reiser4_write_extent) ?
37788+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37789+ } else {
37790+ assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
37791+ write_op == reiser4_write_extent));
37792+ assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
37793+ write_op == reiser4_write_tail));
37794+ }
37795+ if (*pos + written > inode->i_size)
37796+ INODE_SET_FIELD(inode, i_size, *pos + written);
37797+ file_update_time(file);
37798+ result = reiser4_update_sd(inode);
37799+ if (result) {
37800+ mutex_unlock(&inode->i_mutex);
37801+ current->backing_dev_info = NULL;
37802+ drop_access(uf_info);
37803+ context_set_commit_async(ctx);
37804+ reiser4_exit_context(ctx);
37805+ return result;
37806+ }
37807+ drop_access(uf_info);
37808+ ea = NEITHER_OBTAINED;
37809+ reiser4_txn_restart(ctx);
37810+ current->journal_info = NULL;
37811+ /*
37812+ * tell VM how many pages were dirtied. Maybe number of pages
37813+ * which were dirty already should not be counted
37814+ */
37815+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
37816+ (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
37817+ current->journal_info = ctx;
37818+
37819+ left -= written;
37820+ buf += written;
37821+ *pos += written;
37822+ }
37823+
37824+ mutex_unlock(&inode->i_mutex);
37825+
37826+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37827+ reiser4_txn_restart_current();
37828+ grab_space_enable();
37829+ result = sync_unix_file(file, file->f_dentry,
37830+ 0 /* data and stat data */ );
37831+ if (result)
37832+ warning("reiser4-7", "failed to sync file %llu",
37833+ (unsigned long long)get_inode_oid(inode));
37834+ }
37835+
37836+ current->backing_dev_info = NULL;
37837+
37838+ reiser4_exit_context(ctx);
37839+
37840+ /*
37841+ * return number of written bytes or error code if nothing is
37842+ * written. Note, that it does not work correctly in case when
37843+ * sync_unix_file returns error
37844+ */
37845+ return (count - left) ? (count - left) : result;
37846+}
37847+
37848+/**
37849+ * release_unix_file - release of struct file_operations
37850+ * @inode: inode of released file
37851+ * @file: file to release
37852+ *
37853+ * Implementation of release method of struct file_operations for unix file
37854+ * plugin. If last reference to indode is released - convert all extent items
37855+ * into tail items if necessary. Frees reiser4 specific file data.
37856+ */
37857+int release_unix_file(struct inode *inode, struct file *file)
37858+{
37859+ reiser4_context *ctx;
37860+ unix_file_info_t *uf_info;
37861+ int result;
37862+ int in_reiser4;
37863+
37864+ in_reiser4 = is_in_reiser4_context();
37865+
37866+ ctx = reiser4_init_context(inode->i_sb);
37867+ if (IS_ERR(ctx))
37868+ return PTR_ERR(ctx);
37869+
37870+ result = 0;
37871+ if (in_reiser4 == 0) {
37872+ uf_info = unix_file_inode_data(inode);
37873+
37874+ get_exclusive_access(uf_info);
37875+ if (atomic_read(&file->f_dentry->d_count) == 1 &&
37876+ uf_info->container == UF_CONTAINER_EXTENTS &&
37877+ !should_have_notail(uf_info, inode->i_size) &&
37878+ !rofs_inode(inode)) {
37879+ result = extent2tail(uf_info);
37880+ if (result != 0) {
37881+ warning("nikita-3233",
37882+ "Failed (%d) to convert in %s (%llu)",
37883+ result, __FUNCTION__,
37884+ (unsigned long long)
37885+ get_inode_oid(inode));
37886+ }
37887+ }
37888+ drop_exclusive_access(uf_info);
37889+ } else {
37890+ /*
37891+ we are within reiser4 context already. How latter is
37892+ possible? Simple:
37893+
37894+ (gdb) bt
37895+ #0 get_exclusive_access ()
37896+ #2 0xc01e56d3 in release_unix_file ()
37897+ #3 0xc01c3643 in reiser4_release ()
37898+ #4 0xc014cae0 in __fput ()
37899+ #5 0xc013ffc3 in remove_vm_struct ()
37900+ #6 0xc0141786 in exit_mmap ()
37901+ #7 0xc0118480 in mmput ()
37902+ #8 0xc0133205 in oom_kill ()
37903+ #9 0xc01332d1 in out_of_memory ()
37904+ #10 0xc013bc1d in try_to_free_pages ()
37905+ #11 0xc013427b in __alloc_pages ()
37906+ #12 0xc013f058 in do_anonymous_page ()
37907+ #13 0xc013f19d in do_no_page ()
37908+ #14 0xc013f60e in handle_mm_fault ()
37909+ #15 0xc01131e5 in do_page_fault ()
37910+ #16 0xc0104935 in error_code ()
37911+ #17 0xc025c0c6 in __copy_to_user_ll ()
37912+ #18 0xc01d496f in reiser4_read_tail ()
37913+ #19 0xc01e4def in read_unix_file ()
37914+ #20 0xc01c3504 in reiser4_read ()
37915+ #21 0xc014bd4f in vfs_read ()
37916+ #22 0xc014bf66 in sys_read ()
37917+ */
37918+ warning("vs-44", "out of memory?");
37919+ }
37920+
37921+ reiser4_free_file_fsdata(file);
37922+
37923+ reiser4_exit_context(ctx);
37924+ return result;
37925+}
37926+
37927+static void set_file_notail(struct inode *inode)
37928+{
37929+ reiser4_inode *state;
37930+ formatting_plugin *tplug;
37931+
37932+ state = reiser4_inode_data(inode);
37933+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
37934+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
37935+}
37936+
37937+/* if file is built of tails - convert it to extents */
37938+static int unpack(struct file *filp, struct inode *inode, int forever)
37939+{
37940+ int result = 0;
37941+ unix_file_info_t *uf_info;
37942+
37943+ uf_info = unix_file_inode_data(inode);
37944+ assert("vs-1628", ea_obtained(uf_info));
37945+
37946+ result = find_file_state(inode, uf_info);
37947+ if (result)
37948+ return result;
37949+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
37950+
37951+ if (uf_info->container == UF_CONTAINER_TAILS) {
37952+ /*
37953+ * if file is being convered by another process - wait until it
37954+ * completes
37955+ */
37956+ while (1) {
37957+ if (reiser4_inode_get_flag(inode,
37958+ REISER4_PART_IN_CONV)) {
37959+ drop_exclusive_access(uf_info);
37960+ schedule();
37961+ get_exclusive_access(uf_info);
37962+ continue;
37963+ }
37964+ break;
37965+ }
37966+ if (uf_info->container == UF_CONTAINER_TAILS) {
37967+ result = tail2extent(uf_info);
37968+ if (result)
37969+ return result;
37970+ }
37971+ }
37972+ if (forever) {
37973+ /* safe new formatting plugin in stat data */
37974+ __u64 tograb;
37975+
37976+ set_file_notail(inode);
37977+
37978+ grab_space_enable();
37979+ tograb = inode_file_plugin(inode)->estimate.update(inode);
37980+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
37981+ result = reiser4_update_sd(inode);
37982+ }
37983+
37984+ return result;
37985+}
37986+
37987+/* implentation of vfs' ioctl method of struct file_operations for unix file
37988+ plugin
37989+*/
37990+int
37991+ioctl_unix_file(struct inode *inode, struct file *filp,
37992+ unsigned int cmd, unsigned long arg UNUSED_ARG)
37993+{
37994+ reiser4_context *ctx;
37995+ int result;
37996+
37997+ ctx = reiser4_init_context(inode->i_sb);
37998+ if (IS_ERR(ctx))
37999+ return PTR_ERR(ctx);
38000+
38001+ switch (cmd) {
38002+ case REISER4_IOC_UNPACK:
38003+ get_exclusive_access(unix_file_inode_data(inode));
38004+ result = unpack(filp, inode, 1 /* forever */ );
38005+ drop_exclusive_access(unix_file_inode_data(inode));
38006+ break;
38007+
38008+ default:
38009+ result = RETERR(-ENOSYS);
38010+ break;
38011+ }
38012+ reiser4_exit_context(ctx);
38013+ return result;
38014+}
38015+
38016+/* implentation of vfs' bmap method of struct address_space_operations for unix
38017+ file plugin
38018+*/
38019+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
38020+{
38021+ reiser4_context *ctx;
38022+ sector_t result;
38023+ reiser4_key key;
38024+ coord_t coord;
38025+ lock_handle lh;
38026+ struct inode *inode;
38027+ item_plugin *iplug;
38028+ sector_t block;
38029+
38030+ inode = mapping->host;
38031+
38032+ ctx = reiser4_init_context(inode->i_sb);
38033+ if (IS_ERR(ctx))
38034+ return PTR_ERR(ctx);
38035+ key_by_inode_and_offset_common(inode,
38036+ (loff_t) lblock * current_blocksize,
38037+ &key);
38038+
38039+ init_lh(&lh);
38040+ result =
38041+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
38042+ if (cbk_errored(result)) {
38043+ done_lh(&lh);
38044+ reiser4_exit_context(ctx);
38045+ return result;
38046+ }
38047+
38048+ result = zload(coord.node);
38049+ if (result) {
38050+ done_lh(&lh);
38051+ reiser4_exit_context(ctx);
38052+ return result;
38053+ }
38054+
38055+ iplug = item_plugin_by_coord(&coord);
38056+ if (iplug->s.file.get_block) {
38057+ result = iplug->s.file.get_block(&coord, lblock, &block);
38058+ if (result == 0)
38059+ result = block;
38060+ } else
38061+ result = RETERR(-EINVAL);
38062+
38063+ zrelse(coord.node);
38064+ done_lh(&lh);
38065+ reiser4_exit_context(ctx);
38066+ return result;
38067+}
38068+
38069+/**
38070+ * flow_by_inode_unix_file - initizlize structure flow
38071+ * @inode: inode of file for which read or write is abou
38072+ * @buf: buffer to perform read to or write from
38073+ * @user: flag showing whether @buf is user space or kernel space
38074+ * @size: size of buffer @buf
38075+ * @off: start offset fro read or write
38076+ * @op: READ or WRITE
38077+ * @flow:
38078+ *
38079+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
38080+ */
38081+int flow_by_inode_unix_file(struct inode *inode,
38082+ const char __user *buf, int user,
38083+ loff_t size, loff_t off,
38084+ rw_op op, flow_t *flow)
38085+{
38086+ assert("nikita-1100", inode != NULL);
38087+
38088+ flow->length = size;
38089+ memcpy(&flow->data, &buf, sizeof(buf));
38090+ flow->user = user;
38091+ flow->op = op;
38092+ assert("nikita-1931", inode_file_plugin(inode) != NULL);
38093+ assert("nikita-1932",
38094+ inode_file_plugin(inode)->key_by_inode ==
38095+ key_by_inode_and_offset_common);
38096+ /* calculate key of write position and insert it into flow->key */
38097+ return key_by_inode_and_offset_common(inode, off, &flow->key);
38098+}
38099+
38100+/* plugin->u.file.set_plug_in_sd = NULL
38101+ plugin->u.file.set_plug_in_inode = NULL
38102+ plugin->u.file.create_blank_sd = NULL */
38103+/* plugin->u.file.delete */
38104+/*
38105+ plugin->u.file.add_link = reiser4_add_link_common
38106+ plugin->u.file.rem_link = NULL */
38107+
38108+/* plugin->u.file.owns_item
38109+ this is common_file_owns_item with assertion */
38110+/* Audited by: green(2002.06.15) */
38111+int
38112+owns_item_unix_file(const struct inode *inode /* object to check against */ ,
38113+ const coord_t * coord /* coord to check */ )
38114+{
38115+ int result;
38116+
38117+ result = owns_item_common(inode, coord);
38118+ if (!result)
38119+ return 0;
38120+ if (!plugin_of_group(item_plugin_by_coord(coord),
38121+ UNIX_FILE_METADATA_ITEM_TYPE))
38122+ return 0;
38123+ assert("vs-547",
38124+ item_id_by_coord(coord) == EXTENT_POINTER_ID ||
38125+ item_id_by_coord(coord) == FORMATTING_ID);
38126+ return 1;
38127+}
38128+
38129+static int setattr_truncate(struct inode *inode, struct iattr *attr)
38130+{
38131+ int result;
38132+ int s_result;
38133+ loff_t old_size;
38134+ reiser4_tree *tree;
38135+
38136+ inode_check_scale(inode, inode->i_size, attr->ia_size);
38137+
38138+ old_size = inode->i_size;
38139+ tree = reiser4_tree_by_inode(inode);
38140+
38141+ result = safe_link_grab(tree, BA_CAN_COMMIT);
38142+ if (result == 0)
38143+ result = safe_link_add(inode, SAFE_TRUNCATE);
38144+ if (result == 0)
38145+ result = truncate_file_body(inode, attr->ia_size);
38146+ if (result)
38147+ warning("vs-1588", "truncate_file failed: oid %lli, "
38148+ "old size %lld, new size %lld, retval %d",
38149+ (unsigned long long)get_inode_oid(inode),
38150+ old_size, attr->ia_size, result);
38151+
38152+ s_result = safe_link_grab(tree, BA_CAN_COMMIT);
38153+ if (s_result == 0)
38154+ s_result =
38155+ safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
38156+ if (s_result != 0) {
38157+ warning("nikita-3417", "Cannot kill safelink %lli: %i",
38158+ (unsigned long long)get_inode_oid(inode), s_result);
38159+ }
38160+ safe_link_release(tree);
38161+ return result;
38162+}
38163+
38164+/* plugin->u.file.setattr method */
38165+/* This calls inode_setattr and if truncate is in effect it also takes
38166+ exclusive inode access to avoid races */
38167+int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
38168+ struct iattr *attr /* change description */ )
38169+{
38170+ int result;
38171+
38172+ if (attr->ia_valid & ATTR_SIZE) {
38173+ reiser4_context *ctx;
38174+ unix_file_info_t *uf_info;
38175+
38176+ /* truncate does reservation itself and requires exclusive
38177+ access obtained */
38178+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
38179+ if (IS_ERR(ctx))
38180+ return PTR_ERR(ctx);
38181+
38182+ uf_info = unix_file_inode_data(dentry->d_inode);
38183+ get_exclusive_access(uf_info);
38184+ result = setattr_truncate(dentry->d_inode, attr);
38185+ drop_exclusive_access(uf_info);
38186+ context_set_commit_async(ctx);
38187+ reiser4_exit_context(ctx);
38188+ } else
38189+ result = reiser4_setattr_common(dentry, attr);
38190+
38191+ return result;
38192+}
38193+
38194+/* plugin->u.file.init_inode_data */
38195+void
38196+init_inode_data_unix_file(struct inode *inode,
38197+ reiser4_object_create_data * crd, int create)
38198+{
38199+ unix_file_info_t *data;
38200+
38201+ data = unix_file_inode_data(inode);
38202+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
38203+ init_rwsem(&data->latch);
38204+ data->tplug = inode_formatting_plugin(inode);
38205+ data->exclusive_use = 0;
38206+
38207+#if REISER4_DEBUG
38208+ data->ea_owner = NULL;
38209+ atomic_set(&data->nr_neas, 0);
38210+#endif
38211+ init_inode_ordering(inode, crd, create);
38212+}
38213+
38214+/**
38215+ * delete_object_unix_file - delete_object of file_plugin
38216+ * @inode: inode to be deleted
38217+ *
38218+ * Truncates file to length 0, removes stat data and safe link.
38219+ */
38220+int delete_object_unix_file(struct inode *inode)
38221+{
38222+ unix_file_info_t *uf_info;
38223+ int result;
38224+
38225+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38226+ return 0;
38227+
38228+ /* truncate file body first */
38229+ uf_info = unix_file_inode_data(inode);
38230+ get_exclusive_access(uf_info);
38231+ result = truncate_file_body(inode, 0 /* size */ );
38232+ drop_exclusive_access(uf_info);
38233+
38234+ if (result)
38235+ warning("", "failed to truncate file (%llu) on removal: %d",
38236+ get_inode_oid(inode), result);
38237+
38238+ /* remove stat data and safe link */
38239+ return reiser4_delete_object_common(inode);
38240+}
38241+
38242+/**
38243+ * sendfile_unix_file - sendfile of struct file_operations
38244+ * @file: file to be sent
38245+ * @ppos: position to start from
38246+ * @count: number of bytes to send
38247+ * @actor: function to copy data
38248+ * @target: where to copy read data
38249+ *
38250+ * Reads @count bytes from @file and calls @actor for every page read. This is
38251+ * needed for loop back devices support.
38252+ */
38253+ssize_t
38254+sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
38255+ read_actor_t actor, void *target)
38256+{
38257+ reiser4_context *ctx;
38258+ ssize_t result;
38259+ struct inode *inode;
38260+ unix_file_info_t *uf_info;
38261+
38262+ inode = file->f_dentry->d_inode;
38263+ ctx = reiser4_init_context(inode->i_sb);
38264+ if (IS_ERR(ctx))
38265+ return PTR_ERR(ctx);
38266+
38267+ /*
38268+ * generic_file_sendfile may want to call update_atime. Grab space for
38269+ * stat data update
38270+ */
38271+ result = reiser4_grab_space(estimate_update_common(inode),
38272+ BA_CAN_COMMIT);
38273+ if (result)
38274+ goto error;
38275+ mutex_lock(&inode->i_mutex);
38276+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
38277+ mutex_unlock(&inode->i_mutex);
38278+
38279+ uf_info = unix_file_inode_data(inode);
38280+ get_nonexclusive_access(uf_info);
38281+ result = generic_file_sendfile(file, ppos, count, actor, target);
38282+ drop_nonexclusive_access(uf_info);
38283+ error:
38284+ reiser4_exit_context(ctx);
38285+ return result;
38286+}
38287+
38288+int
38289+prepare_write_unix_file(struct file *file, struct page *page,
38290+ unsigned from, unsigned to)
38291+{
38292+ reiser4_context *ctx;
38293+ unix_file_info_t *uf_info;
38294+ int ret;
38295+
38296+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
38297+ if (IS_ERR(ctx))
38298+ return PTR_ERR(ctx);
38299+
38300+ uf_info = unix_file_inode_data(file->f_dentry->d_inode);
38301+ get_exclusive_access(uf_info);
38302+ ret = find_file_state(file->f_dentry->d_inode, uf_info);
38303+ if (ret == 0) {
38304+ if (uf_info->container == UF_CONTAINER_TAILS)
38305+ ret = -EINVAL;
38306+ else
38307+ ret = do_prepare_write(file, page, from, to);
38308+ }
38309+ drop_exclusive_access(uf_info);
38310+
38311+ /* don't commit transaction under inode semaphore */
38312+ context_set_commit_async(ctx);
38313+ reiser4_exit_context(ctx);
38314+ return ret;
38315+}
38316+
38317+/*
38318+ * Local variables:
38319+ * c-indentation-style: "K&R"
38320+ * mode-name: "LC"
38321+ * c-basic-offset: 8
38322+ * tab-width: 8
38323+ * fill-column: 79
38324+ * scroll-step: 1
38325+ * End:
38326+ */
38327diff --git a/fs/reiser4/plugin/file/file.h b/fs/reiser4/plugin/file/file.h
38328new file mode 100644
38329index 0000000..e486a88
38330--- /dev/null
38331+++ b/fs/reiser4/plugin/file/file.h
38332@@ -0,0 +1,272 @@
38333+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38334+ * reiser4/README */
38335+
38336+/* this file contains declarations of methods implementing
38337+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
38338+ and SYMLINK_FILE_PLUGIN_ID) */
38339+
38340+#if !defined( __REISER4_FILE_H__ )
38341+#define __REISER4_FILE_H__
38342+
38343+/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38344+
38345+/* inode operations */
38346+int setattr_unix_file(struct dentry *, struct iattr *);
38347+
38348+/* file operations */
38349+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38350+ loff_t *off);
38351+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38352+ loff_t * off);
38353+int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38354+ unsigned long arg);
38355+int mmap_unix_file(struct file *, struct vm_area_struct *);
38356+int open_unix_file(struct inode *, struct file *);
38357+int release_unix_file(struct inode *, struct file *);
38358+int sync_unix_file(struct file *, struct dentry *, int datasync);
38359+ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38360+ read_actor_t, void *target);
38361+
38362+/* address space operations */
38363+int readpage_unix_file(struct file *, struct page *);
38364+int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
38365+int writepages_unix_file(struct address_space *, struct writeback_control *);
38366+int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38367+ unsigned to);
38368+int commit_write_unix_file(struct file *, struct page *, unsigned from,
38369+ unsigned to);
38370+sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38371+
38372+/* file plugin operations */
38373+int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38374+ int user, loff_t, loff_t, rw_op, flow_t *);
38375+int owns_item_unix_file(const struct inode *, const coord_t *);
38376+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38377+ int create);
38378+int delete_object_unix_file(struct inode *);
38379+
38380+/*
38381+ * all the write into unix file is performed by item write method. Write method
38382+ * of unix file plugin only decides which item plugin (extent or tail) and in
38383+ * which mode (one from the enum below) to call
38384+ */
38385+typedef enum {
38386+ FIRST_ITEM = 1,
38387+ APPEND_ITEM = 2,
38388+ OVERWRITE_ITEM = 3
38389+} write_mode_t;
38390+
38391+/* unix file may be in one the following states */
38392+typedef enum {
38393+ UF_CONTAINER_UNKNOWN = 0,
38394+ UF_CONTAINER_TAILS = 1,
38395+ UF_CONTAINER_EXTENTS = 2,
38396+ UF_CONTAINER_EMPTY = 3
38397+} file_container_t;
38398+
38399+struct formatting_plugin;
38400+struct inode;
38401+
38402+/* unix file plugin specific part of reiser4 inode */
38403+typedef struct unix_file_info {
38404+ /*
38405+ * this read-write lock protects file containerization change. Accesses
38406+ * which do not change file containerization (see file_container_t)
38407+ * (read, readpage, writepage, write (until tail conversion is
38408+ * involved)) take read-lock. Accesses which modify file
38409+ * containerization (truncate, conversion from tail to extent and back)
38410+ * take write-lock.
38411+ */
38412+ struct rw_semaphore latch;
38413+ /* this enum specifies which items are used to build the file */
38414+ file_container_t container;
38415+ /*
38416+ * plugin which controls when file is to be converted to extents and
38417+ * back to tail
38418+ */
38419+ struct formatting_plugin *tplug;
38420+ /* if this is set, file is in exclusive use */
38421+ int exclusive_use;
38422+#if REISER4_DEBUG
38423+ /* pointer to task struct of thread owning exclusive access to file */
38424+ void *ea_owner;
38425+ atomic_t nr_neas;
38426+ void *last_reader;
38427+#endif
38428+} unix_file_info_t;
38429+
38430+struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38431+void get_exclusive_access(unix_file_info_t *);
38432+void drop_exclusive_access(unix_file_info_t *);
38433+void get_nonexclusive_access(unix_file_info_t *);
38434+void drop_nonexclusive_access(unix_file_info_t *);
38435+int try_to_get_nonexclusive_access(unix_file_info_t *);
38436+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38437+ struct inode *);
38438+int find_file_item_nohint(coord_t *, lock_handle *,
38439+ const reiser4_key *, znode_lock_mode,
38440+ struct inode *);
38441+
38442+int load_file_hint(struct file *, hint_t *);
38443+void save_file_hint(struct file *, const hint_t *);
38444+
38445+#include "../item/extent.h"
38446+#include "../item/tail.h"
38447+#include "../item/ctail.h"
38448+
38449+struct uf_coord {
38450+ coord_t coord;
38451+ lock_handle *lh;
38452+ int valid;
38453+ union {
38454+ extent_coord_extension_t extent;
38455+ tail_coord_extension_t tail;
38456+ ctail_coord_extension_t ctail;
38457+ } extension;
38458+};
38459+
38460+#include "../../forward.h"
38461+#include "../../seal.h"
38462+#include "../../lock.h"
38463+
38464+/*
38465+ * This structure is used to speed up file operations (reads and writes). A
38466+ * hint is a suggestion about where a key resolved to last time. A seal
38467+ * indicates whether a node has been modified since a hint was last recorded.
38468+ * You check the seal, and if the seal is still valid, you can use the hint
38469+ * without traversing the tree again.
38470+ */
38471+struct hint {
38472+ seal_t seal; /* a seal over last file item accessed */
38473+ uf_coord_t ext_coord;
38474+ loff_t offset;
38475+ znode_lock_mode mode;
38476+ lock_handle lh;
38477+};
38478+
38479+static inline int hint_is_valid(hint_t * hint)
38480+{
38481+ return hint->ext_coord.valid;
38482+}
38483+
38484+static inline void hint_set_valid(hint_t * hint)
38485+{
38486+ hint->ext_coord.valid = 1;
38487+}
38488+
38489+static inline void hint_clr_valid(hint_t * hint)
38490+{
38491+ hint->ext_coord.valid = 0;
38492+}
38493+
38494+int load_file_hint(struct file *, hint_t *);
38495+void save_file_hint(struct file *, const hint_t *);
38496+void hint_init_zero(hint_t *);
38497+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38498+int hint_is_set(const hint_t *);
38499+void reiser4_unset_hint(hint_t *);
38500+
38501+int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
38502+int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38503+ loff_t cur_size, int (*update_actor) (struct inode *,
38504+ reiser4_key *, int));
38505+#if REISER4_DEBUG
38506+
38507+/* return 1 is exclusive access is obtained, 0 - otherwise */
38508+static inline int ea_obtained(unix_file_info_t * uf_info)
38509+{
38510+ int ret;
38511+
38512+ ret = down_read_trylock(&uf_info->latch);
38513+ if (ret)
38514+ up_read(&uf_info->latch);
38515+ return !ret;
38516+}
38517+
38518+#endif
38519+
38520+/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38521+int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
38522+ reiser4_object_create_data *);
38523+void destroy_inode_symlink(struct inode *);
38524+
38525+/* declarations of functions implementing CRYPTCOMPRESS_FILE_PLUGIN_ID
38526+ file plugin */
38527+
38528+/* inode operations */
38529+int setattr_cryptcompress(struct dentry *, struct iattr *);
38530+int prot_setattr_cryptcompress(struct dentry *, struct iattr *);
38531+
38532+/* file operations */
38533+ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38534+ loff_t * off);
38535+ssize_t prot_read_cryptcompress(struct file *, char __user *buf,
38536+ size_t read_amount, loff_t * off);
38537+
38538+ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38539+ loff_t * off, int * conv);
38540+ssize_t prot_write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38541+ loff_t * off);
38542+int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38543+int prot_mmap_cryptcompress(struct file *, struct vm_area_struct *);
38544+ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38545+ read_actor_t actor, void *target);
38546+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38547+ read_actor_t actor, void *target);
38548+
38549+int release_cryptcompress(struct inode *, struct file *);
38550+int prot_release_cryptcompress(struct inode *, struct file *);
38551+
38552+/* address space operations */
38553+extern int readpage_cryptcompress(struct file *, struct page *);
38554+extern int writepages_cryptcompress(struct address_space *,
38555+ struct writeback_control *);
38556+/* file plugin operations */
38557+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38558+ int user, loff_t, loff_t, rw_op, flow_t *);
38559+int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38560+int create_cryptcompress(struct inode *, struct inode *,
38561+ reiser4_object_create_data *);
38562+int delete_object_cryptcompress(struct inode *);
38563+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38564+ int create);
38565+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38566+ const reiser4_key * to_key,
38567+ reiser4_key * smallest_removed,
38568+ struct inode *object, int truncate,
38569+ int *progress);
38570+void destroy_inode_cryptcompress(struct inode *);
38571+int open_object_cryptcompress(struct inode * inode, struct file * file);
38572+
38573+extern reiser4_plugin_ops cryptcompress_plugin_ops;
38574+
38575+#define WRITE_GRANULARITY 32
38576+
38577+int tail2extent(unix_file_info_t *);
38578+int extent2tail(unix_file_info_t *);
38579+
38580+int goto_right_neighbor(coord_t *, lock_handle *);
38581+int find_or_create_extent(struct page *);
38582+int equal_to_ldk(znode *, const reiser4_key *);
38583+
38584+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
38585+
38586+static inline int cbk_errored(int cbk_result)
38587+{
38588+ return (cbk_result != CBK_COORD_NOTFOUND
38589+ && cbk_result != CBK_COORD_FOUND);
38590+}
38591+
38592+/* __REISER4_FILE_H__ */
38593+#endif
38594+
38595+/*
38596+ * Local variables:
38597+ * c-indentation-style: "K&R"
38598+ * mode-name: "LC"
38599+ * c-basic-offset: 8
38600+ * tab-width: 8
38601+ * fill-column: 79
38602+ * scroll-step: 1
38603+ * End:
38604+*/
38605diff --git a/fs/reiser4/plugin/file/file_conversion.c b/fs/reiser4/plugin/file/file_conversion.c
38606new file mode 100644
38607index 0000000..2e07b66
38608--- /dev/null
38609+++ b/fs/reiser4/plugin/file/file_conversion.c
38610@@ -0,0 +1,594 @@
38611+/* Copyright 2001, 2002, 2003 by Hans Reiser,
38612+ licensing governed by reiser4/README */
38613+
38614+/* This file contains hooks that convert (*) cryptcompress files to unix-files,
38615+ and a set of protected (**) methods of a cryptcompress file plugin to perform
38616+ such conversion.
38617+
38618+(*)
38619+ The conversion is performed for incompressible files to reduce cpu and memory
38620+ usage. If first logical cluster (64K by default) of a file is incompressible,
38621+ then we make a decision that the whole file is incompressible.
38622+ The conversion can be enabled via installing a special compression mode
38623+ plugin (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for
38624+ details).
38625+
38626+(**)
38627+ The protection means serialization of critical sections (readers and writers
38628+ of @pset->file)
38629+*/
38630+
38631+#include "../../inode.h"
38632+#include "../cluster.h"
38633+#include "file.h"
38634+
38635+#define conversion_enabled(inode) \
38636+ (inode_compression_mode_plugin(inode) == \
38637+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
38638+
38639+
38640+/* Located sections (readers and writers of @pset->file) are not
38641+ permanently critical: cryptcompress file can be converted only
38642+ if the conversion is enabled (see the macro above). And we don't
38643+ convert unix files at all.
38644+ The following helper macro is a sanity check to decide if we
38645+ need to protect a located section.
38646+*/
38647+#define should_protect(inode) \
38648+ (inode_file_plugin(inode) == \
38649+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
38650+ conversion_enabled(inode))
38651+
38652+/* All protected methods have prefix "prot" in their names.
38653+ It is convenient to construct them by usual (unprotected) ones
38654+ using the following common macros:
38655+*/
38656+
38657+/* Macro for passive protection.
38658+ method_cryptcompress contains only readers */
38659+#define PROT_PASSIVE(type, method, args) \
38660+({ \
38661+ type _result; \
38662+ struct rw_semaphore * guard = \
38663+ &reiser4_inode_data(inode)->conv_sem; \
38664+ \
38665+ if (should_protect(inode)) { \
38666+ down_read(guard); \
38667+ if (!should_protect(inode)) \
38668+ up_read(guard); \
38669+ } \
38670+ if (inode_file_plugin(inode) == \
38671+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38672+ _result = method ## _unix_file args; \
38673+ else \
38674+ _result = method ## _cryptcompress args; \
38675+ if (should_protect(inode)) \
38676+ up_read(guard); \
38677+ _result; \
38678+})
38679+
38680+#define PROT_PASSIVE_VOID(method, args) \
38681+({ \
38682+ struct rw_semaphore * guard = \
38683+ &reiser4_inode_data(inode)->conv_sem; \
38684+ \
38685+ if (should_protect(inode)) { \
38686+ down_read(guard); \
38687+ if (!should_protect(inode)) \
38688+ up_read(guard); \
38689+ } \
38690+ if (inode_file_plugin(inode) == \
38691+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38692+ method ## _unix_file args; \
38693+ else \
38694+ method ## _cryptcompress args; \
38695+ if (should_protect(inode)) \
38696+ up_read(guard); \
38697+})
38698+
38699+/* Macro for active protection.
38700+ active_expr contains readers and writers; after its
38701+ evaluation conversion should be disabled */
38702+#define PROT_ACTIVE(type, method, args, active_expr) \
38703+({ \
38704+ type _result = 0; \
38705+ struct rw_semaphore * guard = \
38706+ &reiser4_inode_data(inode)->conv_sem; \
38707+ reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
38708+ if (IS_ERR(ctx)) \
38709+ return PTR_ERR(ctx); \
38710+ \
38711+ if (should_protect(inode)) { \
38712+ down_write(guard); \
38713+ if (should_protect(inode)) \
38714+ _result = active_expr; \
38715+ up_write(guard); \
38716+ } \
38717+ if (_result == 0) { \
38718+ if (inode_file_plugin(inode) == \
38719+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38720+ _result = method ## _unix_file args; \
38721+ else \
38722+ _result = method ## _cryptcompress args; \
38723+ } \
38724+ reiser4_exit_context(ctx); \
38725+ _result; \
38726+})
38727+
38728+/* Pass management to the unix-file plugin with "notail" policy */
38729+static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
38730+{
38731+ int result;
38732+ reiser4_inode *info;
38733+ unix_file_info_t * uf;
38734+ info = reiser4_inode_data(inode);
38735+
38736+ result = aset_set_unsafe(&info->pset,
38737+ PSET_FILE,
38738+ (reiser4_plugin *)
38739+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
38740+ if (result)
38741+ return result;
38742+ result = aset_set_unsafe(&info->pset,
38743+ PSET_FORMATTING,
38744+ (reiser4_plugin *)
38745+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
38746+ if (result)
38747+ return result;
38748+ /* get rid of non-standard plugins */
38749+ info->plugin_mask &= ~cryptcompress_mask;
38750+ /* get rid of plugin stat-data extension */
38751+ info->extmask &= ~(1 << PLUGIN_STAT);
38752+
38753+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
38754+
38755+ /* FIXME use init_inode_data_unix_file() instead,
38756+ but avoid init_inode_ordering() */
38757+ /* Init unix-file specific part of inode */
38758+ uf = unix_file_inode_data(inode);
38759+ uf->container = UF_CONTAINER_UNKNOWN;
38760+ init_rwsem(&uf->latch);
38761+ uf->tplug = inode_formatting_plugin(inode);
38762+ uf->exclusive_use = 0;
38763+#if REISER4_DEBUG
38764+ uf->ea_owner = NULL;
38765+ atomic_set(&uf->nr_neas, 0);
38766+#endif
38767+ inode->i_op =
38768+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->inode_ops;
38769+ inode->i_fop =
38770+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->file_ops;
38771+ inode->i_mapping->a_ops =
38772+ &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->as_ops;
38773+ file->f_op = inode->i_fop;
38774+ return 0;
38775+}
38776+
38777+#if REISER4_DEBUG
38778+static int disabled_conversion_inode_ok(struct inode * inode)
38779+{
38780+ __u64 extmask = reiser4_inode_data(inode)->extmask;
38781+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
38782+
38783+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
38784+ (extmask & (1 << UNIX_STAT)) &&
38785+ (extmask & (1 << LARGE_TIMES_STAT)) &&
38786+ (extmask & (1 << PLUGIN_STAT)) &&
38787+ (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
38788+}
38789+#endif
38790+
38791+/* Assign another mode that will control
38792+ compression at flush time only */
38793+static int disable_conversion_no_update_sd(struct inode * inode)
38794+{
38795+ int result;
38796+ result =
38797+ force_plugin_pset(inode,
38798+ PSET_COMPRESSION_MODE,
38799+ (reiser4_plugin *)compression_mode_plugin_by_id
38800+ (LATTD_COMPRESSION_MODE_ID));
38801+ assert("edward-1500",
38802+ ergo(!result, disabled_conversion_inode_ok(inode)));
38803+ return result;
38804+}
38805+
38806+/* Disable future attempts to check/convert. This function is called by
38807+ conversion hooks. */
38808+static int disable_conversion(struct inode * inode)
38809+{
38810+ return disable_conversion_no_update_sd(inode);
38811+}
38812+
38813+static int check_position(struct inode * inode,
38814+ loff_t pos /* initial position in the file */,
38815+ reiser4_cluster_t * clust,
38816+ int * check_compress)
38817+{
38818+ assert("edward-1505", conversion_enabled(inode));
38819+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
38820+ /* if file size is more then cluster size, then compressible
38821+ status must be figured out (i.e. compression was disabled,
38822+ or file plugin was converted to unix_file) */
38823+
38824+ if (pos > inode->i_size)
38825+ /* first logical cluster will contain a (partial) hole */
38826+ return disable_conversion(inode);
38827+ if (inode->i_size == inode_cluster_size(inode))
38828+ *check_compress = 1;
38829+ return 0;
38830+}
38831+
38832+static void start_check_compressibility(struct inode * inode,
38833+ reiser4_cluster_t * clust,
38834+ hint_t * hint)
38835+{
38836+ assert("edward-1507", clust->index == 1);
38837+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
38838+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
38839+
38840+ hint_init_zero(hint);
38841+ clust->hint = hint;
38842+ clust->index --;
38843+ clust->nr_pages = count_to_nrpages(fsize_to_count(clust, inode));
38844+
38845+ /* first logical cluster (of index #0) must be complete */
38846+ assert("edward-1510", fsize_to_count(clust, inode) ==
38847+ inode_cluster_size(inode));
38848+}
38849+
38850+static void finish_check_compressibility(struct inode * inode,
38851+ reiser4_cluster_t * clust,
38852+ hint_t * hint)
38853+{
38854+ reiser4_unset_hint(clust->hint);
38855+ clust->hint = hint;
38856+ clust->index ++;
38857+}
38858+
38859+#if REISER4_DEBUG
38860+static int prepped_dclust_ok(hint_t * hint)
38861+{
38862+ reiser4_key key;
38863+ coord_t * coord = &hint->ext_coord.coord;
38864+
38865+ item_key_by_coord(coord, &key);
38866+ return (item_id_by_coord(coord) == CTAIL_ID &&
38867+ !coord_is_unprepped_ctail(coord) &&
38868+ (get_key_offset(&key) + nr_units_ctail(coord) ==
38869+ dclust_get_extension_dsize(hint)));
38870+}
38871+#endif
38872+
38873+#define fifty_persent(size) (size >> 1)
38874+/* evaluation of data compressibility */
38875+#define data_is_compressible(osize, isize) \
38876+ (osize < fifty_persent(isize))
38877+
38878+/* This is called only once per file life.
38879+ Read first logical cluster (of index #0) and estimate its compressibility.
38880+ Save estimation result in @compressible */
38881+static int read_check_compressibility(struct inode * inode,
38882+ reiser4_cluster_t * clust,
38883+ int * compressible)
38884+{
38885+ int i;
38886+ int result;
38887+ __u32 dst_len;
38888+ hint_t tmp_hint;
38889+ hint_t * cur_hint = clust->hint;
38890+
38891+ start_check_compressibility(inode, clust, &tmp_hint);
38892+
38893+ result = grab_cluster_pages(inode, clust);
38894+ if (result)
38895+ return result;
38896+ /* Read page cluster here */
38897+ for (i = 0; i < clust->nr_pages; i++) {
38898+ struct page *page = clust->pages[i];
38899+ lock_page(page);
38900+ result = do_readpage_ctail(inode, clust, page,
38901+ ZNODE_READ_LOCK);
38902+ unlock_page(page);
38903+ if (result)
38904+ goto error;
38905+ }
38906+ tfm_cluster_clr_uptodate(&clust->tc);
38907+
38908+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
38909+
38910+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
38911+ /* length of compressed data is known, no need to compress */
38912+ assert("edward-1511",
38913+ znode_is_write_locked(tmp_hint.ext_coord.coord.node));
38914+ assert("edward-1512",
38915+ WITH_DATA(tmp_hint.ext_coord.coord.node,
38916+ prepped_dclust_ok(&tmp_hint)));
38917+ dst_len = dclust_get_extension_dsize(&tmp_hint);
38918+ }
38919+ else {
38920+ tfm_cluster_t * tc = &clust->tc;
38921+ compression_plugin * cplug = inode_compression_plugin(inode);
38922+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
38923+ if (result)
38924+ goto error;
38925+ for (i = 0; i < clust->nr_pages; i++) {
38926+ char *data;
38927+ lock_page(clust->pages[i]);
38928+ BUG_ON(!PageUptodate(clust->pages[i]));
38929+ data = kmap(clust->pages[i]);
38930+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
38931+ data, PAGE_CACHE_SIZE);
38932+ kunmap(clust->pages[i]);
38933+ unlock_page(clust->pages[i]);
38934+ }
38935+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
38936+ if (result)
38937+ goto error;
38938+ result = grab_coa(tc, cplug);
38939+ if (result)
38940+ goto error;
38941+ tc->len = tc->lsize = fsize_to_count(clust, inode);
38942+ assert("edward-1513", tc->len == inode_cluster_size(inode));
38943+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
38944+ cplug->compress(get_coa(tc, cplug->h.id, tc->act),
38945+ tfm_input_data(clust), tc->len,
38946+ tfm_output_data(clust), &dst_len);
38947+ assert("edward-1514",
38948+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
38949+ }
38950+ finish_check_compressibility(inode, clust, cur_hint);
38951+ *compressible = data_is_compressible(dst_len,
38952+ inode_cluster_size(inode));
38953+ return 0;
38954+ error:
38955+ reiser4_release_cluster_pages(clust);
38956+ return result;
38957+}
38958+
38959+/* Cut disk cluster of index @idx */
38960+static int cut_disk_cluster(struct inode * inode, cloff_t idx)
38961+{
38962+ reiser4_key from, to;
38963+ assert("edward-1515", inode_file_plugin(inode) ==
38964+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
38965+ key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
38966+ to = from;
38967+ set_key_offset(&to,
38968+ get_key_offset(&from) + inode_cluster_size(inode) - 1);
38969+ return reiser4_cut_tree(reiser4_tree_by_inode(inode),
38970+ &from, &to, inode, 0);
38971+}
38972+
38973+static int reserve_cryptcompress2unixfile(struct inode *inode)
38974+{
38975+ reiser4_block_nr unformatted_nodes;
38976+ reiser4_tree *tree;
38977+
38978+ tree = reiser4_tree_by_inode(inode);
38979+
38980+ /* number of unformatted nodes which will be created */
38981+ unformatted_nodes = cluster_nrpages(inode); /* N */
38982+
38983+ /*
38984+ * space required for one iteration of extent->tail conversion:
38985+ *
38986+ * 1. kill ctail items
38987+ *
38988+ * 2. insert N unformatted nodes
38989+ *
38990+ * 3. insert N (worst-case single-block
38991+ * extents) extent units.
38992+ *
38993+ * 4. drilling to the leaf level by coord_by_key()
38994+ *
38995+ * 5. possible update of stat-data
38996+ *
38997+ */
38998+ grab_space_enable();
38999+ return reiser4_grab_space
39000+ (2 * tree->height +
39001+ unformatted_nodes +
39002+ unformatted_nodes * estimate_one_insert_into_item(tree) +
39003+ 1 + estimate_one_insert_item(tree) +
39004+ inode_file_plugin(inode)->estimate.update(inode),
39005+ BA_CAN_COMMIT);
39006+}
39007+
39008+/* clear flag that indicated conversion and update
39009+ stat-data with new (unix-file - specific) info */
39010+static int complete_file_conversion(struct inode *inode)
39011+{
39012+ int result;
39013+
39014+ grab_space_enable();
39015+ result =
39016+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39017+ BA_CAN_COMMIT);
39018+ if (result == 0) {
39019+ reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
39020+ result = reiser4_update_sd(inode);
39021+ }
39022+ if (result)
39023+ warning("edward-1452",
39024+ "Converting %llu to unix-file: update sd failed (%i)",
39025+ (unsigned long long)get_inode_oid(inode), result);
39026+ return 0;
39027+}
39028+
39029+
39030+/* do conversion */
39031+static int cryptcompress2unixfile(struct file *file, struct inode * inode,
39032+ reiser4_cluster_t * clust)
39033+{
39034+ int i;
39035+ int result = 0;
39036+ cryptcompress_info_t *cr_info;
39037+ unix_file_info_t *uf_info;
39038+
39039+ assert("edward-1516", clust->pages[0]->index == 0);
39040+ assert("edward-1517", clust->hint != NULL);
39041+
38942+ /* release all cryptcompress-specific resources */
39043+ cr_info = cryptcompress_inode_data(inode);
39044+ result = reserve_cryptcompress2unixfile(inode);
39045+ if (result)
39046+ goto out;
39047+ reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
39048+ reiser4_unset_hint(clust->hint);
39049+ result = cut_disk_cluster(inode, 0);
39050+ if (result)
39051+ goto out;
38952+ /* captured jnode of cluster and associated resources (pages,
39053+ reserved disk space) were released by ->kill_hook() method
39054+ of the item plugin */
39055+
39056+ result = __cryptcompress2unixfile(file, inode);
39057+ if (result)
39058+ goto out;
39059+ /* At this point file is managed by unix file plugin */
39060+
39061+ uf_info = unix_file_inode_data(inode);
39062+
39063+ assert("edward-1518",
39064+ ergo(jprivate(clust->pages[0]),
39065+ !jnode_is_cluster_page(jprivate(clust->pages[0]))));
39066+ for(i = 0; i < clust->nr_pages; i++) {
39067+ assert("edward-1519", clust->pages[i]);
39068+ assert("edward-1520", PageUptodate(clust->pages[i]));
39069+
39070+ result = find_or_create_extent(clust->pages[i]);
39071+ if (result)
39072+ break;
39073+ }
39074+ if (!result) {
39075+ uf_info->container = UF_CONTAINER_EXTENTS;
39076+ complete_file_conversion(inode);
39077+ }
39078+ out:
39079+ all_grabbed2free();
39080+ if (result)
39081+ warning("edward-1453", "Failed to convert file %llu: %i",
39082+ (unsigned long long)get_inode_oid(inode), result);
39083+ return result;
39084+}
39085+
39086+/* Check, then perform or disable conversion if needed */
39087+int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
39088+ reiser4_cluster_t * clust, int * progress)
39089+{
39090+ int result;
39091+ int check_compress = 0;
39092+ int compressible = 0;
39093+
39094+ if (!conversion_enabled(inode))
39095+ return 0;
39096+ result = check_position(inode, pos, clust, &check_compress);
39097+ if (result || !check_compress)
39098+ return result;
39099+ result = read_check_compressibility(inode, clust, &compressible);
39100+ if (result)
39101+ return result;
39102+
39103+ /* At this point page cluster is grabbed and uptodate */
39104+ if (!compressible) {
39105+ result = cryptcompress2unixfile(file, inode, clust);
39106+ if (result == 0)
39107+ *progress = 1;
39108+ }
39109+ else
39110+ result = disable_conversion(inode);
39111+
39112+ reiser4_release_cluster_pages(clust);
39113+ return result;
39114+}
39115+
39116+static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
39117+{
39118+ return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
39119+}
39120+
39121+/* Protected methods of cryptcompress file plugin constructed
39122+ by the macros above */
39123+
39124+/* Wrappers with active protection for:
39125+ . write_cryptcompress;
39126+ . setattr_cryptcompress;
39127+*/
39128+
39129+ssize_t prot_write_cryptcompress(struct file *file, const char __user *buf,
39130+ size_t count, loff_t *off)
39131+{
39132+ int prot = 0;
39133+ int conv = 0;
39134+ ssize_t written_cr = 0;
39135+ ssize_t written_uf = 0;
39136+ struct inode * inode = file->f_dentry->d_inode;
39137+ struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
39138+
39139+ if (should_protect(inode)) {
39140+ prot = 1;
39141+ down_write(guard);
39142+ }
39143+ written_cr = write_cryptcompress(file, buf, count, off, &conv);
39144+ if (prot)
39145+ up_write(guard);
39146+ if (written_cr < 0)
39147+ return written_cr;
39148+ if (conv)
39149+ written_uf = write_unix_file(file, buf + written_cr,
39150+ count - written_cr, off);
39151+ return written_cr + (written_uf < 0 ? 0 : written_uf);
39152+}
39153+
39154+int prot_setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
39155+{
39156+ struct inode * inode = dentry->d_inode;
39157+ return PROT_ACTIVE(int, setattr, (dentry, attr),
39158+ setattr_conversion_hook(inode, attr));
39159+}
39160+
39161+/* Wrappers with passive protection for:
39162+ . read_cryptcomperess;
39163+ . mmap_cryptcompress;
39164+ . release_cryptcompress;
39165+ . sendfile_cryptcompress;
39166+ . delete_object_cryptcompress.
39167+*/
39168+ssize_t prot_read_cryptcompress(struct file * file, char __user * buf,
39169+ size_t size, loff_t * off)
39170+{
39171+ struct inode * inode = file->f_dentry->d_inode;
39172+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
39173+}
39174+
39175+int prot_mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
39176+{
39177+ struct inode *inode = file->f_dentry->d_inode;
39178+ return PROT_PASSIVE(int, mmap, (file, vma));
39179+}
39180+
39181+int prot_release_cryptcompress(struct inode *inode, struct file *file)
39182+{
39183+ return PROT_PASSIVE(int, release, (inode, file));
39184+}
39185+
39186+ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos,
39187+ size_t count, read_actor_t actor,
39188+ void *target)
39189+{
39190+ struct inode * inode = file->f_dentry->d_inode;
39191+ return PROT_PASSIVE(ssize_t, sendfile,
39192+ (file, ppos, count, actor, target));
39193+}
39194+
39195+/*
39196+ Local variables:
39197+ c-indentation-style: "K&R"
39198+ mode-name: "LC"
39199+ c-basic-offset: 8
39200+ tab-width: 8
39201+ fill-column: 80
39202+ scroll-step: 1
39203+ End:
39204+*/
39205diff --git a/fs/reiser4/plugin/file/invert.c b/fs/reiser4/plugin/file/invert.c
39206new file mode 100644
39207index 0000000..7349878
39208--- /dev/null
39209+++ b/fs/reiser4/plugin/file/invert.c
39210@@ -0,0 +1,493 @@
39211+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39212+
39213+/* Suppose you want to conveniently read and write a large variety of small files conveniently within a single emacs
39214+ buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert
39215+ provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files
39216+ when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it
39217+ to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to
39218+ make that easy for you by providing those delimiters in what you read from it.
39219+
39220+ When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a
39221+ bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
39222+ would create those files. But which files? Well, that must be specified in the body of the invert using a special
39223+ syntax, and that specification is called the invert of the assignment.
39224+
39225+ When written to, an invert performs the assignment command that is written
39226+ to it, and modifies its own body to contain the invert of that
39227+ assignment.
39228+
39229+ In other words, writing to an invert file what you have read from it
39230+ is the identity operation.
39231+
39232+ Malformed assignments cause write errors. Partial writes are not
39233+ supported in v4.0, but will be.
39234+
39235+ Example:
39236+
39237+ If an invert contains:
39238+
39239+ /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
39240+
39241+======================
39242+Each element in this definition should be an invert, and all files
39243+should be called recursively - too. This is bad. If one of the
39244+included files in not a regular or invert file, then we can't read
39245+main file.
39246+
39247+I think to make it is possible easier:
39248+
39249+internal structure of invert file should be like symlink file. But
39250+read and write method should be explitely indicated in i/o operation..
39251+
39252+By default we read and write (if probably) as symlink and if we
39253+specify ..invert at reading time that too we can specify it at write time.
39254+
39255+example:
39256+/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
39257+will create /my_invert_file as invert, and will creat /filenameA and /filenameB with specified body.
39258+
39259+read of /my_invert_file/..invert will be
39260+/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
39261+
39262+but read of /my_invert_file/ will be
39263+The contents of filenameAsome text stored in the invertThe contents of filenameB
39264+
39265+we also can creat this file as
39266+/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
39267+will create /my_invert_file , and use existing files /filenameA and /filenameB.
39268+
39269+and when we will read it will be as previously invert file.
39270+
39271+This is correct?
39272+
39273+ vv
39274+DEMIDOV-FIXME-HANS:
39275+
39276+Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
39277+
39278+Do you agree? Discuss it on reiserfs-list....
39279+
39280+-Hans
39281+=======================
39282+
39283+ Then a read will return:
39284+
39285+ /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
39286+
39287+ and a write of the line above to the invert will set the contents of
39288+ the invert and filenameA and filenameB to their original values.
39289+
39290+ Note that the contents of an invert have no influence on the effect
39291+ of a write unless the write is a partial write (and a write of a
39292+ shorter file without using truncate first is a partial write).
39293+
39294+ truncate() has no effect on filenameA and filenameB, it merely
39295+ resets the value of the invert.
39296+
39297+ Writes to subfiles via the invert are implemented by preceding them
39298+ with truncates.
39299+
39300+ Parse failures cause write failures.
39301+
39302+ Questions to ponder: should the invert be acted on prior to file
39303+ close when writing to an open filedescriptor?
39304+
39305+ Example:
39306+
39307+ If an invert contains:
39308+
39309+ "(This text and a pair of quotes are all that is here.)
39310+
39311+Then a read will return:
39312+
39313+ "(This text and a pair of quotes are all that is here.)
39314+
39315+*/
39316+
39317+/* OPEN method places a struct file in memory associated with invert body
39318+ and returns something like file descriptor to the user for the future access
39319+ to the invert file.
39320+ During opening we parse the body of invert and get a list of the 'entryes'
39321+ (that describes all its subfiles) and place pointer on the first struct in
39322+ reiserfs-specific part of invert inode (arbitrary decision).
39323+
39324+ Each subfile is described by the struct inv_entry that has a pointer @sd on
39325+ in-core based stat-data and a pointer on struct file @f (if we find that the
39326+ subfile uses more then one unformated node (arbitrary decision), we load
39327+ struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes
39328+ of some other information we need)
39329+
39330+ Since READ and WRITE methods for inverts were formulated in assignment
39331+ language, they don't contain arguments 'size' and 'offset' that make sense
39332+ only in ordinary read/write methods.
39333+
39334+ READ method is a combination of two methods:
39335+ 1) ordinary read method (with offset=0, lenght = @f->...->i_size) for entries
39336+ with @f != 0, this method uses pointer on struct file as an argument
39337+ 2) read method for inode-less files with @sd != 0, this method uses
39338+ in-core based stat-data instead struct file as an argument.
39339+ in the first case we don't use pagecache, just copy data that we got after
39340+ cbk() into userspace.
39341+
39342+ WRITE method for invert files is more complex.
39343+ Besides declared WRITE-interface in assignment languageb above we need
39344+ to have an opportunity to edit unwrapped body of invert file with some
39345+ text editor, it means we need GENERIC WRITE METHOD for invert file:
39346+
39347+ my_invert_file/..invert <- "string"
39348+
39349+ this method parses "string" and looks for correct subfile signatures, also
39350+ the parsing process splits this "string" on the set of flows in accordance
39351+ with the set of subfiles specified by this signarure.
39352+ The found list of signatures #S is compared with the opened one #I of invert
39353+ file. If it doesn't have this one (#I==0, it will be so for instance if we
39354+ have just create this invert file) the write method assignes found signature
39355+ (#I=#S;) to the invert file. Then if #I==#S, generic write method splits
39356+ itself to the some write methods for ordinary or light-weight, or call itself
39357+ recursively for invert files with corresponding flows.
39358+ I am not sure, but the list of signatures looks like what mr.Demidov means
39359+ by 'delimiters'.
39360+
39361+ The cases when #S<#I (#I<#S) (in the sense of set-theory) are also available
39362+ and cause delete (create new) subfiles (arbitrary decision - it may looks
39363+ too complex, but this interface will be the completest). The order of entries
39364+ of list #S (#I) and inherited order on #I (#S) must coincide.
39365+ The other parsing results give malformed signature that aborts READ method
39366+ and releases all resources.
39367+
39368+ Format of subfile (entry) signature:
39369+
39370+ "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
39371+
39372+ Legend:
39373+
39374+ START_MAGIC - keyword indicates the start of subfile signature;
39375+
39376+ <> indicates the start of 'subfile metadata', that is the pair
39377+ (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
39378+
39379+ TYPE - the string "type" indicates the start of one of the three words:
39380+ - ORDINARY_FILE,
39381+ - LIGHT_WEIGHT_FILE,
39382+ - INVERT_FILE;
39383+
39384+ LOOKUP_ARG - lookup argument depends on previous type:
39385+ */
39386+
39387+ /************************************************************/
39388+ /* TYPE * LOOKUP ARGUMENT */
39389+ /************************************************************/
39390+ /* LIGH_WEIGHT_FILE * stat-data key */
39391+ /************************************************************/
39392+ /* ORDINARY_FILE * filename */
39393+ /************************************************************/
39394+ /* INVERT_FILE * filename */
39395+ /************************************************************/
39396+
39397+ /* where:
39398+ *stat-data key - the string contains stat data key of this subfile, it will be
39399+ passed to fast-access lookup method for light-weight files;
39400+ *filename - pathname of this subfile, iyt well be passed to VFS lookup methods
39401+ for ordinary and invert files;
39402+
39403+ SUBFILE_BODY - data of this subfile (it will go to the flow)
39404+ END_MAGIC - the keyword indicates the end of subfile signature.
39405+
39406+ The other simbols inside the signature interpreted as 'unformatted content',
39407+ which is available with VFS's read_link() (arbitraruy decision).
39408+
39409+ NOTE: Parse method for a body of invert file uses mentioned signatures _without_
39410+ subfile bodies.
39411+
39412+ Now the only unclear thing is WRITE in regular light-weight subfile A that we
39413+ can describe only in assignment language:
39414+
39415+ A <- "some_string"
39416+
39417+ I guess we don't want to change stat-data and body items of file A
39418+ if this file exist, and size(A) != size("some_string") because this operation is
39419+ expencive, so we only do the partial write if size(A) > size("some_string")
39420+ and do truncate of the "some_string", and then do A <- "truncated string", if
39421+ size(A) < size("some_string"). This decision is also arbitrary..
39422+ */
39423+
39424+/* here is infrastructure for formated flows */
39425+
39426+#define SUBFILE_HEADER_MAGIC 0x19196605
39427+#define FLOW_HEADER_MAGIC 0x01194304
39428+
39429+#include "../plugin.h"
39430+#include "../../debug.h"
39431+#include "../../forward.h"
39432+#include "../object.h"
39433+#include "../item/item.h"
39434+#include "../item/static_stat.h"
39435+#include "../../dformat.h"
39436+#include "../znode.h"
39437+#include "../inode.h"
39438+
39439+#include <linux/types.h>
39440+#include <linux/fs.h> /* for struct file */
39441+#include <linux/list.h> /* for struct list_head */
39442+
39443+typedef enum {
39444+ LIGHT_WEIGHT_FILE,
39445+ ORDINARY_FILE,
39446+ INVERT_FILE
39447+} inv_entry_type;
39448+
39449+typedef struct flow_header {
39450+ d32 fl_magic;
39451+ d16 fl_nr; /* number of subfiles in the flow */
39452+};
39453+
39454+typedef struct subfile_header {
39455+ d32 sh_magic; /* subfile magic */
39456+ d16 sh_type; /* type of subfile: light-weight, ordinary, invert */
39457+ d16 sh_arg_len; /* lenght of lookup argument (filename, key) */
39458+ d32 sh_body_len; /* lenght of subfile body */
39459+};
39460+
39461+/* functions to get/set fields of flow header */
39462+
39463+static void fl_set_magic(flow_header * fh, __u32 value)
39464+{
39465+ cputod32(value, &fh->fh_magic);
39466+}
39467+
39468+static __u32 fl_get_magic(flow_header * fh)
39469+{
39470+ return d32tocpu(&fh->fh_magic);
39471+}
39472+static void fl_set_number(flow_header * fh, __u16 value)
39473+{
39474+ cputod16(value, &fh->fh_nr);
39475+}
39476+static unsigned fl_get_number(flow_header * fh)
39477+{
39478+ return d16tocpu(&fh->fh_nr);
39479+}
39480+
39481+/* functions to get/set fields of subfile header */
39482+
39483+static void sh_set_magic(subfile_header * sh, __u32 value)
39484+{
39485+ cputod32(value, &sh->sh_magic);
39486+}
39487+
39488+static __u32 sh_get_magic(subfile_header * sh)
39489+{
39490+ return d32tocpu(&sh->sh_magic);
39491+}
39492+static void sh_set_type(subfile_header * sh, __u16 value)
39493+{
39494+ cputod16(value, &sh->sh_magic);
39495+}
39496+static unsigned sh_get_type(subfile_header * sh)
39497+{
39498+ return d16tocpu(&sh->sh_magic);
39499+}
39500+static void sh_set_arg_len(subfile_header * sh, __u16 value)
39501+{
39502+ cputod16(value, &sh->sh_arg_len);
39503+}
39504+static unsigned sh_get_arg_len(subfile_header * sh)
39505+{
39506+ return d16tocpu(&sh->sh_arg_len);
39507+}
39508+static void sh_set_body_len(subfile_header * sh, __u32 value)
39509+{
39510+ cputod32(value, &sh->sh_body_len);
39511+}
39512+
39513+static __u32 sh_get_body_len(subfile_header * sh)
39514+{
39515+ return d32tocpu(&sh->sh_body_len);
39516+}
39517+
39518+/* in-core minimal stat-data, light-weight analog of inode */
39519+
39520+struct incore_sd_base {
39521+ umode_t isd_mode;
39522+ nlink_t isd_nlink;
39523+ loff_t isd_size;
39524+ char *isd_data; /* 'subflow' to write */
39525+};
39526+
39527+/* open invert create a list of invert entries,
39528+ every entry is represented by structure inv_entry */
39529+
39530+struct inv_entry {
39531+ struct list_head *ie_list;
39532+ struct file *ie_file; /* this is NULL if the file doesn't
39533+ have unformated nodes */
39534+ struct incore_sd_base *ie_sd; /* inode-less analog of struct file */
39535+};
39536+
39537+/* allocate and init invert entry */
39538+
39539+static struct inv_entry *allocate_inv_entry(void)
39540+{
39541+ struct inv_entry *inv_entry;
39542+
39543+ inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
39544+ if (!inv_entry)
39545+ return ERR_PTR(RETERR(-ENOMEM));
39546+ inv_entry->ie_file = NULL;
39547+ inv_entry->ie_sd = NULL;
39548+ INIT_LIST_HEAD(&inv_entry->ie_list);
39549+ return inv_entry;
39550+}
39551+
39552+static int put_inv_entry(struct inv_entry *ientry)
39553+{
39554+ int result = 0;
39555+
39556+ assert("edward-96", ientry != NULL);
39557+ assert("edward-97", ientry->ie_list != NULL);
39558+
39559+ list_del(ientry->ie_list);
39560+ if (ientry->ie_sd != NULL) {
39561+ kfree(ientry->ie_sd);
39562+ kfree(ientry);
39563+ }
39564+ if (ientry->ie_file != NULL)
39565+ result = filp_close(ientry->file, NULL);
39566+ return result;
39567+}
39568+
39569+static int allocate_incore_sd_base(struct inv_entry *inv_entry)
39570+{
39571+ struct incore_sd_base *isd_base assert("edward-98", inv_entry != NULL);
39572+ assert("edward-99", inv_entry->ie_inode = NULL);
39573+ assert("edward-100", inv_entry->ie_sd = NULL);
39574+
39575+ isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
39576+ if (!isd_base)
39577+ return RETERR(-ENOMEM);
39578+ inv_entry->ie_sd = isd_base;
39579+ return 0;
39580+}
39581+
39582+/* this can be installed as ->init_inv_entry () method of
39583+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
39584+ Copies data from on-disk stat-data format into light-weight analog of inode .
39585+ Doesn't hanlde stat-data extensions. */
39586+
39587+static void sd_base_load(struct inv_entry *inv_entry, char *sd)
39588+{
39589+ reiser4_stat_data_base *sd_base;
39590+
39591+ assert("edward-101", inv_entry != NULL);
39592+ assert("edward-101", inv_entry->ie_sd != NULL);
39593+ assert("edward-102", sd != NULL);
39594+
39595+ sd_base = (reiser4_stat_data_base *) sd;
39596+ inv_entry->incore_sd_base->isd_mode = d16tocpu(&sd_base->mode);
39597+ inv_entry->incore_sd_base->isd_nlink = d32tocpu(&sd_base->nlink);
39598+ inv_entry->incore_sd_base->isd_size = d64tocpu(&sd_base->size);
39599+ inv_entry->incore_sd_base->isd_data = NULL;
39600+}
39601+
39602+/* initialise incore stat-data */
39603+
39604+static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
39605+{
39606+ reiser4_plugin *plugin = item_plugin_by_coord(coord);
39607+ void *body = item_body_by_coord(coord);
39608+
39609+ assert("edward-103", inv_entry != NULL);
39610+ assert("edward-104", plugin != NULL);
39611+ assert("edward-105", body != NULL);
39612+
39613+ sd_base_load(inv_entry, body);
39614+}
39615+
39616+/* takes a key or filename and allocates new invert_entry,
39617+ init and adds it into the list,
39618+ we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */
39619+
39620+int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */
39621+ inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */
39622+ const reiser4_key * key, /* key of invert entry stat-data */
39623+ char *filename, /* filename of the file to be opened */
39624+ int flags, int mode)
39625+{
39626+ int result;
39627+ struct inv_entry *ientry;
39628+
39629+ assert("edward-107", invert_inode != NULL);
39630+
39631+ ientry = allocate_inv_entry();
39632+ if (IS_ERR(ientry))
39633+ return (PTR_ERR(ientry));
39634+
39635+ if (type == LIGHT_WEIGHT_FILE) {
39636+ coord_t coord;
39637+ lock_handle lh;
39638+
39639+ assert("edward-108", key != NULL);
39640+
39641+ init_coord(&coord);
39642+ init_lh(&lh);
39643+ result =
39644+ lookup_sd_by_key(reiser4_tree_by_inode(invert_inode),
39645+ ZNODE_READ_LOCK, &coord, &lh, key);
39646+ if (result == 0)
39647+ init_incore_sd_base(ientry, coord);
39648+
39649+ done_lh(&lh);
39650+ done_coord(&coord);
39651+ return (result);
39652+ } else {
39653+ struct file *file = filp_open(filename, flags, mode);
39654+ /* FIXME_EDWARD here we need to check if we
39655+ did't follow to any mount point */
39656+
39657+ assert("edward-108", filename != NULL);
39658+
39659+ if (IS_ERR(file))
39660+ return (PTR_ERR(file));
39661+ ientry->ie_file = file;
39662+ return 0;
39663+ }
39664+}
39665+
39666+/* takes inode of invert, reads the body of this invert, parses it,
39667+ opens all invert entries and return pointer on the first inv_entry */
39668+
39669+struct inv_entry *open_invert(struct file *invert_file)
39670+{
39671+
39672+}
39673+
39674+ssize_t subfile_read(struct *invert_entry, flow * f)
39675+{
39676+
39677+}
39678+
39679+ssize_t subfile_write(struct *invert_entry, flow * f)
39680+{
39681+
39682+}
39683+
39684+ssize_t invert_read(struct *file, flow * f)
39685+{
39686+
39687+}
39688+
39689+ssize_t invert_write(struct *file, flow * f)
39690+{
39691+
39692+}
39693+
39694+/* Make Linus happy.
39695+ Local variables:
39696+ c-indentation-style: "K&R"
39697+ mode-name: "LC"
39698+ c-basic-offset: 8
39699+ tab-width: 8
39700+ fill-column: 120
39701+ scroll-step: 1
39702+ End:
39703+*/
39704diff --git a/fs/reiser4/plugin/file/symfile.c b/fs/reiser4/plugin/file/symfile.c
39705new file mode 100644
39706index 0000000..814dfb8
39707--- /dev/null
39708+++ b/fs/reiser4/plugin/file/symfile.c
39709@@ -0,0 +1,87 @@
39710+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39711+
39712+/* Symfiles are a generalization of Unix symlinks.
39713+
39714+ A symfile when read behaves as though you took its contents and
39715+ substituted them into the reiser4 naming system as the right hand side
39716+ of an assignment, and then read that which you had assigned to it.
39717+
39718+ A key issue for symfiles is how to implement writes through to
39719+ subfiles. In general, one must have some method of determining what
39720+ of that which is written to the symfile is written to what subfile.
39721+ This can be done by use of custom plugin methods written by users, or
39722+ by using a few general methods we provide for those willing to endure
39723+ the insertion of delimiters into what is read.
39724+
39725+ Writing to symfiles without delimiters to denote what is written to
39726+ what subfile is not supported by any plugins we provide in this
39727+ release. Our most sophisticated support for writes is that embodied
39728+ by the invert plugin (see invert.c).
39729+
39730+ A read only version of the /etc/passwd file might be
39731+ constructed as a symfile whose contents are as follows:
39732+
39733+ /etc/passwd/userlines/*
39734+
39735+ or
39736+
39737+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
39738+
39739+ or
39740+
39741+ /etc/passwd/userlines/(demidov+edward+reiser+root)
39742+
39743+ A symfile with contents
39744+
39745+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
39746+
39747+ will return when read
39748+
39749+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
39750+
39751+ and write of what has been read will not be possible to implement as
39752+ an identity operation because there are no delimiters denoting the
39753+ boundaries of what is to be written to what subfile.
39754+
39755+ Note that one could make this a read/write symfile if one specified
39756+ delimiters, and the write method understood those delimiters delimited
39757+ what was written to subfiles.
39758+
39759+ So, specifying the symfile in a manner that allows writes:
39760+
39761+ /etc/passwd/userlines/demidov+"(
39762+ )+/etc/passwd/userlines/edward+"(
39763+ )+/etc/passwd/userlines/reiser+"(
39764+ )+/etc/passwd/userlines/root+"(
39765+ )
39766+
39767+ or
39768+
39769+ /etc/passwd/userlines/(demidov+"(
39770+ )+edward+"(
39771+ )+reiser+"(
39772+ )+root+"(
39773+ ))
39774+
39775+ and the file demidov might be specified as:
39776+
39777+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39778+
39779+ or
39780+
39781+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39782+
39783+ Notice that if the file demidov has a carriage return in it, the
39784+ parsing fails, but then if you put carriage returns in the wrong place
39785+ in a normal /etc/passwd file it breaks things also.
39786+
39787+ Note that it is forbidden to have no text between two interpolations
39788+ if one wants to be able to define what parts of a write go to what
39789+ subfiles referenced in an interpolation.
39790+
39791+ If one wants to be able to add new lines by writing to the file, one
39792+ must either write a custom plugin for /etc/passwd that knows how to
39793+ name an added line, or one must use an invert, or one must use a more
39794+ sophisticated symfile syntax that we are not planning to write for
39795+ version 4.0.
39796+*/
39797diff --git a/fs/reiser4/plugin/file/symlink.c b/fs/reiser4/plugin/file/symlink.c
39798new file mode 100644
39799index 0000000..bcf3ef8
39800--- /dev/null
39801+++ b/fs/reiser4/plugin/file/symlink.c
39802@@ -0,0 +1,95 @@
39803+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39804+
39805+#include "../../inode.h"
39806+
39807+#include <linux/types.h>
39808+#include <linux/fs.h>
39809+
39810+/* file plugin methods specific for symlink files
39811+ (SYMLINK_FILE_PLUGIN_ID) */
39812+
39813+/* this is implementation of create_object method of file plugin for
39814+ SYMLINK_FILE_PLUGIN_ID
39815+ */
39816+
39817+/**
39818+ * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39819+ * @symlink: inode of symlink object
39820+ * @dir: inode of parent directory
39821+ * @info: parameters of new object
39822+ *
39823+ * Inserts stat data with symlink extension where into the tree.
39824+ */
39825+int reiser4_create_symlink(struct inode *symlink,
39826+ struct inode *dir UNUSED_ARG,
39827+ reiser4_object_create_data *data /* info passed to us
39828+ * this is filled by
39829+ * reiser4() syscall
39830+ * in particular */)
39831+{
39832+ int result;
39833+
39834+ assert("nikita-680", symlink != NULL);
39835+ assert("nikita-681", S_ISLNK(symlink->i_mode));
39836+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
39837+ assert("nikita-682", dir != NULL);
39838+ assert("nikita-684", data != NULL);
39839+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39840+
39841+ /*
39842+ * stat data of symlink has symlink extension in which we store
39843+ * symlink content, that is, path symlink is pointing to.
39844+ */
39845+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39846+
39847+ assert("vs-838", symlink->i_private == NULL);
39848+ symlink->i_private = (void *)data->name;
39849+
39850+ assert("vs-843", symlink->i_size == 0);
39851+ INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39852+
39853+ /* insert stat data appended with data->name */
39854+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39855+ if (result) {
39856+ /* FIXME-VS: Make sure that symlink->i_private is not attached
39857+ to kmalloced data */
39858+ INODE_SET_FIELD(symlink, i_size, 0);
39859+ } else {
39860+ assert("vs-849", symlink->i_private
39861+ && reiser4_inode_get_flag(symlink,
39862+ REISER4_GENERIC_PTR_USED));
39863+ assert("vs-850",
39864+ !memcmp((char *)symlink->i_private, data->name,
39865+ (size_t) symlink->i_size + 1));
39866+ }
39867+ return result;
39868+}
39869+
39870+/* this is implementation of destroy_inode method of file plugin for
39871+ SYMLINK_FILE_PLUGIN_ID
39872+ */
39873+void destroy_inode_symlink(struct inode *inode)
39874+{
39875+ assert("edward-799",
39876+ inode_file_plugin(inode) ==
39877+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39878+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39879+ assert("edward-801", reiser4_inode_get_flag(inode,
39880+ REISER4_GENERIC_PTR_USED));
39881+ assert("vs-839", S_ISLNK(inode->i_mode));
39882+
39883+ kfree(inode->i_private);
39884+ inode->i_private = NULL;
39885+ reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39886+}
39887+
39888+/*
39889+ Local variables:
39890+ c-indentation-style: "K&R"
39891+ mode-name: "LC"
39892+ c-basic-offset: 8
39893+ tab-width: 8
39894+ fill-column: 80
39895+ scroll-step: 1
39896+ End:
39897+*/
39898diff --git a/fs/reiser4/plugin/file/tail_conversion.c b/fs/reiser4/plugin/file/tail_conversion.c
39899new file mode 100644
39900index 0000000..b57776f
39901--- /dev/null
39902+++ b/fs/reiser4/plugin/file/tail_conversion.c
39903@@ -0,0 +1,726 @@
39904+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39905+
39906+#include "../../inode.h"
39907+#include "../../super.h"
39908+#include "../../page_cache.h"
39909+#include "../../carry.h"
39910+#include "../../safe_link.h"
39911+#include "../../vfs_ops.h"
39912+
39913+#include <linux/writeback.h>
39914+
39915+/* this file contains:
39916+ tail2extent and extent2tail */
39917+
39918+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39919+void get_exclusive_access(unix_file_info_t * uf_info)
39920+{
39921+ assert("nikita-3028", reiser4_schedulable());
39922+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39923+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39924+ /*
39925+ * "deadlock avoidance": sometimes we commit a transaction under
39926+ * rw-semaphore on a file. Such commit can deadlock with another
39927+ * thread that captured some block (hence preventing atom from being
39928+ * committed) and waits on rw-semaphore.
39929+ */
39930+ reiser4_txn_restart_current();
39931+ LOCK_CNT_INC(inode_sem_w);
39932+ down_write(&uf_info->latch);
39933+ uf_info->exclusive_use = 1;
39934+ assert("vs-1713", uf_info->ea_owner == NULL);
39935+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39936+ ON_DEBUG(uf_info->ea_owner = current);
39937+}
39938+
39939+void drop_exclusive_access(unix_file_info_t * uf_info)
39940+{
39941+ assert("vs-1714", uf_info->ea_owner == current);
39942+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39943+ ON_DEBUG(uf_info->ea_owner = NULL);
39944+ uf_info->exclusive_use = 0;
39945+ up_write(&uf_info->latch);
39946+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39947+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39948+ LOCK_CNT_DEC(inode_sem_w);
39949+ reiser4_txn_restart_current();
39950+}
39951+
39952+/**
39953+ * nea_grabbed - do something when file semaphore is down_read-ed
39954+ * @uf_info:
39955+ *
39956+ * This is called when nonexclusive access is obtained on file. All it does is
39957+ * for debugging purposes.
39958+ */
39959+static void nea_grabbed(unix_file_info_t *uf_info)
39960+{
39961+#if REISER4_DEBUG
39962+ LOCK_CNT_INC(inode_sem_r);
39963+ assert("vs-1716", uf_info->ea_owner == NULL);
39964+ atomic_inc(&uf_info->nr_neas);
39965+ uf_info->last_reader = current;
39966+#endif
39967+}
39968+
39969+/**
39970+ * get_nonexclusive_access - get nonexclusive access to a file
39971+ * @uf_info: unix file specific part of inode to obtain access to
39972+ *
39973+ * Nonexclusive access is obtained on a file before read, write, readpage.
39974+ */
39975+void get_nonexclusive_access(unix_file_info_t *uf_info)
39976+{
39977+ assert("nikita-3029", reiser4_schedulable());
39978+ assert("nikita-3361", get_current_context()->trans->atom == NULL);
39979+
39980+ down_read(&uf_info->latch);
39981+ nea_grabbed(uf_info);
39982+}
39983+
39984+/**
39985+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39986+ * @uf_info: unix file specific part of inode to obtain access to
39987+ *
39988+ * Non-blocking version of nonexclusive access obtaining.
39989+ */
39990+int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
39991+{
39992+ int result;
39993+
39994+ result = down_read_trylock(&uf_info->latch);
39995+ if (result)
39996+ nea_grabbed(uf_info);
39997+ return result;
39998+}
39999+
40000+void drop_nonexclusive_access(unix_file_info_t * uf_info)
40001+{
40002+ assert("vs-1718", uf_info->ea_owner == NULL);
40003+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
40004+ ON_DEBUG(atomic_dec(&uf_info->nr_neas));
40005+
40006+ up_read(&uf_info->latch);
40007+
40008+ LOCK_CNT_DEC(inode_sem_r);
40009+ reiser4_txn_restart_current();
40010+}
40011+
40012+/* part of tail2extent. Cut all items covering @count bytes starting from
40013+ @offset */
40014+/* Audited by: green(2002.06.15) */
40015+static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
40016+{
40017+ reiser4_key from, to;
40018+
40019+	/* AUDIT: How about putting an assertion here that would check
40020+	   that the whole provided range is covered by tail items only? */
40021+ /* key of first byte in the range to be cut */
40022+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
40023+
40024+ /* key of last byte in that range */
40025+ to = from;
40026+ set_key_offset(&to, (__u64) (offset + count - 1));
40027+
40028+ /* cut everything between those keys */
40029+ return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
40030+ inode, 0);
40031+}
40032+
40033+static void release_all_pages(struct page **pages, unsigned nr_pages)
40034+{
40035+ unsigned i;
40036+
40037+ for (i = 0; i < nr_pages; i++) {
40038+ if (pages[i] == NULL) {
40039+ unsigned j;
40040+ for (j = i + 1; j < nr_pages; j++)
40041+ assert("vs-1620", pages[j] == NULL);
40042+ break;
40043+ }
40044+ page_cache_release(pages[i]);
40045+ pages[i] = NULL;
40046+ }
40047+}
40048+
40049+/* part of tail2extent. replace tail items with extent one. Content of tail
40050+ items (@count bytes) being cut are copied already into
40051+ pages. extent_writepage method is called to create extents corresponding to
40052+ those pages */
40053+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
40054+{
40055+ int result;
40056+ unsigned i;
40057+ STORE_COUNTERS;
40058+
40059+ if (nr_pages == 0)
40060+ return 0;
40061+
40062+ assert("vs-596", pages[0]);
40063+
40064+ /* cut copied items */
40065+ result = cut_formatting_items(inode, page_offset(pages[0]), count);
40066+ if (result)
40067+ return result;
40068+
40069+ CHECK_COUNTERS;
40070+
40071+ /* put into tree replacement for just removed items: extent item, namely */
40072+ for (i = 0; i < nr_pages; i++) {
40073+ result = add_to_page_cache_lru(pages[i], inode->i_mapping,
40074+ pages[i]->index,
40075+ mapping_gfp_mask(inode->
40076+ i_mapping));
40077+ if (result)
40078+ break;
40079+ unlock_page(pages[i]);
40080+ result = find_or_create_extent(pages[i]);
40081+ if (result)
40082+ break;
40083+ SetPageUptodate(pages[i]);
40084+ }
40085+ return result;
40086+}
40087+
40088+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
40089+ * items */
40090+
40091+static int reserve_tail2extent_iteration(struct inode *inode)
40092+{
40093+ reiser4_block_nr unformatted_nodes;
40094+ reiser4_tree *tree;
40095+
40096+ tree = reiser4_tree_by_inode(inode);
40097+
40098+ /* number of unformatted nodes which will be created */
40099+ unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
40100+
40101+ /*
40102+ * space required for one iteration of extent->tail conversion:
40103+ *
40104+ * 1. kill N tail items
40105+ *
40106+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
40107+ *
40108+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
40109+ * extents) extent units.
40110+ *
40111+ * 4. drilling to the leaf level by coord_by_key()
40112+ *
40113+ * 5. possible update of stat-data
40114+ *
40115+ */
40116+ grab_space_enable();
40117+ return reiser4_grab_space
40118+ (2 * tree->height +
40119+ TAIL2EXTENT_PAGE_NUM +
40120+ TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
40121+ 1 + estimate_one_insert_item(tree) +
40122+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
40123+}
40124+
40125+/* clear stat data's flag indicating that the file is being converted */
40126+static int complete_conversion(struct inode *inode)
40127+{
40128+ int result;
40129+
40130+ grab_space_enable();
40131+ result =
40132+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
40133+ BA_CAN_COMMIT);
40134+ if (result == 0) {
40135+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
40136+ result = reiser4_update_sd(inode);
40137+ }
40138+ if (result)
40139+ warning("vs-1696", "Failed to clear converting bit of %llu: %i",
40140+ (unsigned long long)get_inode_oid(inode), result);
40141+ return 0;
40142+}
40143+
40144+/**
40145+ * find_start
40146+ * @inode:
40147+ * @id:
40148+ * @offset:
40149+ *
40150+ * this is used by tail2extent and extent2tail to detect where previous
40151+ * uncompleted conversion stopped
40152+ */
40153+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
40154+{
40155+ int result;
40156+ lock_handle lh;
40157+ coord_t coord;
40158+ unix_file_info_t *ufo;
40159+ int found;
40160+ reiser4_key key;
40161+
40162+ ufo = unix_file_inode_data(inode);
40163+ init_lh(&lh);
40164+ result = 0;
40165+ found = 0;
40166+ inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
40167+ do {
40168+ init_lh(&lh);
40169+ result = find_file_item_nohint(&coord, &lh, &key,
40170+ ZNODE_READ_LOCK, inode);
40171+
40172+ if (result == CBK_COORD_FOUND) {
40173+ if (coord.between == AT_UNIT) {
40174+ /*coord_clear_iplug(&coord); */
40175+ result = zload(coord.node);
40176+ if (result == 0) {
40177+ if (item_id_by_coord(&coord) == id)
40178+ found = 1;
40179+ else
40180+ item_plugin_by_coord(&coord)->s.
40181+ file.append_key(&coord,
40182+ &key);
40183+ zrelse(coord.node);
40184+ }
40185+ } else
40186+ result = RETERR(-ENOENT);
40187+ }
40188+ done_lh(&lh);
40189+ } while (result == 0 && !found);
40190+ *offset = get_key_offset(&key);
40191+ return result;
40192+}
40193+
40194+/**
40195+ * tail2extent
40196+ * @uf_info:
40197+ *
40198+ *
40199+ */
40200+int tail2extent(unix_file_info_t *uf_info)
40201+{
40202+ int result;
40203+ reiser4_key key; /* key of next byte to be moved to page */
40204+ char *p_data; /* data of page */
40205+ unsigned page_off = 0, /* offset within the page where to copy data */
40206+ count; /* number of bytes of item which can be
40207+ * copied to page */
40208+ struct page *pages[TAIL2EXTENT_PAGE_NUM];
40209+ struct page *page;
40210+ int done; /* set to 1 when all file is read */
40211+ char *item;
40212+ int i;
40213+ struct inode *inode;
40214+ int first_iteration;
40215+ int bytes;
40216+ __u64 offset;
40217+
40218+ assert("nikita-3362", ea_obtained(uf_info));
40219+ inode = unix_file_info_to_inode(uf_info);
40220+ assert("nikita-3412", !IS_RDONLY(inode));
40221+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
40222+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
40223+
40224+ offset = 0;
40225+ first_iteration = 1;
40226+ result = 0;
40227+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
40228+ /*
40229+ * file is marked on disk as there was a conversion which did
40230+ * not complete due to either crash or some error. Find which
40231+ * offset tail conversion stopped at
40232+ */
40233+ result = find_start(inode, FORMATTING_ID, &offset);
40234+ if (result == -ENOENT) {
40235+ /* no tail items found, everything is converted */
40236+ uf_info->container = UF_CONTAINER_EXTENTS;
40237+ complete_conversion(inode);
40238+ return 0;
40239+ } else if (result != 0)
40240+ /* some other error */
40241+ return result;
40242+ first_iteration = 0;
40243+ }
40244+
40245+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
40246+
40247+ /* get key of first byte of a file */
40248+ inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
40249+
40250+ done = 0;
40251+ while (done == 0) {
40252+ memset(pages, 0, sizeof(pages));
40253+ result = reserve_tail2extent_iteration(inode);
40254+ if (result != 0)
40255+ goto out;
40256+ if (first_iteration) {
40257+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
40258+ reiser4_update_sd(inode);
40259+ first_iteration = 0;
40260+ }
40261+ bytes = 0;
40262+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
40263+ assert("vs-598",
40264+ (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
40265+ page = alloc_page(reiser4_ctx_gfp_mask_get());
40266+ if (!page) {
40267+ result = RETERR(-ENOMEM);
40268+ goto error;
40269+ }
40270+
40271+ page->index =
40272+ (unsigned long)(get_key_offset(&key) >>
40273+ PAGE_CACHE_SHIFT);
40274+ /*
40275+ * usually when one is going to longterm lock znode (as
40276+ * find_file_item does, for instance) he must not hold
40277+ * locked pages. However, there is an exception for
40278+ * case tail2extent. Pages appearing here are not
40279+ * reachable to everyone else, they are clean, they do
40280+ * not have jnodes attached so keeping them locked do
40281+ * not risk deadlock appearance
40282+ */
40283+ assert("vs-983", !PagePrivate(page));
40284+ reiser4_invalidate_pages(inode->i_mapping, page->index,
40285+ 1, 0);
40286+
40287+ for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
40288+ coord_t coord;
40289+ lock_handle lh;
40290+
40291+ /* get next item */
40292+ /* FIXME: we might want to readahead here */
40293+ init_lh(&lh);
40294+ result =
40295+ find_file_item_nohint(&coord, &lh, &key,
40296+ ZNODE_READ_LOCK,
40297+ inode);
40298+ if (result != CBK_COORD_FOUND) {
40299+ /*
40300+				 * error happened or no items of the file
40301+				 * were found
40302+ */
40303+ done_lh(&lh);
40304+ page_cache_release(page);
40305+ goto error;
40306+ }
40307+
40308+ if (coord.between == AFTER_UNIT) {
40309+ /*
40310+					 * end of file is reached. Pad the page
40311+					 * with zeros
40312+ */
40313+ done_lh(&lh);
40314+ done = 1;
40315+ p_data = kmap_atomic(page, KM_USER0);
40316+ memset(p_data + page_off, 0,
40317+ PAGE_CACHE_SIZE - page_off);
40318+ kunmap_atomic(p_data, KM_USER0);
40319+ break;
40320+ }
40321+
40322+ result = zload(coord.node);
40323+ if (result) {
40324+ page_cache_release(page);
40325+ done_lh(&lh);
40326+ goto error;
40327+ }
40328+ assert("vs-856", coord.between == AT_UNIT);
40329+ item = ((char *)item_body_by_coord(&coord)) +
40330+ coord.unit_pos;
40331+
40332+ /* how many bytes to copy */
40333+ count =
40334+ item_length_by_coord(&coord) -
40335+ coord.unit_pos;
40336+ /* limit length of copy to end of page */
40337+ if (count > PAGE_CACHE_SIZE - page_off)
40338+ count = PAGE_CACHE_SIZE - page_off;
40339+
40340+ /*
40341+ * copy item (as much as will fit starting from
40342+ * the beginning of the item) into the page
40343+ */
40344+ p_data = kmap_atomic(page, KM_USER0);
40345+ memcpy(p_data + page_off, item, count);
40346+ kunmap_atomic(p_data, KM_USER0);
40347+
40348+ page_off += count;
40349+ bytes += count;
40350+ set_key_offset(&key,
40351+ get_key_offset(&key) + count);
40352+
40353+ zrelse(coord.node);
40354+ done_lh(&lh);
40355+ } /* end of loop which fills one page by content of
40356+ * formatting items */
40357+
40358+ if (page_off) {
40359+ /* something was copied into page */
40360+ pages[i] = page;
40361+ } else {
40362+ page_cache_release(page);
40363+ assert("vs-1648", done == 1);
40364+ break;
40365+ }
40366+ } /* end of loop through pages of one conversion iteration */
40367+
40368+ if (i > 0) {
40369+ result = replace(inode, pages, i, bytes);
40370+ release_all_pages(pages, sizeof_array(pages));
40371+ if (result)
40372+ goto error;
40373+ /*
40374+ * we have to drop exclusive access to avoid deadlock
40375+ * which may happen because called by
40376+ * reiser4_writepages capture_unix_file requires to get
40377+ * non-exclusive access to a file. It is safe to drop
40378+ * EA in the middle of tail2extent conversion because
40379+ * write_unix_file/unix_setattr(truncate)/release_unix_file(extent2tail)
40380+ * are serialized by reiser4_inode->mutex_write semaphore and
40381+ * because read_unix_file works (should at least) on
40382+ * partially converted files
40383+ */
40384+ drop_exclusive_access(uf_info);
40385+ /* throttle the conversion */
40386+ reiser4_throttle_write(inode);
40387+ get_exclusive_access(uf_info);
40388+
40389+ /*
40390+ * nobody is allowed to complete conversion but a
40391+ * process which started it
40392+ */
40393+ assert("", reiser4_inode_get_flag(inode,
40394+ REISER4_PART_MIXED));
40395+ }
40396+ }
40397+
40398+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40399+
40400+ if (result == 0) {
40401+ /* file is converted to extent items */
40402+ assert("vs-1697", reiser4_inode_get_flag(inode,
40403+ REISER4_PART_MIXED));
40404+
40405+ uf_info->container = UF_CONTAINER_EXTENTS;
40406+ complete_conversion(inode);
40407+ } else {
40408+ /*
40409+ * conversion is not complete. Inode was already marked as
40410+		 * REISER4_PART_MIXED and stat-data were updated at the first
40411+ * iteration of the loop above.
40412+ */
40413+ error:
40414+ release_all_pages(pages, sizeof_array(pages));
40415+ warning("nikita-2282", "Partial conversion of %llu: %i",
40416+ (unsigned long long)get_inode_oid(inode), result);
40417+ }
40418+
40419+ out:
40420+ return result;
40421+}
40422+
40423+static int reserve_extent2tail_iteration(struct inode *inode)
40424+{
40425+ reiser4_tree *tree;
40426+
40427+ tree = reiser4_tree_by_inode(inode);
40428+ /*
40429+ * reserve blocks for (in this order):
40430+ *
40431+ * 1. removal of extent item
40432+ *
40433+ * 2. insertion of tail by insert_flow()
40434+ *
40435+ * 3. drilling to the leaf level by coord_by_key()
40436+ *
40437+ * 4. possible update of stat-data
40438+ */
40439+ grab_space_enable();
40440+ return reiser4_grab_space
40441+ (estimate_one_item_removal(tree) +
40442+ estimate_insert_flow(tree->height) +
40443+ 1 + estimate_one_insert_item(tree) +
40444+ inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
40445+}
40446+
40447+/* for every page of file: read page, cut part of extent pointing to this page,
40448+ put data of page tree by tail item */
40449+int extent2tail(unix_file_info_t *uf_info)
40450+{
40451+ int result;
40452+ struct inode *inode;
40453+ struct page *page;
40454+ unsigned long num_pages, i;
40455+ unsigned long start_page;
40456+ reiser4_key from;
40457+ reiser4_key to;
40458+ unsigned count;
40459+ __u64 offset;
40460+
40461+ assert("nikita-3362", ea_obtained(uf_info));
40462+ inode = unix_file_info_to_inode(uf_info);
40463+ assert("nikita-3412", !IS_RDONLY(inode));
40464+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
40465+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
40466+
40467+ offset = 0;
40468+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
40469+ /*
40470+ * file is marked on disk as there was a conversion which did
40471+ * not complete due to either crash or some error. Find which
40472+ * offset tail conversion stopped at
40473+ */
40474+ result = find_start(inode, EXTENT_POINTER_ID, &offset);
40475+ if (result == -ENOENT) {
40476+ /* no extent found, everything is converted */
40477+ uf_info->container = UF_CONTAINER_TAILS;
40478+ complete_conversion(inode);
40479+ return 0;
40480+ } else if (result != 0)
40481+ /* some other error */
40482+ return result;
40483+ }
40484+
40485+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
40486+
40487+ /* number of pages in the file */
40488+ num_pages =
40489+ (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
40490+ start_page = offset >> PAGE_CACHE_SHIFT;
40491+
40492+ inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
40493+ to = from;
40494+
40495+ result = 0;
40496+ for (i = 0; i < num_pages; i++) {
40497+ __u64 start_byte;
40498+
40499+ result = reserve_extent2tail_iteration(inode);
40500+ if (result != 0)
40501+ break;
40502+ if (i == 0 && offset == 0) {
40503+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
40504+ reiser4_update_sd(inode);
40505+ }
40506+
40507+ page = read_mapping_page(inode->i_mapping,
40508+ (unsigned)(i + start_page), NULL);
40509+ if (IS_ERR(page)) {
40510+ result = PTR_ERR(page);
40511+ break;
40512+ }
40513+
40514+ wait_on_page_locked(page);
40515+
40516+ if (!PageUptodate(page)) {
40517+ page_cache_release(page);
40518+ result = RETERR(-EIO);
40519+ break;
40520+ }
40521+
40522+ /* cut part of file we have read */
40523+ start_byte = (__u64) (i << PAGE_CACHE_SHIFT);
40524+ set_key_offset(&from, start_byte);
40525+ set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
40526+ /*
40527+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
40528+ * commits during over-long truncates. But
40529+ * extent->tail conversion should be performed in one
40530+ * transaction.
40531+ */
40532+ result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
40533+ &to, inode, 0);
40534+
40535+ if (result) {
40536+ page_cache_release(page);
40537+ break;
40538+ }
40539+
40540+ /* put page data into tree via tail_write */
40541+ count = PAGE_CACHE_SIZE;
40542+ if ((i == (num_pages - 1)) &&
40543+ (inode->i_size & ~PAGE_CACHE_MASK))
40544+			/* last page can be incomplete */
40545+ count = (inode->i_size & ~PAGE_CACHE_MASK);
40546+ while (count) {
40547+ struct dentry dentry;
40548+ struct file file;
40549+ loff_t pos;
40550+
40551+ dentry.d_inode = inode;
40552+ file.f_dentry = &dentry;
40553+ file.private_data = NULL;
40554+ file.f_pos = start_byte;
40555+ file.private_data = NULL;
40556+ pos = start_byte;
40557+ result = reiser4_write_tail(&file,
40558+ (char __user *)kmap(page),
40559+ count, &pos);
40560+ reiser4_free_file_fsdata(&file);
40561+ if (result <= 0) {
40562+ warning("", "reiser4_write_tail failed");
40563+ page_cache_release(page);
40564+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40565+ return result;
40566+ }
40567+ count -= result;
40568+ }
40569+
40570+ /* release page */
40571+ lock_page(page);
40572+ /* page is already detached from jnode and mapping. */
40573+ assert("vs-1086", page->mapping == NULL);
40574+ assert("nikita-2690",
40575+ (!PagePrivate(page) && jprivate(page) == 0));
40576+ /* waiting for writeback completion with page lock held is
40577+ * perfectly valid. */
40578+ wait_on_page_writeback(page);
40579+ reiser4_drop_page(page);
40580+ /* release reference taken by read_cache_page() above */
40581+ page_cache_release(page);
40582+
40583+ drop_exclusive_access(uf_info);
40584+ /* throttle the conversion */
40585+ reiser4_throttle_write(inode);
40586+ get_exclusive_access(uf_info);
40587+ /*
40588+ * nobody is allowed to complete conversion but a process which
40589+ * started it
40590+ */
40591+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
40592+ }
40593+
40594+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40595+
40596+ if (i == num_pages) {
40597+ /* file is converted to formatted items */
40598+ assert("vs-1698", reiser4_inode_get_flag(inode,
40599+ REISER4_PART_MIXED));
40600+ assert("vs-1260",
40601+ inode_has_no_jnodes(reiser4_inode_data(inode)));
40602+
40603+ uf_info->container = UF_CONTAINER_TAILS;
40604+ complete_conversion(inode);
40605+ return 0;
40606+ }
40607+ /*
40608+ * conversion is not complete. Inode was already marked as
40609+	 * REISER4_PART_MIXED and stat-data were updated at the first
40610+ * iteration of the loop above.
40611+ */
40612+ warning("nikita-2282",
40613+ "Partial conversion of %llu: %lu of %lu: %i",
40614+ (unsigned long long)get_inode_oid(inode), i,
40615+ num_pages, result);
40616+
40617+ return result;
40618+}
40619+
40620+/*
40621+ * Local variables:
40622+ * c-indentation-style: "K&R"
40623+ * mode-name: "LC"
40624+ * c-basic-offset: 8
40625+ * tab-width: 8
40626+ * fill-column: 79
40627+ * scroll-step: 1
40628+ * End:
40629+ */
40630diff --git a/fs/reiser4/plugin/file_ops.c b/fs/reiser4/plugin/file_ops.c
40631new file mode 100644
40632index 0000000..ef8ba9d
40633--- /dev/null
40634+++ b/fs/reiser4/plugin/file_ops.c
40635@@ -0,0 +1,168 @@
40636+/* Copyright 2005 by Hans Reiser, licensing governed by
40637+ reiser4/README */
40638+
40639+/* this file contains typical implementations for some of methods of
40640+ struct file_operations and of struct address_space_operations
40641+*/
40642+
40643+#include "../inode.h"
40644+#include "object.h"
40645+
40646+/* file operations */
40647+
40648+/* implementation of vfs's llseek method of struct file_operations for
40649+ typical directory can be found in readdir_common.c
40650+*/
40651+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
40652+
40653+/* implementation of vfs's readdir method of struct file_operations for
40654+ typical directory can be found in readdir_common.c
40655+*/
40656+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
40657+
40658+/**
40659+ * reiser4_release_dir_common - release of struct file_operations
40660+ * @inode: inode of released file
40661+ * @file: file to release
40662+ *
40663+ * Implementation of release method of struct file_operations for typical
40664+ * directory. All it does is freeing of reiser4 specific file data.
40665+*/
40666+int reiser4_release_dir_common(struct inode *inode, struct file *file)
40667+{
40668+ reiser4_context *ctx;
40669+
40670+ ctx = reiser4_init_context(inode->i_sb);
40671+ if (IS_ERR(ctx))
40672+ return PTR_ERR(ctx);
40673+ reiser4_free_file_fsdata(file);
40674+ reiser4_exit_context(ctx);
40675+ return 0;
40676+}
40677+
40678+/* this is common implementation of vfs's fsync method of struct
40679+ file_operations
40680+*/
40681+int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
40682+{
40683+ reiser4_context *ctx;
40684+ int result;
40685+
40686+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
40687+ if (IS_ERR(ctx))
40688+ return PTR_ERR(ctx);
40689+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
40690+
40691+ context_set_commit_async(ctx);
40692+ reiser4_exit_context(ctx);
40693+ return result;
40694+}
40695+
40696+/* this is common implementation of vfs's sendfile method of struct
40697+ file_operations
40698+
40699+ Reads @count bytes from @file and calls @actor for every page read. This is
40700+ needed for loop back devices support.
40701+*/
40702+#if 0
40703+ssize_t
40704+sendfile_common(struct file *file, loff_t *ppos, size_t count,
40705+ read_actor_t actor, void *target)
40706+{
40707+ reiser4_context *ctx;
40708+ ssize_t result;
40709+
40710+ ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
40711+ if (IS_ERR(ctx))
40712+ return PTR_ERR(ctx);
40713+ result = generic_file_sendfile(file, ppos, count, actor, target);
40714+ reiser4_exit_context(ctx);
40715+ return result;
40716+}
40717+#endif /* 0 */
40718+
40719+/* address space operations */
40720+
40721+/* this is common implementation of vfs's prepare_write method of struct
40722+ address_space_operations
40723+*/
40724+int
40725+prepare_write_common(struct file *file, struct page *page, unsigned from,
40726+ unsigned to)
40727+{
40728+ reiser4_context *ctx;
40729+ int result;
40730+
40731+ ctx = reiser4_init_context(page->mapping->host->i_sb);
40732+ result = do_prepare_write(file, page, from, to);
40733+
40734+ /* don't commit transaction under inode semaphore */
40735+ context_set_commit_async(ctx);
40736+ reiser4_exit_context(ctx);
40737+
40738+ return result;
40739+}
40740+
40741+/* this is helper for prepare_write_common and prepare_write_unix_file
40742+ */
40743+int
40744+do_prepare_write(struct file *file, struct page *page, unsigned from,
40745+ unsigned to)
40746+{
40747+ int result;
40748+ file_plugin *fplug;
40749+ struct inode *inode;
40750+
40751+ assert("umka-3099", file != NULL);
40752+ assert("umka-3100", page != NULL);
40753+ assert("umka-3095", PageLocked(page));
40754+
40755+ if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
40756+ return 0;
40757+
40758+ inode = page->mapping->host;
40759+ fplug = inode_file_plugin(inode);
40760+
40761+ if (page->mapping->a_ops->readpage == NULL)
40762+ return RETERR(-EINVAL);
40763+
40764+ result = page->mapping->a_ops->readpage(file, page);
40765+ if (result != 0) {
40766+ SetPageError(page);
40767+ ClearPageUptodate(page);
40768+ /* All reiser4 readpage() implementations should return the
40769+ * page locked in case of error. */
40770+ assert("nikita-3472", PageLocked(page));
40771+ } else {
40772+ /*
40773+ * ->readpage() either:
40774+ *
40775+ * 1. starts IO against @page. @page is locked for IO in
40776+ * this case.
40777+ *
40778+ * 2. doesn't start IO. @page is unlocked.
40779+ *
40780+ * In either case, page should be locked.
40781+ */
40782+ lock_page(page);
40783+ /*
40784+ * IO (if any) is completed at this point. Check for IO
40785+ * errors.
40786+ */
40787+ if (!PageUptodate(page))
40788+ result = RETERR(-EIO);
40789+ }
40790+ assert("umka-3098", PageLocked(page));
40791+ return result;
40792+}
40793+
40794+/*
40795+ * Local variables:
40796+ * c-indentation-style: "K&R"
40797+ * mode-name: "LC"
40798+ * c-basic-offset: 8
40799+ * tab-width: 8
40800+ * fill-column: 79
40801+ * scroll-step: 1
40802+ * End:
40803+ */
40804diff --git a/fs/reiser4/plugin/file_ops_readdir.c b/fs/reiser4/plugin/file_ops_readdir.c
40805new file mode 100644
40806index 0000000..2bd7826
40807--- /dev/null
40808+++ b/fs/reiser4/plugin/file_ops_readdir.c
40809@@ -0,0 +1,657 @@
40810+/* Copyright 2005 by Hans Reiser, licensing governed by
40811+ * reiser4/README */
40812+
40813+#include "../inode.h"
40814+
40815+/* return true, iff @coord points to the valid directory item that is part of
40816+ * @inode directory. */
40817+static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40818+{
40819+ return plugin_of_group(item_plugin_by_coord(coord),
40820+ DIR_ENTRY_ITEM_TYPE) &&
40821+ inode_file_plugin(inode)->owns_item(inode, coord);
40822+}
40823+
40824+/* compare two logical positions within the same directory */
40825+static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
40826+{
40827+ cmp_t result;
40828+
40829+ assert("nikita-2534", p1 != NULL);
40830+ assert("nikita-2535", p2 != NULL);
40831+
40832+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40833+ if (result == EQUAL_TO) {
40834+ int diff;
40835+
40836+ diff = p1->pos - p2->pos;
40837+ result =
40838+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
40839+ }
40840+ return result;
40841+}
40842+
40843+/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
40844+ * necessary. */
40845+static void
40846+adjust_dir_pos(struct file *dir,
40847+ readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
40848+{
40849+ dir_pos *pos;
40850+
40851+ /*
40852+ * new directory entry was added (adj == +1) or removed (adj == -1) at
40853+ * the @mod_point. Directory file descriptor @dir is doing readdir and
40854+ * is currently positioned at @readdir_spot. Latter has to be updated
40855+ * to maintain stable readdir.
40856+ */
40857+ /* directory is positioned to the beginning. */
40858+ if (readdir_spot->entry_no == 0)
40859+ return;
40860+
40861+ pos = &readdir_spot->position;
40862+ switch (dir_pos_cmp(mod_point, pos)) {
40863+ case LESS_THAN:
40864+ /* @mod_pos is _before_ @readdir_spot, that is, entry was
40865+ * added/removed on the left (in key order) of current
40866+ * position. */
40867+ /* logical number of directory entry readdir is "looking" at
40868+ * changes */
40869+ readdir_spot->entry_no += adj;
40870+ assert("nikita-2577",
40871+ ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
40872+ if (de_id_cmp(&pos->dir_entry_key,
40873+ &mod_point->dir_entry_key) == EQUAL_TO) {
40874+ assert("nikita-2575", mod_point->pos < pos->pos);
40875+ /*
40876+ * if entry added/removed has the same key as current
40877+ * for readdir, update counter of duplicate keys in
40878+ * @readdir_spot.
40879+ */
40880+ pos->pos += adj;
40881+ }
40882+ break;
40883+ case GREATER_THAN:
40884+ /* directory is modified after @pos: nothing to do. */
40885+ break;
40886+ case EQUAL_TO:
40887+ /* cannot insert an entry readdir is looking at, because it
40888+ already exists. */
40889+ assert("nikita-2576", adj < 0);
40890+ /* directory entry to which @pos points to is being
40891+ removed.
40892+
40893+ NOTE-NIKITA: Right thing to do is to update @pos to point
40894+ to the next entry. This is complex (we are under spin-lock
40895+ for one thing). Just rewind it to the beginning. Next
40896+ readdir will have to scan the beginning of
40897+ directory. Proper solution is to use semaphore in
40898+ spin lock's stead and use rewind_right() here.
40899+
40900+ NOTE-NIKITA: now, semaphore is used, so...
40901+ */
40902+ memset(readdir_spot, 0, sizeof *readdir_spot);
40903+ }
40904+}
40905+
40906+/* scan all file-descriptors for this directory and adjust their
40907+ positions respectively. Should be used by implementations of
40908+ add_entry and rem_entry of dir plugin */
40909+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
40910+ int offset, int adj)
40911+{
40912+ reiser4_file_fsdata *scan;
40913+ dir_pos mod_point;
40914+
40915+ assert("nikita-2536", dir != NULL);
40916+ assert("nikita-2538", de != NULL);
40917+ assert("nikita-2539", adj != 0);
40918+
40919+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
40920+ mod_point.pos = offset;
40921+
40922+ spin_lock_inode(dir);
40923+
40924+ /*
40925+ * new entry was added/removed in directory @dir. Scan all file
40926+ * descriptors for @dir that are currently involved into @readdir and
40927+ * update them.
40928+ */
40929+
40930+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
40931+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
40932+
40933+ spin_unlock_inode(dir);
40934+}
40935+
40936+/*
40937+ * traverse tree to start/continue readdir from the readdir position @pos.
40938+ */
40939+static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
40940+{
40941+ reiser4_key key;
40942+ int result;
40943+ struct inode *inode;
40944+
40945+ assert("nikita-2554", pos != NULL);
40946+
40947+ inode = dir->f_dentry->d_inode;
40948+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
40949+ if (result != 0)
40950+ return result;
40951+ result = reiser4_object_lookup(inode,
40952+ &key,
40953+ tap->coord,
40954+ tap->lh,
40955+ tap->mode,
40956+ FIND_EXACT,
40957+ LEAF_LEVEL, LEAF_LEVEL,
40958+ 0, &tap->ra_info);
40959+ if (result == CBK_COORD_FOUND)
40960+ result = rewind_right(tap, (int)pos->position.pos);
40961+ else {
40962+ tap->coord->node = NULL;
40963+ done_lh(tap->lh);
40964+ result = RETERR(-EIO);
40965+ }
40966+ return result;
40967+}
40968+
40969+/*
40970+ * handling of non-unique keys: calculate at what ordinal position within
40971+ * sequence of directory items with identical keys @pos is.
40972+ */
40973+static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
40974+{
40975+ int result;
40976+ coord_t coord;
40977+ lock_handle lh;
40978+ tap_t scan;
40979+ de_id *did;
40980+ reiser4_key de_key;
40981+
40982+ coord_init_zero(&coord);
40983+ init_lh(&lh);
40984+ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40985+ reiser4_tap_copy(&scan, tap);
40986+ reiser4_tap_load(&scan);
40987+ pos->position.pos = 0;
40988+
40989+ did = &pos->position.dir_entry_key;
40990+
40991+ if (is_valid_dir_coord(inode, scan.coord)) {
40992+
40993+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40994+
40995+ while (1) {
40996+
40997+ result = go_prev_unit(&scan);
40998+ if (result != 0)
40999+ break;
41000+
41001+ if (!is_valid_dir_coord(inode, scan.coord)) {
41002+ result = -EINVAL;
41003+ break;
41004+ }
41005+
41006+ /* get key of directory entry */
41007+ unit_key_by_coord(scan.coord, &de_key);
41008+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
41009+ /* duplicate-sequence is over */
41010+ break;
41011+ }
41012+ pos->position.pos++;
41013+ }
41014+ } else
41015+ result = RETERR(-ENOENT);
41016+ reiser4_tap_relse(&scan);
41017+ reiser4_tap_done(&scan);
41018+ return result;
41019+}
41020+
41021+/*
41022+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
41023+ */
41024+static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
41025+{
41026+ __u64 destination;
41027+ __s64 shift;
41028+ int result;
41029+ struct inode *inode;
41030+ loff_t dirpos;
41031+
41032+ assert("nikita-2553", dir != NULL);
41033+ assert("nikita-2548", pos != NULL);
41034+ assert("nikita-2551", tap->coord != NULL);
41035+ assert("nikita-2552", tap->lh != NULL);
41036+
41037+ dirpos = reiser4_get_dir_fpos(dir);
41038+ shift = dirpos - pos->fpos;
41039+ /* this is logical directory entry within @dir which we are rewinding
41040+ * to */
41041+ destination = pos->entry_no + shift;
41042+
41043+ inode = dir->f_dentry->d_inode;
41044+ if (dirpos < 0)
41045+ return RETERR(-EINVAL);
41046+ else if (destination == 0ll || dirpos == 0) {
41047+ /* rewind to the beginning of directory */
41048+ memset(pos, 0, sizeof *pos);
41049+ return dir_go_to(dir, pos, tap);
41050+ } else if (destination >= inode->i_size)
41051+ return RETERR(-ENOENT);
41052+
41053+ if (shift < 0) {
41054+ /* I am afraid of negative numbers */
41055+ shift = -shift;
41056+ /* rewinding to the left */
41057+ if (shift <= (int)pos->position.pos) {
41058+ /* destination is within sequence of entries with
41059+ duplicate keys. */
41060+ result = dir_go_to(dir, pos, tap);
41061+ } else {
41062+ shift -= pos->position.pos;
41063+ while (1) {
41064+ /* repetitions: deadlock is possible when
41065+ going to the left. */
41066+ result = dir_go_to(dir, pos, tap);
41067+ if (result == 0) {
41068+ result = rewind_left(tap, shift);
41069+ if (result == -E_DEADLOCK) {
41070+ reiser4_tap_done(tap);
41071+ continue;
41072+ }
41073+ }
41074+ break;
41075+ }
41076+ }
41077+ } else {
41078+ /* rewinding to the right */
41079+ result = dir_go_to(dir, pos, tap);
41080+ if (result == 0)
41081+ result = rewind_right(tap, shift);
41082+ }
41083+ if (result == 0) {
41084+ result = set_pos(inode, pos, tap);
41085+ if (result == 0) {
41086+ /* update pos->position.pos */
41087+ pos->entry_no = destination;
41088+ pos->fpos = dirpos;
41089+ }
41090+ }
41091+ return result;
41092+}
41093+
41094+/*
41095+ * Function that is called by common_readdir() on each directory entry while
41096+ * doing readdir. ->filldir callback may block, so we had to release long term
41097+ * lock while calling it. To avoid repeating tree traversal, seal is used. If
41098+ * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
41099+ *
41100+ * Whether node is unlocked in case of any other error is undefined. It is
41101+ * guaranteed to be still locked if success (0) is returned.
41102+ *
41103+ * When ->filldir() wants no more, feed_entry() returns 1, and node is
41104+ * unlocked.
41105+ */
41106+static int
41107+feed_entry(struct file *f,
41108+ readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
41109+{
41110+ item_plugin *iplug;
41111+ char *name;
41112+ reiser4_key sd_key;
41113+ int result;
41114+ char buf[DE_NAME_BUF_LEN];
41115+ char name_buf[32];
41116+ char *local_name;
41117+ unsigned file_type;
41118+ seal_t seal;
41119+ coord_t *coord;
41120+ reiser4_key entry_key;
41121+
41122+ coord = tap->coord;
41123+ iplug = item_plugin_by_coord(coord);
41124+
41125+ /* pointer to name within the node */
41126+ name = iplug->s.dir.extract_name(coord, buf);
41127+ assert("nikita-1371", name != NULL);
41128+
41129+ /* key of object the entry points to */
41130+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
41131+ return RETERR(-EIO);
41132+
41133+ /* we must release longterm znode lock before calling filldir to avoid
41134+ deadlock which may happen if filldir causes page fault. So, copy
41135+ name to intermediate buffer */
41136+ if (strlen(name) + 1 > sizeof(name_buf)) {
41137+ local_name = kmalloc(strlen(name) + 1,
41138+ reiser4_ctx_gfp_mask_get());
41139+ if (local_name == NULL)
41140+ return RETERR(-ENOMEM);
41141+ } else
41142+ local_name = name_buf;
41143+
41144+ strcpy(local_name, name);
41145+ file_type = iplug->s.dir.extract_file_type(coord);
41146+
41147+ unit_key_by_coord(coord, &entry_key);
41148+ reiser4_seal_init(&seal, coord, &entry_key);
41149+
41150+ longterm_unlock_znode(tap->lh);
41151+
41152+ /*
41153+ * send information about directory entry to the ->filldir() filler
41154+ * supplied to us by caller (VFS).
41155+ *
41156+ * ->filldir is entitled to do weird things. For example, ->filldir
41157+ * supplied by knfsd re-enters file system. Make sure no locks are
41158+ * held.
41159+ */
41160+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
41161+
41162+ reiser4_txn_restart_current();
41163+ result = filldir(dirent, name, (int)strlen(name),
41164+ /* offset of this entry */
41165+ f->f_pos,
41166+ /* inode number of object bounden by this entry */
41167+ oid_to_uino(get_key_objectid(&sd_key)), file_type);
41168+ if (local_name != name_buf)
41169+ kfree(local_name);
41170+ if (result < 0)
41171+ /* ->filldir() is satisfied. (no space in buffer, IOW) */
41172+ result = 1;
41173+ else
41174+ result = reiser4_seal_validate(&seal, coord, &entry_key,
41175+ tap->lh, tap->mode,
41176+ ZNODE_LOCK_HIPRI);
41177+ return result;
41178+}
41179+
41180+static void move_entry(readdir_pos * pos, coord_t * coord)
41181+{
41182+ reiser4_key de_key;
41183+ de_id *did;
41184+
41185+ /* update @pos */
41186+ ++pos->entry_no;
41187+ did = &pos->position.dir_entry_key;
41188+
41189+ /* get key of directory entry */
41190+ unit_key_by_coord(coord, &de_key);
41191+
41192+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
41193+ /* we are within sequence of directory entries
41194+ with duplicate keys. */
41195+ ++pos->position.pos;
41196+ else {
41197+ pos->position.pos = 0;
41198+ build_de_id_by_key(&de_key, did);
41199+ }
41200+ ++pos->fpos;
41201+}
41202+
41203+/*
41204+ * STATELESS READDIR
41205+ *
41206+ * readdir support in reiser4 relies on ability to update readdir_pos embedded
41207+ * into reiser4_file_fsdata on each directory modification (name insertion and
41208+ * removal), see reiser4_readdir_common() function below. This obviously doesn't
41209+ * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
41210+ * across client READDIR requests for the same directory.
41211+ *
41212+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
41213+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
41214+ * find detached reiser4_file_fsdata corresponding to previous readdir
41215+ * request. In other words, additional state is maintained on the
41216+ * server. (This is somewhat contrary to the design goals of NFS protocol.)
41217+ *
41218+ * To efficiently detect when our ->readdir() method is called by NFS server,
41219+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
41220+ * file_is_stateless() function).
41221+ *
41222+ * To find out d_cursor in the pool, we encode client id (cid) in the highest
41223+ * bits of NFS readdir cookie: when first readdir request comes to the given
41224+ * directory from the given client, cookie is set to 0. This situation is
41225+ * detected, global cid_counter is incremented, and stored in highest bits of
41226+ * all direntry offsets returned to the client, including last one. As the
41227+ * only valid readdir cookie is one obtained as direntry->offset, we are
41228+ * guaranteed that next readdir request (continuing current one) will have
41229+ * current cid in the highest bits of starting readdir cookie. All d_cursors
41230+ * are hashed into per-super-block hash table by (oid, cid) key.
41231+ *
41232+ * In addition d_cursors are placed into per-super-block radix tree where they
41233+ * are keyed by oid alone. This is necessary to efficiently remove them during
41234+ * rmdir.
41235+ *
41236+ * At last, currently unused d_cursors are linked into special list. This list
41237+ * is used d_cursor_shrink to reclaim d_cursors on memory pressure.
41238+ *
41239+ */
41240+
41241+/*
41242+ * prepare for readdir.
41243+ */
41244+static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
41245+{
41246+ struct inode *inode;
41247+ reiser4_file_fsdata *fsdata;
41248+ int result;
41249+
41250+ assert("nikita-1359", f != NULL);
41251+ inode = f->f_dentry->d_inode;
41252+ assert("nikita-1360", inode != NULL);
41253+
41254+ if (!S_ISDIR(inode->i_mode))
41255+ return RETERR(-ENOTDIR);
41256+
41257+ /* try to find detached readdir state */
41258+ result = reiser4_attach_fsdata(f, inode);
41259+ if (result != 0)
41260+ return result;
41261+
41262+ fsdata = reiser4_get_file_fsdata(f);
41263+ assert("nikita-2571", fsdata != NULL);
41264+ if (IS_ERR(fsdata))
41265+ return PTR_ERR(fsdata);
41266+
41267+ /* add file descriptor to the readdir list hanging of directory
41268+ * inode. This list is used to scan "readdirs-in-progress" while
41269+ * inserting or removing names in the directory. */
41270+ spin_lock_inode(inode);
41271+ if (list_empty_careful(&fsdata->dir.linkage))
41272+ list_add(&fsdata->dir.linkage, get_readdir_list(inode));
41273+ *pos = &fsdata->dir.readdir;
41274+ spin_unlock_inode(inode);
41275+
41276+ /* move @tap to the current position */
41277+ return dir_rewind(f, *pos, tap);
41278+}
41279+
41280+/* this is implementation of vfs's llseek method of struct file_operations for
41281+ typical directory
41282+ See comment before reiser4_readdir_common() for explanation.
41283+*/
41284+loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
41285+{
41286+ reiser4_context *ctx;
41287+ loff_t result;
41288+ struct inode *inode;
41289+
41290+ inode = file->f_dentry->d_inode;
41291+
41292+ ctx = reiser4_init_context(inode->i_sb);
41293+ if (IS_ERR(ctx))
41294+ return PTR_ERR(ctx);
41295+
41296+ mutex_lock(&inode->i_mutex);
41297+
41298+ /* update ->f_pos */
41299+ result = default_llseek(file, off, origin);
41300+ if (result >= 0) {
41301+ int ff;
41302+ coord_t coord;
41303+ lock_handle lh;
41304+ tap_t tap;
41305+ readdir_pos *pos;
41306+
41307+ coord_init_zero(&coord);
41308+ init_lh(&lh);
41309+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
41310+
41311+ ff = dir_readdir_init(file, &tap, &pos);
41312+ reiser4_detach_fsdata(file);
41313+ if (ff != 0)
41314+ result = (loff_t) ff;
41315+ reiser4_tap_done(&tap);
41316+ }
41317+ reiser4_detach_fsdata(file);
41318+ mutex_unlock(&inode->i_mutex);
41319+
41320+ reiser4_exit_context(ctx);
41321+ return result;
41322+}
41323+
41324+/* this is common implementation of vfs's readdir method of struct
41325+ file_operations
41326+
41327+ readdir problems:
41328+
41329+ readdir(2)/getdents(2) interface is based on implicit assumption that
41330+ readdir can be restarted from any particular point by supplying file system
41331+ with off_t-full of data. That is, file system fills ->d_off field in struct
41332+ dirent and later user passes ->d_off to the seekdir(3), which is, actually,
41333+ implemented by glibc as lseek(2) on directory.
41334+
41335+ Reiser4 cannot restart readdir from 64 bits of data, because two last
41336+ components of the key of directory entry are unknown, which given 128 bits:
41337+ locality and type fields in the key of directory entry are always known, to
41338+ start readdir() from given point objectid and offset fields have to be
41339+ filled.
41340+
41341+ Traditional UNIX API for scanning through directory
41342+ (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
41343+ assumption that directory is structured very much like regular file, in
41344+ particular, it is implied that each name within given directory (directory
41345+ entry) can be uniquely identified by scalar offset and that such offset is
41346+ stable across the life-time of the name is identifies.
41347+
41348+ This is manifestly not so for reiser4. In reiser4 the only stable unique
41349+ identifies for the directory entry is its key that doesn't fit into
41350+ seekdir/telldir API.
41351+
41352+ solution:
41353+
41354+ Within each file descriptor participating in readdir-ing of directory
41355+ plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
41356+ the "current" directory entry that file descriptor looks at. It contains a
41357+ key of directory entry (plus some additional info to deal with non-unique
41358+ keys that we wouldn't dwell onto here) and a logical position of this
41359+ directory entry starting from the beginning of the directory, that is
41360+ ordinal number of this entry in the readdir order.
41361+
41362+ Obviously this logical position is not stable in the face of directory
41363+ modifications. To work around this, on each addition or removal of directory
41364+ entry all file descriptors for directory inode are scanned and their
41365+ readdir_pos are updated accordingly (adjust_dir_pos()).
41366+*/
41367+int reiser4_readdir_common(struct file *f /* directory file being read */,
41368+ void *dirent /* opaque data passed to us by VFS */,
41369+ filldir_t filld /* filler function passed to us
41370+ * by VFS */)
41371+{
41372+ reiser4_context *ctx;
41373+ int result;
41374+ struct inode *inode;
41375+ coord_t coord;
41376+ lock_handle lh;
41377+ tap_t tap;
41378+ readdir_pos *pos;
41379+
41380+ assert("nikita-1359", f != NULL);
41381+ inode = f->f_dentry->d_inode;
41382+ assert("nikita-1360", inode != NULL);
41383+
41384+ if (!S_ISDIR(inode->i_mode))
41385+ return RETERR(-ENOTDIR);
41386+
41387+ ctx = reiser4_init_context(inode->i_sb);
41388+ if (IS_ERR(ctx))
41389+ return PTR_ERR(ctx);
41390+
41391+ coord_init_zero(&coord);
41392+ init_lh(&lh);
41393+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
41394+
41395+ reiser4_readdir_readahead_init(inode, &tap);
41396+
41397+ repeat:
41398+ result = dir_readdir_init(f, &tap, &pos);
41399+ if (result == 0) {
41400+ result = reiser4_tap_load(&tap);
41401+ /* scan entries one by one feeding them to @filld */
41402+ while (result == 0) {
41403+ coord_t *coord;
41404+
41405+ coord = tap.coord;
41406+ assert("nikita-2572", coord_is_existing_unit(coord));
41407+ assert("nikita-3227", is_valid_dir_coord(inode, coord));
41408+
41409+ result = feed_entry(f, pos, &tap, filld, dirent);
41410+ if (result > 0) {
41411+ break;
41412+ } else if (result == 0) {
41413+ ++f->f_pos;
41414+ result = go_next_unit(&tap);
41415+ if (result == -E_NO_NEIGHBOR ||
41416+ result == -ENOENT) {
41417+ result = 0;
41418+ break;
41419+ } else if (result == 0) {
41420+ if (is_valid_dir_coord(inode, coord))
41421+ move_entry(pos, coord);
41422+ else
41423+ break;
41424+ }
41425+ } else if (result == -E_REPEAT) {
41426+ /* feed_entry() had to restart. */
41427+ ++f->f_pos;
41428+ reiser4_tap_relse(&tap);
41429+ goto repeat;
41430+ } else
41431+ warning("vs-1617",
41432+ "reiser4_readdir_common: unexpected error %d",
41433+ result);
41434+ }
41435+ reiser4_tap_relse(&tap);
41436+
41437+ if (result >= 0)
41438+ f->f_version = inode->i_version;
41439+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
41440+ result = 0;
41441+ reiser4_tap_done(&tap);
41442+ reiser4_detach_fsdata(f);
41443+
41444+ /* try to update directory's atime */
41445+ if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
41446+ BA_CAN_COMMIT) != 0)
41447+ warning("", "failed to update atime on readdir: %llu",
41448+ get_inode_oid(inode));
41449+ else
41450+ file_accessed(f);
41451+
41452+ context_set_commit_async(ctx);
41453+ reiser4_exit_context(ctx);
41454+
41455+ return (result <= 0) ? result : 0;
41456+}
41457+
41458+/*
41459+ * Local variables:
41460+ * c-indentation-style: "K&R"
41461+ * mode-name: "LC"
41462+ * c-basic-offset: 8
41463+ * tab-width: 8
41464+ * fill-column: 79
41465+ * End:
41466+ */
41467diff --git a/fs/reiser4/plugin/file_plugin_common.c b/fs/reiser4/plugin/file_plugin_common.c
41468new file mode 100644
41469index 0000000..55d9047
41470--- /dev/null
41471+++ b/fs/reiser4/plugin/file_plugin_common.c
41472@@ -0,0 +1,1007 @@
41473+/* Copyright 2005 by Hans Reiser, licensing governed by
41474+ reiser4/README */
41475+
41476+/* this file contains typical implementations for most of methods of
41477+ file plugin
41478+*/
41479+
41480+#include "../inode.h"
41481+#include "object.h"
41482+#include "../safe_link.h"
41483+
41484+#include <linux/quotaops.h>
41485+
41486+static int insert_new_sd(struct inode *inode);
41487+static int update_sd(struct inode *inode);
41488+
41489+/* this is common implementation of write_sd_by_inode method of file plugin
41490+ either insert stat data or update it
41491+ */
41492+int write_sd_by_inode_common(struct inode *inode /* object to save */ )
41493+{
41494+ int result;
41495+
41496+ assert("nikita-730", inode != NULL);
41497+
41498+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
41499+ /* object doesn't have stat-data yet */
41500+ result = insert_new_sd(inode);
41501+ else
41502+ result = update_sd(inode);
41503+ if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
41504+ /* Don't issue warnings about "name is too long" */
41505+ warning("nikita-2221", "Failed to save sd for %llu: %i",
41506+ (unsigned long long)get_inode_oid(inode), result);
41507+ return result;
41508+}
41509+
41510+/* this is common implementation of key_by_inode method of file plugin
41511+ */
41512+int
41513+key_by_inode_and_offset_common(struct inode *inode, loff_t off,
41514+ reiser4_key * key)
41515+{
41516+ reiser4_key_init(key);
41517+ set_key_locality(key, reiser4_inode_data(inode)->locality_id);
41518+ set_key_ordering(key, get_inode_ordering(inode));
41519+ set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
41520+ set_key_type(key, KEY_BODY_MINOR);
41521+ set_key_offset(key, (__u64) off);
41522+ return 0;
41523+}
41524+
41525+/* this is common implementation of set_plug_in_inode method of file plugin
41526+ */
41527+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
41528+ struct inode *parent /* parent object */ ,
41529+ reiser4_object_create_data * data /* creational
41530+ * data */ )
41531+{
41532+ __u64 mask;
41533+
41534+ object->i_mode = data->mode;
41535+ /* this should be plugin decision */
41536+ object->i_uid = current->fsuid;
41537+ object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
41538+
41539+ /* support for BSD style group-id assignment. See mount's manual page
41540+ description of bsdgroups ext2 mount options for more details */
41541+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
41542+ object->i_gid = parent->i_gid;
41543+ else if (parent->i_mode & S_ISGID) {
41544+ /* parent directory has sguid bit */
41545+ object->i_gid = parent->i_gid;
41546+ if (S_ISDIR(object->i_mode))
41547+ /* sguid is inherited by sub-directories */
41548+ object->i_mode |= S_ISGID;
41549+ } else
41550+ object->i_gid = current->fsgid;
41551+
41552+ /* this object doesn't have stat-data yet */
41553+ reiser4_inode_set_flag(object, REISER4_NO_SD);
41554+#if 0
41555+ /* this is now called after all inode plugins are initialized:
41556+ do_create_vfs_child after adjust_to_parent */
41557+ /* setup inode and file-operations for this inode */
41558+ setup_inode_ops(object, data);
41559+#endif
41560+ object->i_nlink = 0;
41561+ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
41562+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
41563+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
41564+ mask |= (1 << LARGE_TIMES_STAT);
41565+
41566+ reiser4_inode_data(object)->extmask = mask;
41567+ return 0;
41568+}
41569+
41570+/* this is common implementation of adjust_to_parent method of file plugin for
41571+ regular files
41572+ */
41573+int adjust_to_parent_common(struct inode *object /* new object */ ,
41574+ struct inode *parent /* parent directory */ ,
41575+ struct inode *root /* root directory */ )
41576+{
41577+ assert("nikita-2165", object != NULL);
41578+ if (parent == NULL)
41579+ parent = root;
41580+ assert("nikita-2069", parent != NULL);
41581+
41582+ /*
41583+ * inherit missing plugins from parent
41584+ */
41585+
41586+ grab_plugin_pset(object, parent, PSET_FILE);
41587+ grab_plugin_pset(object, parent, PSET_SD);
41588+ grab_plugin_pset(object, parent, PSET_FORMATTING);
41589+ grab_plugin_pset(object, parent, PSET_PERM);
41590+ return 0;
41591+}
41592+
41593+/* this is common implementation of adjust_to_parent method of file plugin for
41594+ typical directories
41595+ */
41596+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
41597+ struct inode *parent /* parent directory */ ,
41598+ struct inode *root /* root directory */ )
41599+{
41600+ int result = 0;
41601+ pset_member memb;
41602+
41603+ assert("nikita-2166", object != NULL);
41604+ if (parent == NULL)
41605+ parent = root;
41606+ assert("nikita-2167", parent != NULL);
41607+
41608+ /*
41609+ * inherit missing plugins from parent
41610+ */
41611+ for (memb = 0; memb < PSET_LAST; ++memb) {
41612+ result = grab_plugin_pset(object, parent, memb);
41613+ if (result != 0)
41614+ break;
41615+ }
41616+ return result;
41617+}
41618+
41619+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
41620+ struct inode *parent /* parent directory */,
41621+ struct inode *root /* root directory */)
41622+{
41623+ int result;
41624+ result = adjust_to_parent_common(object, parent, root);
41625+ if (result)
41626+ return result;
41627+ assert("edward-1416", parent != NULL);
41628+
41629+ grab_plugin_pset(object, parent, PSET_CLUSTER);
41630+ grab_plugin_pset(object, parent, PSET_CIPHER);
41631+ grab_plugin_pset(object, parent, PSET_DIGEST);
41632+ grab_plugin_pset(object, parent, PSET_COMPRESSION);
41633+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
41634+
41635+ return 0;
41636+}
41637+
41638+/* this is common implementation of create_object method of file plugin
41639+ */
41640+int reiser4_create_object_common(struct inode *object, struct inode *parent,
41641+ reiser4_object_create_data * data)
41642+{
41643+ reiser4_block_nr reserve;
41644+ assert("nikita-744", object != NULL);
41645+ assert("nikita-745", parent != NULL);
41646+ assert("nikita-747", data != NULL);
41647+ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
41648+
41649+ reserve = estimate_create_common(object);
41650+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41651+ return RETERR(-ENOSPC);
41652+ return write_sd_by_inode_common(object);
41653+}
41654+
41655+static int common_object_delete_no_reserve(struct inode *inode);
41656+
41657+/**
41658+ * reiser4_delete_object_common - delete_object of file_plugin
41659+ * @inode: inode to be deleted
41660+ *
41661+ * This is common implementation of delete_object method of file_plugin. It
41662+ * applies to object its deletion consists of removing two items - stat data
41663+ * and safe-link.
41664+ */
41665+int reiser4_delete_object_common(struct inode *inode)
41666+{
41667+ int result;
41668+
41669+ assert("nikita-1477", inode != NULL);
41670+ /* FIXME: if file body deletion failed (i/o error, for instance),
41671+ inode->i_size can be != 0 here */
41672+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
41673+ assert("nikita-3421", inode->i_nlink == 0);
41674+
41675+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41676+ reiser4_block_nr reserve;
41677+
41678+ /* grab space which is needed to remove 2 items from the tree:
41679+ stat data and safe-link */
41680+ reserve = 2 *
41681+ estimate_one_item_removal(reiser4_tree_by_inode(inode));
41682+ if (reiser4_grab_space_force(reserve,
41683+ BA_RESERVED | BA_CAN_COMMIT))
41684+ return RETERR(-ENOSPC);
41685+ result = common_object_delete_no_reserve(inode);
41686+ } else
41687+ result = 0;
41688+ return result;
41689+}
41690+
41691+/**
41692+ * reiser4_delete_dir_common - delete_object of file_plugin
41693+ * @inode: inode to be deleted
41694+ *
41695+ * This is common implementation of delete_object method of file_plugin for
41696+ * typical directory. It calls done method of dir_plugin to remove "." and
41697+ * removes stat data and safe-link.
41698+ */
41699+int reiser4_delete_dir_common(struct inode *inode)
41700+{
41701+ int result;
41702+ dir_plugin *dplug;
41703+
41704+ assert("", (get_current_context() &&
41705+ get_current_context()->trans->atom == NULL));
41706+
41707+ dplug = inode_dir_plugin(inode);
41708+ assert("vs-1101", dplug && dplug->done);
41709+
41710+ /* kill cursors which might be attached to inode */
41711+ reiser4_kill_cursors(inode);
41712+
41713+ /* grab space enough for removing two items */
41714+ if (reiser4_grab_space
41715+ (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
41716+ BA_RESERVED | BA_CAN_COMMIT))
41717+ return RETERR(-ENOSPC);
41718+
41719+ result = dplug->done(inode);
41720+ if (!result)
41721+ result = common_object_delete_no_reserve(inode);
41722+ return result;
41723+}
41724+
41725+/* this is common implementation of add_link method of file plugin
41726+ */
41727+int reiser4_add_link_common(struct inode *object, struct inode *parent)
41728+{
41729+ /*
41730+ * increment ->i_nlink and update ->i_ctime
41731+ */
41732+
41733+ INODE_INC_FIELD(object, i_nlink);
41734+ object->i_ctime = CURRENT_TIME;
41735+ return 0;
41736+}
41737+
41738+/* this is common implementation of rem_link method of file plugin
41739+ */
41740+int reiser4_rem_link_common(struct inode *object, struct inode *parent)
41741+{
41742+ assert("nikita-2021", object != NULL);
41743+ assert("nikita-2163", object->i_nlink > 0);
41744+
41745+ /*
41746+ * decrement ->i_nlink and update ->i_ctime
41747+ */
41748+
41749+ INODE_DEC_FIELD(object, i_nlink);
41750+ object->i_ctime = CURRENT_TIME;
41751+ return 0;
41752+}
41753+
41754+/* this is common implementation of rem_link method of file plugin for typical
41755+ directory
41756+*/
41757+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
41758+{
41759+ assert("nikita-20211", object != NULL);
41760+ assert("nikita-21631", object->i_nlink > 0);
41761+
41762+ /*
41763+ * decrement ->i_nlink and update ->i_ctime
41764+ */
41765+ INODE_DEC_FIELD(object, i_nlink);
41766+ if (object->i_nlink == 1)
41767+ INODE_DEC_FIELD(object, i_nlink);
41768+ object->i_ctime = CURRENT_TIME;
41769+ return 0;
41770+}
41771+
41772+/* this is common implementation of owns_item method of file plugin
41773+ compare objectids of keys in inode and coord */
41774+int owns_item_common(const struct inode *inode, /* object to check
41775+ * against */
41776+ const coord_t * coord /* coord to check */ )
41777+{
41778+ reiser4_key item_key;
41779+ reiser4_key file_key;
41780+
41781+ assert("nikita-760", inode != NULL);
41782+ assert("nikita-761", coord != NULL);
41783+
41784+ return coord_is_existing_item(coord) &&
41785+ (get_key_objectid(build_sd_key(inode, &file_key)) ==
41786+ get_key_objectid(item_key_by_coord(coord, &item_key)));
41787+}
41788+
41789+/* this is common implementation of owns_item method of file plugin
41790+ for typical directory
41791+*/
41792+int owns_item_common_dir(const struct inode *inode, /* object to check against */
41793+ const coord_t * coord /* coord of item to check */ )
41794+{
41795+ reiser4_key item_key;
41796+
41797+ assert("nikita-1335", inode != NULL);
41798+ assert("nikita-1334", coord != NULL);
41799+
41800+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
41801+ return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41802+ get_inode_oid(inode);
41803+ else
41804+ return owns_item_common(inode, coord);
41805+}
41806+
41807+/* this is common implementation of can_add_link method of file plugin
41808+ checks whether yet another hard links to this object can be added
41809+*/
41810+int can_add_link_common(const struct inode *object /* object to check */ )
41811+{
41812+ assert("nikita-732", object != NULL);
41813+
41814+ /* inode->i_nlink is unsigned int, so just check for integer
41815+ overflow */
41816+ return object->i_nlink + 1 != 0;
41817+}
41818+
41819+/* this is common implementation of can_rem_link method of file plugin for
41820+ typical directory
41821+*/
41822+int can_rem_link_common_dir(const struct inode *inode)
41823+{
41824+ /* is_dir_empty() returns 0 is dir is empty */
41825+ return !is_dir_empty(inode);
41826+}
41827+
41828+/* this is common implementation of detach method of file plugin for typical
41829+ directory
41830+*/
41831+int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
41832+{
41833+ dir_plugin *dplug;
41834+
41835+ dplug = inode_dir_plugin(child);
41836+ assert("nikita-2883", dplug != NULL);
41837+ assert("nikita-2884", dplug->detach != NULL);
41838+ return dplug->detach(child, parent);
41839+}
41840+
41841+/* this is common implementation of bind method of file plugin for typical
41842+ directory
41843+*/
41844+int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
41845+{
41846+ dir_plugin *dplug;
41847+
41848+ dplug = inode_dir_plugin(child);
41849+ assert("nikita-2646", dplug != NULL);
41850+ return dplug->attach(child, parent);
41851+}
41852+
41853+static int process_truncate(struct inode *, __u64 size);
41854+
41855+/* this is common implementation of safelink method of file plugin
41856+ */
41857+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41858+{
41859+ int result;
41860+
41861+ assert("vs-1705", get_current_context()->trans->atom == NULL);
41862+ if (link == SAFE_UNLINK)
41863+ /* nothing to do. iput() in the caller (process_safelink) will
41864+ * finish with file */
41865+ result = 0;
41866+ else if (link == SAFE_TRUNCATE)
41867+ result = process_truncate(object, value);
41868+ else {
41869+ warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41870+ result = RETERR(-EIO);
41871+ }
41872+ return result;
41873+}
41874+
41875+/* this is common implementation of estimate.create method of file plugin
41876+ can be used when object creation involves insertion of one item (usually stat
41877+ data) into tree
41878+*/
41879+reiser4_block_nr estimate_create_common(const struct inode * object)
41880+{
41881+ return estimate_one_insert_item(reiser4_tree_by_inode(object));
41882+}
41883+
41884+/* this is common implementation of estimate.create method of file plugin for
41885+ typical directory
41886+ can be used when directory creation involves insertion of two items (usually
41887+ stat data and item containing "." and "..") into tree
41888+*/
41889+reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41890+{
41891+ return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
41892+}
41893+
41894+/* this is common implementation of estimate.update method of file plugin
41895+ can be used when stat data update does not do more than inserting a unit
41896+ into a stat data item which is probably true for most cases
41897+*/
41898+reiser4_block_nr estimate_update_common(const struct inode * inode)
41899+{
41900+ return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
41901+}
41902+
41903+/* this is common implementation of estimate.unlink method of file plugin
41904+ */
41905+reiser4_block_nr
41906+estimate_unlink_common(const struct inode * object UNUSED_ARG,
41907+ const struct inode * parent UNUSED_ARG)
41908+{
41909+ return 0;
41910+}
41911+
41912+/* this is common implementation of estimate.unlink method of file plugin for
41913+ typical directory
41914+*/
41915+reiser4_block_nr
41916+estimate_unlink_common_dir(const struct inode * object,
41917+ const struct inode * parent)
41918+{
41919+ dir_plugin *dplug;
41920+
41921+ dplug = inode_dir_plugin(object);
41922+ assert("nikita-2888", dplug != NULL);
41923+ assert("nikita-2887", dplug->estimate.unlink != NULL);
41924+ return dplug->estimate.unlink(object, parent);
41925+}
41926+
41927+char *wire_write_common(struct inode *inode, char *start)
41928+{
41929+ return build_inode_onwire(inode, start);
41930+}
41931+
41932+char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41933+{
41934+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41935+}
41936+
41937+struct dentry *wire_get_common(struct super_block *sb,
41938+ reiser4_object_on_wire * obj)
41939+{
41940+ struct inode *inode;
41941+ struct dentry *dentry;
41942+ reiser4_key key;
41943+
41944+ extract_key_from_id(&obj->u.std.key_id, &key);
41945+ inode = reiser4_iget(sb, &key, 1);
41946+ if (!IS_ERR(inode)) {
41947+ reiser4_iget_complete(inode);
41948+ dentry = d_alloc_anon(inode);
41949+ if (dentry == NULL) {
41950+ iput(inode);
41951+ dentry = ERR_PTR(-ENOMEM);
41952+ } else
41953+ dentry->d_op = &get_super_private(sb)->ops.dentry;
41954+ } else if (PTR_ERR(inode) == -ENOENT)
41955+ /*
41956+ * inode wasn't found at the key encoded in the file
41957+ * handle. Hence, file handle is stale.
41958+ */
41959+ dentry = ERR_PTR(RETERR(-ESTALE));
41960+ else
41961+ dentry = (void *)inode;
41962+ return dentry;
41963+}
41964+
41965+int wire_size_common(struct inode *inode)
41966+{
41967+ return inode_onwire_size(inode);
41968+}
41969+
41970+void wire_done_common(reiser4_object_on_wire * obj)
41971+{
41972+ /* nothing to do */
41973+}
41974+
41975+/* helper function to print errors */
41976+static void key_warning(const reiser4_key * key /* key to print */ ,
41977+ const struct inode *inode,
41978+ int code /* error code to print */ )
41979+{
41980+ assert("nikita-716", key != NULL);
41981+
41982+ if (code != -ENOMEM) {
41983+ warning("nikita-717", "Error for inode %llu (%i)",
41984+ (unsigned long long)get_key_objectid(key), code);
41985+ reiser4_print_key("for key", key);
41986+ }
41987+}
41988+
41989+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41990+#if REISER4_DEBUG
41991+static void
41992+check_inode_seal(const struct inode *inode,
41993+ const coord_t * coord, const reiser4_key * key)
41994+{
41995+ reiser4_key unit_key;
41996+
41997+ unit_key_by_coord(coord, &unit_key);
41998+ assert("nikita-2752",
41999+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
42000+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
42001+}
42002+
42003+static void check_sd_coord(coord_t * coord, const reiser4_key * key)
42004+{
42005+ reiser4_key ukey;
42006+
42007+ coord_clear_iplug(coord);
42008+ if (zload(coord->node))
42009+ return;
42010+
42011+ if (!coord_is_existing_unit(coord) ||
42012+ !item_plugin_by_coord(coord) ||
42013+ !keyeq(unit_key_by_coord(coord, &ukey), key) ||
42014+ (znode_get_level(coord->node) != LEAF_LEVEL) ||
42015+ !item_is_statdata(coord)) {
42016+ warning("nikita-1901", "Conspicuous seal");
42017+ reiser4_print_key("key", key);
42018+ print_coord("coord", coord, 1);
42019+ impossible("nikita-2877", "no way");
42020+ }
42021+ zrelse(coord->node);
42022+}
42023+
42024+#else
42025+#define check_inode_seal(inode, coord, key) noop
42026+#define check_sd_coord(coord, key) noop
42027+#endif
42028+
42029+/* insert new stat-data into tree. Called with inode state
42030+ locked. Return inode state locked. */
42031+static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
42032+{
42033+ int result;
42034+ reiser4_key key;
42035+ coord_t coord;
42036+ reiser4_item_data data;
42037+ char *area;
42038+ reiser4_inode *ref;
42039+ lock_handle lh;
42040+ oid_t oid;
42041+
42042+ assert("nikita-723", inode != NULL);
42043+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
42044+
42045+ ref = reiser4_inode_data(inode);
42046+ spin_lock_inode(inode);
42047+
42048+ if (ref->plugin_mask != 0)
42049+ /* inode has non-standard plugins */
42050+ inode_set_extension(inode, PLUGIN_STAT);
42051+ /*
42052+ * prepare specification of new item to be inserted
42053+ */
42054+
42055+ data.iplug = inode_sd_plugin(inode);
42056+ data.length = data.iplug->s.sd.save_len(inode);
42057+ spin_unlock_inode(inode);
42058+
42059+ data.data = NULL;
42060+ data.user = 0;
42061+/* could be optimized for case where there is only one node format in
42062+ * use in the filesystem, probably there are lots of such
42063+ * places we could optimize for only one node layout.... -Hans */
42064+ if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
42065+ /* This is silly check, but we don't know actual node where
42066+ insertion will go into. */
42067+ return RETERR(-ENAMETOOLONG);
42068+ }
42069+ oid = oid_allocate(inode->i_sb);
42070+/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
42071+ if (oid == ABSOLUTE_MAX_OID)
42072+ return RETERR(-EOVERFLOW);
42073+
42074+ set_inode_oid(inode, oid);
42075+
42076+ coord_init_zero(&coord);
42077+ init_lh(&lh);
42078+
42079+ result = insert_by_key(reiser4_tree_by_inode(inode),
42080+ build_sd_key(inode, &key), &data, &coord, &lh,
42081+ /* stat data lives on a leaf level */
42082+ LEAF_LEVEL, CBK_UNIQUE);
42083+
42084+ /* we don't want to re-check that somebody didn't insert
42085+ stat-data while we were doing io, because if it did,
42086+ insert_by_key() returned error. */
42087+ /* but what _is_ possible is that plugin for inode's stat-data,
42088+ list of non-standard plugins or their state would change
42089+ during io, so that stat-data wouldn't fit into sd. To avoid
42090+ this race we keep inode_state lock. This lock has to be
42091+ taken each time you access inode in a way that would cause
42092+ changes in sd size: changing plugins etc.
42093+ */
42094+
42095+ if (result == IBK_INSERT_OK) {
42096+ coord_clear_iplug(&coord);
42097+ result = zload(coord.node);
42098+ if (result == 0) {
42099+ /* have we really inserted stat data? */
42100+ assert("nikita-725", item_is_statdata(&coord));
42101+
42102+ /* inode was just created. It is inserted into hash
42103+ table, but no directory entry was yet inserted into
42104+ parent. So, inode is inaccessible through
42105+ ->lookup(). All places that directly grab inode
42106+ from hash-table (like old knfsd), should check
42107+ IMMUTABLE flag that is set by common_create_child.
42108+ */
42109+ assert("nikita-3240", data.iplug != NULL);
42110+ assert("nikita-3241", data.iplug->s.sd.save != NULL);
42111+ area = item_body_by_coord(&coord);
42112+ result = data.iplug->s.sd.save(inode, &area);
42113+ znode_make_dirty(coord.node);
42114+ if (result == 0) {
42115+ /* object has stat-data now */
42116+ reiser4_inode_clr_flag(inode, REISER4_NO_SD);
42117+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
42118+ /* initialise stat-data seal */
42119+ reiser4_seal_init(&ref->sd_seal, &coord, &key);
42120+ ref->sd_coord = coord;
42121+ check_inode_seal(inode, &coord, &key);
42122+ } else if (result != -ENOMEM)
42123+ /*
42124+ * convert any other error code to -EIO to
42125+ * avoid confusing user level with unexpected
42126+ * errors.
42127+ */
42128+ result = RETERR(-EIO);
42129+ zrelse(coord.node);
42130+ }
42131+ }
42132+ done_lh(&lh);
42133+
42134+ if (result != 0)
42135+ key_warning(&key, inode, result);
42136+ else
42137+ oid_count_allocated();
42138+
42139+ return result;
42140+}
42141+
42142+/* find sd of inode in a tree, deal with errors */
42143+int lookup_sd(struct inode *inode /* inode to look sd for */ ,
42144+ znode_lock_mode lock_mode /* lock mode */ ,
42145+ coord_t * coord /* resulting coord */ ,
42146+ lock_handle * lh /* resulting lock handle */ ,
42147+ const reiser4_key * key /* resulting key */ ,
42148+ int silent)
42149+{
42150+ int result;
42151+ __u32 flags;
42152+
42153+ assert("nikita-1692", inode != NULL);
42154+ assert("nikita-1693", coord != NULL);
42155+ assert("nikita-1694", key != NULL);
42156+
42157+ /* look for the object's stat data in a tree.
42158+ This returns in "node" pointer to a locked znode and in "pos"
42159+ position of an item found in node. Both are only valid if
42160+ coord_found is returned. */
42161+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
42162+ flags |= CBK_UNIQUE;
42163+ /*
42164+ * traverse tree to find stat data. We cannot use vroot here, because
42165+ * it only covers _body_ of the file, and stat data don't belong
42166+ * there.
42167+ */
42168+ result = coord_by_key(reiser4_tree_by_inode(inode),
42169+ key,
42170+ coord,
42171+ lh,
42172+ lock_mode,
42173+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
42174+ if (REISER4_DEBUG && result == 0)
42175+ check_sd_coord(coord, key);
42176+
42177+ if (result != 0 && !silent)
42178+ key_warning(key, inode, result);
42179+ return result;
42180+}
42181+
42182+static int
42183+locate_inode_sd(struct inode *inode,
42184+ reiser4_key * key, coord_t * coord, lock_handle * lh)
42185+{
42186+ reiser4_inode *state;
42187+ seal_t seal;
42188+ int result;
42189+
42190+ assert("nikita-3483", inode != NULL);
42191+
42192+ state = reiser4_inode_data(inode);
42193+ spin_lock_inode(inode);
42194+ *coord = state->sd_coord;
42195+ coord_clear_iplug(coord);
42196+ seal = state->sd_seal;
42197+ spin_unlock_inode(inode);
42198+
42199+ build_sd_key(inode, key);
42200+ if (reiser4_seal_is_set(&seal)) {
42201+ /* first, try to use seal */
42202+ result = reiser4_seal_validate(&seal,
42203+ coord,
42204+ key,
42205+ lh, ZNODE_WRITE_LOCK,
42206+ ZNODE_LOCK_LOPRI);
42207+ if (result == 0)
42208+ check_sd_coord(coord, key);
42209+ } else
42210+ result = -E_REPEAT;
42211+
42212+ if (result != 0) {
42213+ coord_init_zero(coord);
42214+ result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
42215+ }
42216+ return result;
42217+}
42218+
42219+#if REISER4_DEBUG
42220+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
42221+{
42222+ return (get_key_locality(k1) == get_key_locality(k2) &&
42223+ get_key_type(k1) == get_key_type(k2) &&
42224+ get_key_band(k1) == get_key_band(k2) &&
42225+ get_key_ordering(k1) == get_key_ordering(k2) &&
42226+ get_key_objectid(k1) == get_key_objectid(k2));
42227+}
42228+
42229+#include "../tree_walk.h"
42230+
42231+/* make some checks before and after stat-data resize operation */
42232+static int check_sd_resize(struct inode * inode, coord_t * coord,
42233+ int length, int progress /* 1 means after resize */)
42234+{
42235+ int ret = 0;
42236+ lock_handle left_lock;
42237+ coord_t left_coord;
42238+ reiser4_key left_key;
42239+ reiser4_key key;
42240+
42241+ if (inode_file_plugin(inode) !=
42242+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
42243+ return 0;
42244+ if (!length)
42245+ return 0;
42246+ if (coord->item_pos != 0)
42247+ return 0;
42248+
42249+ init_lh(&left_lock);
42250+ ret = reiser4_get_left_neighbor(&left_lock,
42251+ coord->node,
42252+ ZNODE_WRITE_LOCK,
42253+ GN_CAN_USE_UPPER_LEVELS);
42254+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
42255+ ret == -ENOENT || ret == -EINVAL
42256+ || ret == -E_DEADLOCK) {
42257+ ret = 0;
42258+ goto exit;
42259+ }
42260+ ret = zload(left_lock.node);
42261+ if (ret)
42262+ goto exit;
42263+ coord_init_last_unit(&left_coord, left_lock.node);
42264+ item_key_by_coord(&left_coord, &left_key);
42265+ item_key_by_coord(coord, &key);
42266+
42267+ if (all_but_offset_key_eq(&key, &left_key))
42268+ /* corruption occured */
42269+ ret = 1;
42270+ zrelse(left_lock.node);
42271+ exit:
42272+ done_lh(&left_lock);
42273+ return ret;
42274+}
42275+#endif
42276+
42277+/* update stat-data at @coord */
42278+static int
42279+update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
42280+ lock_handle * lh)
42281+{
42282+ int result;
42283+ reiser4_item_data data;
42284+ char *area;
42285+ reiser4_inode *state;
42286+ znode *loaded;
42287+
42288+ state = reiser4_inode_data(inode);
42289+
42290+ coord_clear_iplug(coord);
42291+ result = zload(coord->node);
42292+ if (result != 0)
42293+ return result;
42294+ loaded = coord->node;
42295+
42296+ spin_lock_inode(inode);
42297+ assert("nikita-728", inode_sd_plugin(inode) != NULL);
42298+ data.iplug = inode_sd_plugin(inode);
42299+
42300+ /* if inode has non-standard plugins, add appropriate stat data
42301+ * extension */
42302+ if (state->extmask & (1 << PLUGIN_STAT)) {
42303+ if (state->plugin_mask == 0)
42304+ inode_clr_extension(inode, PLUGIN_STAT);
42305+ } else if (state->plugin_mask != 0)
42306+ inode_set_extension(inode, PLUGIN_STAT);
42307+
42308+ if (state->extmask & (1 << HEIR_STAT)) {
42309+ if (state->heir_mask == 0)
42310+ inode_clr_extension(inode, HEIR_STAT);
42311+ } else if (state->heir_mask != 0)
42312+ inode_set_extension(inode, HEIR_STAT);
42313+
42314+ /* data.length is how much space to add to (or remove
42315+ from if negative) sd */
42316+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
42317+ /* recalculate stat-data length */
42318+ data.length =
42319+ data.iplug->s.sd.save_len(inode) -
42320+ item_length_by_coord(coord);
42321+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
42322+ } else
42323+ data.length = 0;
42324+ spin_unlock_inode(inode);
42325+
42326+ /* if on-disk stat data is of different length than required
42327+ for this inode, resize it */
42328+
42329+ if (data.length != 0) {
42330+ data.data = NULL;
42331+ data.user = 0;
42332+
42333+ assert("edward-1441",
42334+ !check_sd_resize(inode, coord,
42335+ data.length, 0/* before resize */));
42336+
42337+ /* insertion code requires that insertion point (coord) was
42338+ * between units. */
42339+ coord->between = AFTER_UNIT;
42340+ result = reiser4_resize_item(coord, &data, key, lh,
42341+ COPI_DONT_SHIFT_LEFT);
42342+ if (result != 0) {
42343+ key_warning(key, inode, result);
42344+ zrelse(loaded);
42345+ return result;
42346+ }
42347+ if (loaded != coord->node) {
42348+ /* reiser4_resize_item moved coord to another node.
42349+ Zload it */
42350+ zrelse(loaded);
42351+ coord_clear_iplug(coord);
42352+ result = zload(coord->node);
42353+ if (result != 0)
42354+ return result;
42355+ loaded = coord->node;
42356+ }
42357+ assert("edward-1442",
42358+ !check_sd_resize(inode, coord,
42359+ data.length, 1/* after resize */));
42360+ }
42361+ area = item_body_by_coord(coord);
42362+ spin_lock_inode(inode);
42363+ result = data.iplug->s.sd.save(inode, &area);
42364+ znode_make_dirty(coord->node);
42365+
42366+ /* re-initialise stat-data seal */
42367+
42368+ /*
42369+ * coord.between was possibly skewed from AT_UNIT when stat-data size
42370+ * was changed and new extensions were pasted into item.
42371+ */
42372+ coord->between = AT_UNIT;
42373+ reiser4_seal_init(&state->sd_seal, coord, key);
42374+ state->sd_coord = *coord;
42375+ spin_unlock_inode(inode);
42376+ check_inode_seal(inode, coord, key);
42377+ zrelse(loaded);
42378+ return result;
42379+}
42380+
42381+/* Update existing stat-data in a tree. Called with inode state locked. Return
42382+ inode state locked. */
42383+static int update_sd(struct inode *inode /* inode to update sd for */ )
42384+{
42385+ int result;
42386+ reiser4_key key;
42387+ coord_t coord;
42388+ lock_handle lh;
42389+
42390+ assert("nikita-726", inode != NULL);
42391+
42392+ /* no stat-data, nothing to update?! */
42393+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
42394+
42395+ init_lh(&lh);
42396+
42397+ result = locate_inode_sd(inode, &key, &coord, &lh);
42398+ if (result == 0)
42399+ result = update_sd_at(inode, &coord, &key, &lh);
42400+ done_lh(&lh);
42401+
42402+ return result;
42403+}
42404+
42405+/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
42406+ Remove object stat data. Space for that must be reserved by caller before
42407+*/
42408+static int
42409+common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
42410+{
42411+ int result;
42412+
42413+ assert("nikita-1477", inode != NULL);
42414+
42415+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
42416+ reiser4_key sd_key;
42417+
42418+ DQUOT_FREE_INODE(inode);
42419+ DQUOT_DROP(inode);
42420+
42421+ build_sd_key(inode, &sd_key);
42422+ result =
42423+ reiser4_cut_tree(reiser4_tree_by_inode(inode),
42424+ &sd_key, &sd_key, NULL, 0);
42425+ if (result == 0) {
42426+ reiser4_inode_set_flag(inode, REISER4_NO_SD);
42427+ result = oid_release(inode->i_sb, get_inode_oid(inode));
42428+ if (result == 0) {
42429+ oid_count_released();
42430+
42431+ result = safe_link_del(reiser4_tree_by_inode(inode),
42432+ get_inode_oid(inode),
42433+ SAFE_UNLINK);
42434+ }
42435+ }
42436+ } else
42437+ result = 0;
42438+ return result;
42439+}
42440+
42441+/* helper for safelink_common */
42442+static int process_truncate(struct inode *inode, __u64 size)
42443+{
42444+ int result;
42445+ struct iattr attr;
42446+ file_plugin *fplug;
42447+ reiser4_context *ctx;
42448+ struct dentry dentry;
42449+
42450+ assert("vs-21", is_in_reiser4_context());
42451+ ctx = reiser4_init_context(inode->i_sb);
42452+ assert("vs-22", !IS_ERR(ctx));
42453+
42454+ attr.ia_size = size;
42455+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
42456+ fplug = inode_file_plugin(inode);
42457+
42458+ mutex_lock(&inode->i_mutex);
42459+ assert("vs-1704", get_current_context()->trans->atom == NULL);
42460+ dentry.d_inode = inode;
42461+ result = inode->i_op->setattr(&dentry, &attr);
42462+ mutex_unlock(&inode->i_mutex);
42463+
42464+ context_set_commit_async(ctx);
42465+ reiser4_exit_context(ctx);
42466+
42467+ return result;
42468+}
42469+
42470+/*
42471+ Local variables:
42472+ c-indentation-style: "K&R"
42473+ mode-name: "LC"
42474+ c-basic-offset: 8
42475+ tab-width: 8
42476+ fill-column: 80
42477+ scroll-step: 1
42478+ End:
42479+*/
42480diff --git a/fs/reiser4/plugin/hash.c b/fs/reiser4/plugin/hash.c
42481new file mode 100644
42482index 0000000..70f1e40
42483--- /dev/null
42484+++ b/fs/reiser4/plugin/hash.c
42485@@ -0,0 +1,353 @@
42486+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
42487+ * reiser4/README */
42488+
42489+/* Hash functions */
42490+
42491+#include "../debug.h"
42492+#include "plugin_header.h"
42493+#include "plugin.h"
42494+#include "../super.h"
42495+#include "../inode.h"
42496+
42497+#include <linux/types.h>
42498+
42499+/* old rupasov (yura) hash */
42500+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
42501+ int len /* @name's length */ )
42502+{
42503+ int i;
42504+ int j;
42505+ int pow;
42506+ __u64 a;
42507+ __u64 c;
42508+
42509+ assert("nikita-672", name != NULL);
42510+ assert("nikita-673", len >= 0);
42511+
42512+ for (pow = 1, i = 1; i < len; ++i)
42513+ pow = pow * 10;
42514+
42515+ if (len == 1)
42516+ a = name[0] - 48;
42517+ else
42518+ a = (name[0] - 48) * pow;
42519+
42520+ for (i = 1; i < len; ++i) {
42521+ c = name[i] - 48;
42522+ for (pow = 1, j = i; j < len - 1; ++j)
42523+ pow = pow * 10;
42524+ a = a + c * pow;
42525+ }
42526+ for (; i < 40; ++i) {
42527+ c = '0' - 48;
42528+ for (pow = 1, j = i; j < len - 1; ++j)
42529+ pow = pow * 10;
42530+ a = a + c * pow;
42531+ }
42532+
42533+ for (; i < 256; ++i) {
42534+ c = i;
42535+ for (pow = 1, j = i; j < len - 1; ++j)
42536+ pow = pow * 10;
42537+ a = a + c * pow;
42538+ }
42539+
42540+ a = a << 7;
42541+ return a;
42542+}
42543+
42544+/* r5 hash */
42545+static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
42546+ int len UNUSED_ARG /* @name's length */ )
42547+{
42548+ __u64 a = 0;
42549+
42550+ assert("nikita-674", name != NULL);
42551+ assert("nikita-675", len >= 0);
42552+
42553+ while (*name) {
42554+ a += *name << 4;
42555+ a += *name >> 4;
42556+ a *= 11;
42557+ name++;
42558+ }
42559+ return a;
42560+}
42561+
42562+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
42563+ H0 = Key
42564+ Hi = E Mi(Hi-1) + Hi-1
42565+
42566+ (see Applied Cryptography, 2nd edition, p448).
42567+
42568+ Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
42569+
42570+ Jeremy has agreed to the contents of reiserfs/README. -Hans
42571+
42572+ This code was blindly upgraded to __u64 by s/__u32/__u64/g.
42573+*/
42574+static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
42575+ int len /* @name's length */ )
42576+{
42577+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
42578+
42579+ __u64 h0 = k[0], h1 = k[1];
42580+ __u64 a, b, c, d;
42581+ __u64 pad;
42582+ int i;
42583+
42584+ assert("nikita-676", name != NULL);
42585+ assert("nikita-677", len >= 0);
42586+
42587+#define DELTA 0x9E3779B9u
42588+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
42589+#define PARTROUNDS 6 /* 6 gets complete mixing */
42590+
42591+/* a, b, c, d - data; h0, h1 - accumulated hash */
42592+#define TEACORE(rounds) \
42593+ do { \
42594+ __u64 sum = 0; \
42595+ int n = rounds; \
42596+ __u64 b0, b1; \
42597+ \
42598+ b0 = h0; \
42599+ b1 = h1; \
42600+ \
42601+ do \
42602+ { \
42603+ sum += DELTA; \
42604+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
42605+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
42606+ } while(--n); \
42607+ \
42608+ h0 += b0; \
42609+ h1 += b1; \
42610+ } while(0)
42611+
42612+ pad = (__u64) len | ((__u64) len << 8);
42613+ pad |= pad << 16;
42614+
42615+ while (len >= 16) {
42616+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42617+ 16 | (__u64) name[3] << 24;
42618+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42619+ 16 | (__u64) name[7] << 24;
42620+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42621+ 16 | (__u64) name[11] << 24;
42622+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
42623+ << 16 | (__u64) name[15] << 24;
42624+
42625+ TEACORE(PARTROUNDS);
42626+
42627+ len -= 16;
42628+ name += 16;
42629+ }
42630+
42631+ if (len >= 12) {
42632+ //assert(len < 16);
42633+ if (len >= 16)
42634+ *(int *)0 = 0;
42635+
42636+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42637+ 16 | (__u64) name[3] << 24;
42638+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42639+ 16 | (__u64) name[7] << 24;
42640+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42641+ 16 | (__u64) name[11] << 24;
42642+
42643+ d = pad;
42644+ for (i = 12; i < len; i++) {
42645+ d <<= 8;
42646+ d |= name[i];
42647+ }
42648+ } else if (len >= 8) {
42649+ //assert(len < 12);
42650+ if (len >= 12)
42651+ *(int *)0 = 0;
42652+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42653+ 16 | (__u64) name[3] << 24;
42654+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42655+ 16 | (__u64) name[7] << 24;
42656+
42657+ c = d = pad;
42658+ for (i = 8; i < len; i++) {
42659+ c <<= 8;
42660+ c |= name[i];
42661+ }
42662+ } else if (len >= 4) {
42663+ //assert(len < 8);
42664+ if (len >= 8)
42665+ *(int *)0 = 0;
42666+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42667+ 16 | (__u64) name[3] << 24;
42668+
42669+ b = c = d = pad;
42670+ for (i = 4; i < len; i++) {
42671+ b <<= 8;
42672+ b |= name[i];
42673+ }
42674+ } else {
42675+ //assert(len < 4);
42676+ if (len >= 4)
42677+ *(int *)0 = 0;
42678+ a = b = c = d = pad;
42679+ for (i = 0; i < len; i++) {
42680+ a <<= 8;
42681+ a |= name[i];
42682+ }
42683+ }
42684+
42685+ TEACORE(FULLROUNDS);
42686+
42687+/* return 0;*/
42688+ return h0 ^ h1;
42689+
42690+}
42691+
42692+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
42693+
42694+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
42695+
42696+ Excerpts:
42697+
42698+ FNV hashes are designed to be fast while maintaining a low collision
42699+ rate.
42700+
42701+ [This version also seems to preserve lexicographical order locally.]
42702+
42703+ FNV hash algorithms and source code have been released into the public
42704+ domain.
42705+
42706+*/
42707+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
42708+ int len UNUSED_ARG /* @name's length */ )
42709+{
42710+ unsigned long long a = 0xcbf29ce484222325ull;
42711+ const unsigned long long fnv_64_prime = 0x100000001b3ull;
42712+
42713+ assert("nikita-678", name != NULL);
42714+ assert("nikita-679", len >= 0);
42715+
42716+ /* FNV-1 hash each octet in the buffer */
42717+ for (; *name; ++name) {
42718+ /* multiply by the 32 bit FNV magic prime mod 2^64 */
42719+ a *= fnv_64_prime;
42720+ /* xor the bottom with the current octet */
42721+ a ^= (unsigned long long)(*name);
42722+ }
42723+ /* return our new hash value */
42724+ return a;
42725+}
42726+
42727+/* degenerate hash function used to simplify testing of non-unique key
42728+ handling */
42729+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
42730+ int len UNUSED_ARG /* @name's length */ )
42731+{
42732+ return 0xc0c0c0c010101010ull;
42733+}
42734+
42735+static int change_hash(struct inode *inode,
42736+ reiser4_plugin * plugin,
42737+ pset_member memb)
42738+{
42739+ int result;
42740+
42741+ assert("nikita-3503", inode != NULL);
42742+ assert("nikita-3504", plugin != NULL);
42743+
42744+ assert("nikita-3505", is_reiser4_inode(inode));
42745+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
42746+
42747+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
42748+ return RETERR(-EINVAL);
42749+
42750+ result = 0;
42751+ if (inode_hash_plugin(inode) == NULL ||
42752+ inode_hash_plugin(inode)->h.id != plugin->h.id) {
42753+ if (is_dir_empty(inode) == 0)
42754+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
42755+ PSET_HASH, plugin);
42756+ else
42757+ result = RETERR(-ENOTEMPTY);
42758+
42759+ }
42760+ return result;
42761+}
42762+
42763+static reiser4_plugin_ops hash_plugin_ops = {
42764+ .init = NULL,
42765+ .load = NULL,
42766+ .save_len = NULL,
42767+ .save = NULL,
42768+ .change = change_hash
42769+};
42770+
42771+/* hash plugins */
42772+hash_plugin hash_plugins[LAST_HASH_ID] = {
42773+ [RUPASOV_HASH_ID] = {
42774+ .h = {
42775+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42776+ .id = RUPASOV_HASH_ID,
42777+ .pops = &hash_plugin_ops,
42778+ .label = "rupasov",
42779+ .desc = "Original Yura's hash",
42780+ .linkage = {NULL, NULL}
42781+ },
42782+ .hash = hash_rupasov
42783+ },
42784+ [R5_HASH_ID] = {
42785+ .h = {
42786+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42787+ .id = R5_HASH_ID,
42788+ .pops = &hash_plugin_ops,
42789+ .label = "r5",
42790+ .desc = "r5 hash",
42791+ .linkage = {NULL, NULL}
42792+ },
42793+ .hash = hash_r5
42794+ },
42795+ [TEA_HASH_ID] = {
42796+ .h = {
42797+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42798+ .id = TEA_HASH_ID,
42799+ .pops = &hash_plugin_ops,
42800+ .label = "tea",
42801+ .desc = "tea hash",
42802+ .linkage = {NULL, NULL}
42803+ },
42804+ .hash = hash_tea
42805+ },
42806+ [FNV1_HASH_ID] = {
42807+ .h = {
42808+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42809+ .id = FNV1_HASH_ID,
42810+ .pops = &hash_plugin_ops,
42811+ .label = "fnv1",
42812+ .desc = "fnv1 hash",
42813+ .linkage = {NULL, NULL}
42814+ },
42815+ .hash = hash_fnv1
42816+ },
42817+ [DEGENERATE_HASH_ID] = {
42818+ .h = {
42819+ .type_id = REISER4_HASH_PLUGIN_TYPE,
42820+ .id = DEGENERATE_HASH_ID,
42821+ .pops = &hash_plugin_ops,
42822+ .label = "degenerate hash",
42823+ .desc = "Degenerate hash: only for testing",
42824+ .linkage = {NULL, NULL}
42825+ },
42826+ .hash = hash_deg
42827+ }
42828+};
42829+
42830+/* Make Linus happy.
42831+ Local variables:
42832+ c-indentation-style: "K&R"
42833+ mode-name: "LC"
42834+ c-basic-offset: 8
42835+ tab-width: 8
42836+ fill-column: 120
42837+ End:
42838+*/
42839diff --git a/fs/reiser4/plugin/inode_ops.c b/fs/reiser4/plugin/inode_ops.c
42840new file mode 100644
42841index 0000000..48430f7
42842--- /dev/null
42843+++ b/fs/reiser4/plugin/inode_ops.c
42844@@ -0,0 +1,897 @@
42845+/*
42846+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42847+ */
42848+
42849+/*
42850+ * this file contains typical implementations for most of methods of struct
42851+ * inode_operations
42852+ */
42853+
42854+#include "../inode.h"
42855+#include "../safe_link.h"
42856+
42857+#include <linux/quotaops.h>
42858+#include <linux/namei.h>
42859+
42860+static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42861+ reiser4_object_create_data *data);
42862+
42863+/**
42864+ * reiser4_create_common - create of inode operations
42865+ * @parent: inode of parent directory
42866+ * @dentry: dentry of new object to create
42867+ * @mode: the permissions to use
42868+ * @nameidata:
42869+ *
42870+ * This is common implementation of vfs's create method of struct
42871+ * inode_operations.
42872+ * Creates regular file using file plugin from parent directory plugin set.
42873+ */
42874+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
42875+ int mode, struct nameidata *nameidata)
42876+{
42877+ reiser4_object_create_data data;
42878+ file_plugin *fplug;
42879+
42880+ memset(&data, 0, sizeof data);
42881+ data.mode = S_IFREG | mode;
42882+ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
42883+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
42884+ warning("vpf-1900", "'%s' is not a regular file plugin.",
42885+ fplug->h.label);
42886+ return RETERR(-EIO);
42887+ }
42888+ data.id = fplug->h.id;
42889+ return create_vfs_object(parent, dentry, &data);
42890+}
42891+
42892+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42893+void check_light_weight(struct inode *inode, struct inode *parent);
42894+
42895+/**
42896+ * reiser4_lookup_common - lookup of inode operations
42897+ * @parent: inode of directory to lookup into
42898+ * @dentry: name to look for
42899+ * @nameidata:
42900+ *
42901+ * This is common implementation of vfs's lookup method of struct
42902+ * inode_operations.
42903+ */
42904+struct dentry *reiser4_lookup_common(struct inode *parent,
42905+ struct dentry *dentry,
42906+ struct nameidata *nameidata)
42907+{
42908+ reiser4_context *ctx;
42909+ int result;
42910+ struct dentry *new;
42911+ struct inode *inode;
42912+ reiser4_dir_entry_desc entry;
42913+
42914+ ctx = reiser4_init_context(parent->i_sb);
42915+ if (IS_ERR(ctx))
42916+ return (struct dentry *)ctx;
42917+
42918+ /* set up operations on dentry. */
42919+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42920+
42921+ result = reiser4_lookup_name(parent, dentry, &entry.key);
42922+ if (result) {
42923+ context_set_commit_async(ctx);
42924+ reiser4_exit_context(ctx);
42925+ if (result == -ENOENT) {
42926+ /* object not found */
42927+ if (!IS_DEADDIR(parent))
42928+ d_add(dentry, NULL);
42929+ return NULL;
42930+ }
42931+ return ERR_PTR(result);
42932+ }
42933+
42934+ inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42935+ if (IS_ERR(inode)) {
42936+ context_set_commit_async(ctx);
42937+ reiser4_exit_context(ctx);
42938+ return ERR_PTR(PTR_ERR(inode));
42939+ }
42940+
42941+ /* success */
42942+ check_light_weight(inode, parent);
42943+ new = d_splice_alias(inode, dentry);
42944+ reiser4_iget_complete(inode);
42945+
42946+ /* prevent balance_dirty_pages() from being called: we don't want to
42947+ * do this under directory i_mutex. */
42948+ context_set_commit_async(ctx);
42949+ reiser4_exit_context(ctx);
42950+ return new;
42951+}
42952+
42953+static reiser4_block_nr common_estimate_link(struct inode *parent,
42954+ struct inode *object);
42955+int reiser4_update_dir(struct inode *);
42956+
42957+/**
42958+ * reiser4_link_common - link of inode operations
42959+ * @existing: dentry of object which is to get new name
42960+ * @parent: directory where new name is to be created
42961+ * @newname: new name
42962+ *
42963+ * This is common implementation of vfs's link method of struct
42964+ * inode_operations.
42965+ */
42966+int reiser4_link_common(struct dentry *existing, struct inode *parent,
42967+ struct dentry *newname)
42968+{
42969+ reiser4_context *ctx;
42970+ int result;
42971+ struct inode *object;
42972+ dir_plugin *parent_dplug;
42973+ reiser4_dir_entry_desc entry;
42974+ reiser4_object_create_data data;
42975+ reiser4_block_nr reserve;
42976+
42977+ ctx = reiser4_init_context(parent->i_sb);
42978+ if (IS_ERR(ctx))
42979+ return PTR_ERR(ctx);
42980+
42981+ assert("nikita-1431", existing != NULL);
42982+ assert("nikita-1432", parent != NULL);
42983+ assert("nikita-1433", newname != NULL);
42984+
42985+ object = existing->d_inode;
42986+ assert("nikita-1434", object != NULL);
42987+
42988+ /* check for race with create_object() */
42989+ if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
42990+ context_set_commit_async(ctx);
42991+ reiser4_exit_context(ctx);
42992+ return RETERR(-E_REPEAT);
42993+ }
42994+
42995+ parent_dplug = inode_dir_plugin(parent);
42996+
42997+ memset(&entry, 0, sizeof entry);
42998+ entry.obj = object;
42999+
43000+ data.mode = object->i_mode;
43001+ data.id = inode_file_plugin(object)->h.id;
43002+
43003+ reserve = common_estimate_link(parent, existing->d_inode);
43004+ if ((__s64) reserve < 0) {
43005+ context_set_commit_async(ctx);
43006+ reiser4_exit_context(ctx);
43007+ return reserve;
43008+ }
43009+
43010+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
43011+ context_set_commit_async(ctx);
43012+ reiser4_exit_context(ctx);
43013+ return RETERR(-ENOSPC);
43014+ }
43015+
43016+ /*
43017+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
43018+ * means that link(2) can race against unlink(2) or rename(2), and
43019+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
43020+ *
43021+ * For such inode we have to undo special processing done in
43022+ * reiser4_unlink() viz. creation of safe-link.
43023+ */
43024+ if (unlikely(object->i_nlink == 0)) {
43025+ result = safe_link_del(reiser4_tree_by_inode(object),
43026+ get_inode_oid(object), SAFE_UNLINK);
43027+ if (result != 0) {
43028+ context_set_commit_async(ctx);
43029+ reiser4_exit_context(ctx);
43030+ return result;
43031+ }
43032+ }
43033+
43034+ /* increment nlink of @existing and update its stat data */
43035+ result = reiser4_add_nlink(object, parent, 1);
43036+ if (result == 0) {
43037+ /* add entry to the parent */
43038+ result =
43039+ parent_dplug->add_entry(parent, newname, &data, &entry);
43040+ if (result != 0) {
43041+ /* failed to add entry to the parent, decrement nlink
43042+ of @existing */
43043+ reiser4_del_nlink(object, parent, 1);
43044+ /*
43045+ * now, if that failed, we have a file with too big
43046+ * nlink---space leak, much better than directory
43047+ * entry pointing to nowhere
43048+ */
43049+ }
43050+ }
43051+ if (result == 0) {
43052+ atomic_inc(&object->i_count);
43053+ /*
43054+ * Upon successful completion, link() shall mark for update
43055+ * the st_ctime field of the file. Also, the st_ctime and
43056+ * st_mtime fields of the directory that contains the new
43057+ * entry shall be marked for update. --SUS
43058+ */
43059+ result = reiser4_update_dir(parent);
43060+ }
43061+ if (result == 0)
43062+ d_instantiate(newname, existing->d_inode);
43063+
43064+ context_set_commit_async(ctx);
43065+ reiser4_exit_context(ctx);
43066+ return result;
43067+}
43068+
43069+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
43070+
43071+/**
43072+ * reiser4_unlink_common - unlink of inode operations
43073+ * @parent: inode of directory to remove name from
43074+ * @victim: name to be removed
43075+ *
43076+ * This is common implementation of vfs's unlink method of struct
43077+ * inode_operations.
43078+ */
43079+int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
43080+{
43081+ reiser4_context *ctx;
43082+ int result;
43083+ struct inode *object;
43084+ file_plugin *fplug;
43085+
43086+ ctx = reiser4_init_context(parent->i_sb);
43087+ if (IS_ERR(ctx))
43088+ return PTR_ERR(ctx);
43089+
43090+ object = victim->d_inode;
43091+ fplug = inode_file_plugin(object);
43092+ assert("nikita-2882", fplug->detach != NULL);
43093+
43094+ result = unlink_check_and_grab(parent, victim);
43095+ if (result != 0) {
43096+ context_set_commit_async(ctx);
43097+ reiser4_exit_context(ctx);
43098+ return result;
43099+ }
43100+
43101+ result = fplug->detach(object, parent);
43102+ if (result == 0) {
43103+ dir_plugin *parent_dplug;
43104+ reiser4_dir_entry_desc entry;
43105+
43106+ parent_dplug = inode_dir_plugin(parent);
43107+ memset(&entry, 0, sizeof entry);
43108+
43109+ /* first, delete directory entry */
43110+ result = parent_dplug->rem_entry(parent, victim, &entry);
43111+ if (result == 0) {
43112+ /*
43113+ * if name was removed successfully, we _have_ to
43114+ * return 0 from this function, because upper level
43115+ * caller (vfs_{rmdir,unlink}) expect this.
43116+ *
43117+ * now that directory entry is removed, update
43118+ * stat-data
43119+ */
43120+ reiser4_del_nlink(object, parent, 1);
43121+ /*
43122+ * Upon successful completion, unlink() shall mark for
43123+ * update the st_ctime and st_mtime fields of the
43124+ * parent directory. Also, if the file's link count is
43125+ * not 0, the st_ctime field of the file shall be
43126+ * marked for update. --SUS
43127+ */
43128+ reiser4_update_dir(parent);
43129+ /* add safe-link for this file */
43130+ if (object->i_nlink == 0)
43131+ safe_link_add(object, SAFE_UNLINK);
43132+ }
43133+ }
43134+
43135+ if (unlikely(result != 0)) {
43136+ if (result != -ENOMEM)
43137+ warning("nikita-3398", "Cannot unlink %llu (%i)",
43138+ (unsigned long long)get_inode_oid(object),
43139+ result);
43140+ /* if operation failed commit pending inode modifications to
43141+ * the stat-data */
43142+ reiser4_update_sd(object);
43143+ reiser4_update_sd(parent);
43144+ }
43145+
43146+ reiser4_release_reserved(object->i_sb);
43147+
43148+ /* @object's i_ctime was updated by the ->rem_link() method. */
43149+
43150+ /* @victim can be already removed from the disk by this time. Inode is
43151+ then marked so that iput() wouldn't try to remove stat data. But
43152+ inode itself is still there.
43153+ */
43154+
43155+ /*
43156+ * we cannot release directory semaphore here, because name has
43157+ * already been deleted, but dentry (@victim) still exists. Prevent
43158+ * balance_dirty_pages() from being called on exiting this context: we
43159+ * don't want to do this under directory i_mutex.
43160+ */
43161+ context_set_commit_async(ctx);
43162+ reiser4_exit_context(ctx);
43163+ return result;
43164+}
43165+
43166+/**
43167+ * reiser4_symlink_common - symlink of inode operations
43168+ * @parent: inode of parent directory
43169+ * @dentry: dentry of object to be created
43170+ * @linkname: string symlink is to contain
43171+ *
43172+ * This is common implementation of vfs's symlink method of struct
43173+ * inode_operations.
43174+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
43175+ */
43176+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
43177+ const char *linkname)
43178+{
43179+ reiser4_object_create_data data;
43180+
43181+ memset(&data, 0, sizeof data);
43182+ data.name = linkname;
43183+ data.id = SYMLINK_FILE_PLUGIN_ID;
43184+ data.mode = S_IFLNK | S_IRWXUGO;
43185+ return create_vfs_object(parent, dentry, &data);
43186+}
43187+
43188+/**
43189+ * reiser4_mkdir_common - mkdir of inode operations
43190+ * @parent: inode of parent directory
43191+ * @dentry: dentry of object to be created
43192+ * @mode: the permissions to use
43193+ *
43194+ * This is common implementation of vfs's mkdir method of struct
43195+ * inode_operations.
43196+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
43197+ */
43198+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
43199+{
43200+ reiser4_object_create_data data;
43201+
43202+ memset(&data, 0, sizeof data);
43203+ data.mode = S_IFDIR | mode;
43204+ data.id = DIRECTORY_FILE_PLUGIN_ID;
43205+ return create_vfs_object(parent, dentry, &data);
43206+}
43207+
43208+/**
43209+ * reiser4_mknod_common - mknod of inode operations
43210+ * @parent: inode of parent directory
43211+ * @dentry: dentry of object to be created
43212+ * @mode: the permissions to use and file type
43213+ * @rdev: minor and major of new device file
43214+ *
43215+ * This is common implementation of vfs's mknod method of struct
43216+ * inode_operations.
43217+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
43218+ */
43219+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
43220+ int mode, dev_t rdev)
43221+{
43222+ reiser4_object_create_data data;
43223+
43224+ memset(&data, 0, sizeof data);
43225+ data.mode = mode;
43226+ data.rdev = rdev;
43227+ data.id = SPECIAL_FILE_PLUGIN_ID;
43228+ return create_vfs_object(parent, dentry, &data);
43229+}
43230+
43231+/*
43232+ * implementation of vfs's rename method of struct inode_operations for typical
43233+ * directory is in inode_ops_rename.c
43234+ */
43235+
43236+/**
43237+ * reiser4_follow_link_common - follow_link of inode operations
43238+ * @dentry: dentry of symlink
43239+ * @data:
43240+ *
43241+ * This is common implementation of vfs's followlink method of struct
43242+ * inode_operations.
43243+ * Assumes that inode's i_private points to the content of symbolic link.
43244+ */
43245+void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
43246+{
43247+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
43248+
43249+ if (!dentry->d_inode->i_private
43250+ || !reiser4_inode_get_flag(dentry->d_inode,
43251+ REISER4_GENERIC_PTR_USED))
43252+ return ERR_PTR(RETERR(-EINVAL));
43253+ nd_set_link(nd, dentry->d_inode->i_private);
43254+ return NULL;
43255+}
43256+
43257+/**
43258+ * reiser4_permission_common - permission of inode operations
43259+ * @inode: inode to check permissions for
43260+ * @mask: mode bits to check permissions for
43261+ * @nameidata:
43262+ *
43263+ * Uses generic function to check for rwx permissions.
43264+ */
43265+int reiser4_permission_common(struct inode *inode, int mask,
43266+ struct nameidata *nameidata)
43267+{
43268+ return generic_permission(inode, mask, NULL);
43269+}
43270+
43271+static int setattr_reserve(reiser4_tree *);
43272+
43273+/* this is common implementation of vfs's setattr method of struct
43274+ inode_operations
43275+*/
43276+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
43277+{
43278+ reiser4_context *ctx;
43279+ struct inode *inode;
43280+ int result;
43281+
43282+ inode = dentry->d_inode;
43283+ result = inode_change_ok(inode, attr);
43284+ if (result)
43285+ return result;
43286+
43287+ ctx = reiser4_init_context(inode->i_sb);
43288+ if (IS_ERR(ctx))
43289+ return PTR_ERR(ctx);
43290+
43291+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
43292+
43293+ /*
43294+ * grab disk space and call standard inode_setattr().
43295+ */
43296+ result = setattr_reserve(reiser4_tree_by_inode(inode));
43297+ if (!result) {
43298+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
43299+ || (attr->ia_valid & ATTR_GID
43300+ && attr->ia_gid != inode->i_gid)) {
43301+ result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
43302+ if (result) {
43303+ context_set_commit_async(ctx);
43304+ reiser4_exit_context(ctx);
43305+ return result;
43306+ }
43307+ }
43308+ result = inode_setattr(inode, attr);
43309+ if (!result)
43310+ reiser4_update_sd(inode);
43311+ }
43312+
43313+ context_set_commit_async(ctx);
43314+ reiser4_exit_context(ctx);
43315+ return result;
43316+}
43317+
43318+/* this is common implementation of vfs's getattr method of struct
43319+ inode_operations
43320+*/
43321+int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
43322+ struct dentry *dentry, struct kstat *stat)
43323+{
43324+ struct inode *obj;
43325+
43326+ assert("nikita-2298", dentry != NULL);
43327+ assert("nikita-2299", stat != NULL);
43328+ assert("nikita-2300", dentry->d_inode != NULL);
43329+
43330+ obj = dentry->d_inode;
43331+
43332+ stat->dev = obj->i_sb->s_dev;
43333+ stat->ino = oid_to_uino(get_inode_oid(obj));
43334+ stat->mode = obj->i_mode;
43335+ /* don't confuse userland with huge nlink. This is not entirely
43336+ * correct, because nlink_t is not necessarily 16 bit signed. */
43337+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
43338+ stat->uid = obj->i_uid;
43339+ stat->gid = obj->i_gid;
43340+ stat->rdev = obj->i_rdev;
43341+ stat->atime = obj->i_atime;
43342+ stat->mtime = obj->i_mtime;
43343+ stat->ctime = obj->i_ctime;
43344+ stat->size = obj->i_size;
43345+ stat->blocks =
43346+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
43347+ /* "preferred" blocksize for efficient file system I/O */
43348+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
43349+
43350+ return 0;
43351+}
43352+
43353+/* Estimate the maximum amount of nodes which might be allocated or changed on
43354+ typical new object creation. Typical creation consists of calling create
43355+ method of file plugin, adding directory entry to parent and update parent
43356+ directory's stat data.
43357+*/
43358+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
43359+ struct inode *object
43360+ /* object */ )
43361+{
43362+ assert("vpf-309", parent != NULL);
43363+ assert("vpf-307", object != NULL);
43364+
43365+ return
43366+ /* object creation estimation */
43367+ inode_file_plugin(object)->estimate.create(object) +
43368+ /* stat data of parent directory estimation */
43369+ inode_file_plugin(parent)->estimate.update(parent) +
43370+ /* adding entry estimation */
43371+ inode_dir_plugin(parent)->estimate.add_entry(parent) +
43372+ /* to undo in the case of failure */
43373+ inode_dir_plugin(parent)->estimate.rem_entry(parent);
43374+}
43375+
43376+/* Create child in directory.
43377+
43378+ . get object's plugin
43379+ . get fresh inode
43380+ . initialize inode
43381+ . add object's stat-data
43382+ . initialize object's directory
43383+ . add entry to the parent
43384+ . instantiate dentry
43385+
43386+*/
43387+static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
43388+ object */
43389+ struct inode **retobj)
43390+{
43391+ int result;
43392+
43393+ struct dentry *dentry; /* parent object */
43394+ struct inode *parent; /* new name */
43395+
43396+ dir_plugin *par_dir; /* directory plugin on the parent */
43397+ dir_plugin *obj_dir; /* directory plugin on the new object */
43398+ file_plugin *obj_plug; /* object plugin on the new object */
43399+ struct inode *object; /* new object */
43400+ reiser4_block_nr reserve;
43401+
43402+ reiser4_dir_entry_desc entry; /* new directory entry */
43403+
43404+ assert("nikita-1420", data != NULL);
43405+ parent = data->parent;
43406+ dentry = data->dentry;
43407+
43408+ assert("nikita-1418", parent != NULL);
43409+ assert("nikita-1419", dentry != NULL);
43410+
43411+ /* check, that name is acceptable for parent */
43412+ par_dir = inode_dir_plugin(parent);
43413+ if (par_dir->is_name_acceptable &&
43414+ !par_dir->is_name_acceptable(parent,
43415+ dentry->d_name.name,
43416+ (int)dentry->d_name.len))
43417+ return RETERR(-ENAMETOOLONG);
43418+
43419+ result = 0;
43420+ obj_plug = file_plugin_by_id((int)data->id);
43421+ if (obj_plug == NULL) {
43422+ warning("nikita-430", "Cannot find plugin %i", data->id);
43423+ return RETERR(-ENOENT);
43424+ }
43425+ object = new_inode(parent->i_sb);
43426+ if (object == NULL)
43427+ return RETERR(-ENOMEM);
43428+ /* we'll update i_nlink below */
43429+ object->i_nlink = 0;
43430+ /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
43431+ * to simplify error handling: if some error occurs before i_ino is
43432+ * initialized with oid, i_ino should already be set to some
43433+ * distinguished value. */
43434+ object->i_ino = 0;
43435+
43436+ /* So that on error iput will be called. */
43437+ *retobj = object;
43438+
43439+ if (DQUOT_ALLOC_INODE(object)) {
43440+ DQUOT_DROP(object);
43441+ object->i_flags |= S_NOQUOTA;
43442+ return RETERR(-EDQUOT);
43443+ }
43444+
43445+ memset(&entry, 0, sizeof entry);
43446+ entry.obj = object;
43447+
43448+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
43449+ file_plugin_to_plugin(obj_plug));
43450+ result = obj_plug->set_plug_in_inode(object, parent, data);
43451+ if (result) {
43452+ warning("nikita-431", "Cannot install plugin %i on %llx",
43453+ data->id, (unsigned long long)get_inode_oid(object));
43454+ DQUOT_FREE_INODE(object);
43455+ object->i_flags |= S_NOQUOTA;
43456+ return result;
43457+ }
43458+
43459+ /* reget plugin after installation */
43460+ obj_plug = inode_file_plugin(object);
43461+
43462+ if (obj_plug->create_object == NULL) {
43463+ DQUOT_FREE_INODE(object);
43464+ object->i_flags |= S_NOQUOTA;
43465+ return RETERR(-EPERM);
43466+ }
43467+
43468+ /* if any of hash, tail, sd or permission plugins for newly created
43469+ object are not set yet set them here inheriting them from parent
43470+ directory
43471+ */
43472+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
43473+ result = obj_plug->adjust_to_parent(object,
43474+ parent,
43475+ object->i_sb->s_root->d_inode);
43476+ if (result == 0)
43477+ result = finish_pset(object);
43478+ if (result != 0) {
43479+ warning("nikita-432", "Cannot inherit from %llx to %llx",
43480+ (unsigned long long)get_inode_oid(parent),
43481+ (unsigned long long)get_inode_oid(object));
43482+ DQUOT_FREE_INODE(object);
43483+ object->i_flags |= S_NOQUOTA;
43484+ return result;
43485+ }
43486+
43487+ /* setup inode and file-operations for this inode */
43488+ setup_inode_ops(object, data);
43489+
43490+ /* call file plugin's method to initialize plugin specific part of
43491+ * inode */
43492+ if (obj_plug->init_inode_data)
43493+ obj_plug->init_inode_data(object, data, 1 /*create */ );
43494+
43495+ /* obtain directory plugin (if any) for new object. */
43496+ obj_dir = inode_dir_plugin(object);
43497+ if (obj_dir != NULL && obj_dir->init == NULL) {
43498+ DQUOT_FREE_INODE(object);
43499+ object->i_flags |= S_NOQUOTA;
43500+ return RETERR(-EPERM);
43501+ }
43502+
43503+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
43504+
43505+ reserve = estimate_create_vfs_object(parent, object);
43506+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
43507+ DQUOT_FREE_INODE(object);
43508+ object->i_flags |= S_NOQUOTA;
43509+ return RETERR(-ENOSPC);
43510+ }
43511+
43512+ /* mark inode `immutable'. We disable changes to the file being
43513+ created until valid directory entry for it is inserted. Otherwise,
43514+ if file were expanded and insertion of directory entry fails, we
43515+ have to remove file, but we only allotted enough space in
43516+ transaction to remove _empty_ file. 3.x code used to remove stat
43517+ data in different transaction thus possibly leaking disk space on
43518+ crash. This all only matters if it's possible to access file
43519+ without name, for example, by inode number
43520+ */
43521+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
43522+
43523+ /* create empty object, this includes allocation of new objectid. For
43524+ directories this implies creation of dot and dotdot */
43525+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
43526+
43527+ /* mark inode as `loaded'. From this point onward
43528+ reiser4_delete_inode() will try to remove its stat-data. */
43529+ reiser4_inode_set_flag(object, REISER4_LOADED);
43530+
43531+ result = obj_plug->create_object(object, parent, data);
43532+ if (result != 0) {
43533+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43534+ if (result != -ENAMETOOLONG && result != -ENOMEM)
43535+ warning("nikita-2219",
43536+ "Failed to create sd for %llu",
43537+ (unsigned long long)get_inode_oid(object));
43538+ DQUOT_FREE_INODE(object);
43539+ object->i_flags |= S_NOQUOTA;
43540+ return result;
43541+ }
43542+
43543+ if (obj_dir != NULL)
43544+ result = obj_dir->init(object, parent, data);
43545+ if (result == 0) {
43546+ assert("nikita-434", !reiser4_inode_get_flag(object,
43547+ REISER4_NO_SD));
43548+ /* insert inode into VFS hash table */
43549+ insert_inode_hash(object);
43550+ /* create entry */
43551+ result = par_dir->add_entry(parent, dentry, data, &entry);
43552+ if (result == 0) {
43553+ result = reiser4_add_nlink(object, parent, 0);
43554+ /* If O_CREAT is set and the file did not previously
43555+ exist, upon successful completion, open() shall
43556+ mark for update the st_atime, st_ctime, and
43557+ st_mtime fields of the file and the st_ctime and
43558+ st_mtime fields of the parent directory. --SUS
43559+ */
43560+ /* @object times are already updated by
43561+ reiser4_add_nlink() */
43562+ if (result == 0)
43563+ reiser4_update_dir(parent);
43564+ if (result != 0)
43565+ /* cleanup failure to add nlink */
43566+ par_dir->rem_entry(parent, dentry, &entry);
43567+ }
43568+ if (result != 0)
43569+ /* cleanup failure to add entry */
43570+ obj_plug->detach(object, parent);
43571+ } else if (result != -ENOMEM)
43572+ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
43573+ (unsigned long long)get_inode_oid(object), result);
43574+
43575+ /*
43576+ * update stat-data, committing all pending modifications to the inode
43577+ * fields.
43578+ */
43579+ reiser4_update_sd(object);
43580+ if (result != 0) {
43581+ DQUOT_FREE_INODE(object);
43582+ object->i_flags |= S_NOQUOTA;
43583+ /* if everything was ok (result == 0), parent stat-data is
43584+ * already updated above (update_parent_dir()) */
43585+ reiser4_update_sd(parent);
43586+ /* failure to create entry, remove object */
43587+ obj_plug->delete_object(object);
43588+ }
43589+
43590+ /* file has name now, clear immutable flag */
43591+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43592+
43593+ /* on error, iput() will call ->delete_inode(). We should keep track
43594+ of the existence of stat-data for this inode and avoid attempt to
43595+ remove it in reiser4_delete_inode(). This is accomplished through
43596+ REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
43597+ */
43598+ return result;
43599+}
43600+
43601+/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
43602+ reiser4_mknod and reiser4_symlink
43603+*/
43604+static int
43605+create_vfs_object(struct inode *parent,
43606+ struct dentry *dentry, reiser4_object_create_data * data)
43607+{
43608+ reiser4_context *ctx;
43609+ int result;
43610+ struct inode *child;
43611+
43612+ ctx = reiser4_init_context(parent->i_sb);
43613+ if (IS_ERR(ctx))
43614+ return PTR_ERR(ctx);
43615+ context_set_commit_async(ctx);
43616+
43617+ data->parent = parent;
43618+ data->dentry = dentry;
43619+ child = NULL;
43620+ result = do_create_vfs_child(data, &child);
43621+ if (unlikely(result != 0)) {
43622+ if (child != NULL) {
43623+ reiser4_make_bad_inode(child);
43624+ iput(child);
43625+ }
43626+ } else
43627+ d_instantiate(dentry, child);
43628+
43629+ reiser4_exit_context(ctx);
43630+ return result;
43631+}
43632+
43633+/* helper for link_common. Estimate disk space necessary to add a link
43634+ from @parent to @object
43635+*/
43636+static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
43637+ struct inode *object
43638+ /* object to which new link is being created */
43639+ )
43640+{
43641+ reiser4_block_nr res = 0;
43642+ file_plugin *fplug;
43643+ dir_plugin *dplug;
43644+
43645+ assert("vpf-317", object != NULL);
43646+ assert("vpf-318", parent != NULL);
43647+
43648+ fplug = inode_file_plugin(object);
43649+ dplug = inode_dir_plugin(parent);
43650+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
43651+ /* reiser4_add_nlink(object) */
43652+ res += fplug->estimate.update(object);
43653+ /* add_entry(parent) */
43654+ res += dplug->estimate.add_entry(parent);
43655+ /* reiser4_del_nlink(object) */
43656+ res += fplug->estimate.update(object);
43657+ /* update_dir(parent) */
43658+ res += inode_file_plugin(parent)->estimate.update(parent);
43659+ /* safe-link */
43660+ res += estimate_one_item_removal(reiser4_tree_by_inode(object));
43661+
43662+ return res;
43663+}
43664+
43665+/* Estimate disk space necessary to remove a link between @parent and
43666+ @object.
43667+*/
43668+static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
43669+ struct inode *object
43670+ /* object a link to which is being removed */
43671+ )
43672+{
43673+ reiser4_block_nr res = 0;
43674+ file_plugin *fplug;
43675+ dir_plugin *dplug;
43676+
43677+ assert("vpf-317", object != NULL);
43678+ assert("vpf-318", parent != NULL);
43679+
43680+ fplug = inode_file_plugin(object);
43681+ dplug = inode_dir_plugin(parent);
43682+
43683+ /* rem_entry(parent) */
43684+ res += dplug->estimate.rem_entry(parent);
43685+ /* reiser4_del_nlink(object) */
43686+ res += fplug->estimate.update(object);
43687+ /* update_dir(parent) */
43688+ res += inode_file_plugin(parent)->estimate.update(parent);
43689+ /* fplug->unlink */
43690+ res += fplug->estimate.unlink(object, parent);
43691+ /* safe-link */
43692+ res += estimate_one_insert_item(reiser4_tree_by_inode(object));
43693+
43694+ return res;
43695+}
43696+
43697+/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
43698+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
43699+{
43700+ file_plugin *fplug;
43701+ struct inode *child;
43702+ int result;
43703+
43704+ result = 0;
43705+ child = victim->d_inode;
43706+ fplug = inode_file_plugin(child);
43707+
43708+ /* check for race with create_object() */
43709+ if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
43710+ return RETERR(-E_REPEAT);
43711+ /* object being deleted should have stat data */
43712+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
43713+
43714+ /* ask object plugin */
43715+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
43716+ return RETERR(-ENOTEMPTY);
43717+
43718+ result = (int)estimate_unlink(parent, child);
43719+ if (result < 0)
43720+ return result;
43721+
43722+ return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
43723+}
43724+
43725+/* helper for reiser4_setattr_common */
43726+static int setattr_reserve(reiser4_tree * tree)
43727+{
43728+ assert("vs-1096", is_grab_enabled(get_current_context()));
43729+ return reiser4_grab_space(estimate_one_insert_into_item(tree),
43730+ BA_CAN_COMMIT);
43731+}
43732+
43733+/* helper function. Standards require that for many file-system operations
43734+ on success ctime and mtime of parent directory is to be updated. */
43735+int reiser4_update_dir(struct inode *dir)
43736+{
43737+ assert("nikita-2525", dir != NULL);
43738+
43739+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
43740+ return reiser4_update_sd(dir);
43741+}
43742diff --git a/fs/reiser4/plugin/inode_ops_rename.c b/fs/reiser4/plugin/inode_ops_rename.c
43743new file mode 100644
43744index 0000000..a64e777
43745--- /dev/null
43746+++ b/fs/reiser4/plugin/inode_ops_rename.c
43747@@ -0,0 +1,914 @@
43748+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
43749+ * reiser4/README */
43750+
43751+#include "../inode.h"
43752+#include "../safe_link.h"
43753+
43754+static const char *possible_leak = "Possible disk space leak.";
43755+
43756+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43757+
43758+ Helper function called from hashed_rename() */
43759+static int replace_name(struct inode *to_inode, /* inode where @from_coord is
43760+ * to be re-targeted at */
43761+ struct inode *from_dir, /* directory where @from_coord
43762+ * lives */
43763+ struct inode *from_inode, /* inode @from_coord
43764+ * originally point to */
43765+ coord_t * from_coord, /* where directory entry is in
43766+ * the tree */
43767+ lock_handle * from_lh /* lock handle on @from_coord */ )
43768+{
43769+ item_plugin *from_item;
43770+ int result;
43771+ znode *node;
43772+
43773+ coord_clear_iplug(from_coord);
43774+ node = from_coord->node;
43775+ result = zload(node);
43776+ if (result != 0)
43777+ return result;
43778+ from_item = item_plugin_by_coord(from_coord);
43779+ if (plugin_of_group(item_plugin_by_coord(from_coord),
43780+ DIR_ENTRY_ITEM_TYPE))
43781+ {
43782+ reiser4_key to_key;
43783+
43784+ build_sd_key(to_inode, &to_key);
43785+
43786+ /* everything is found and prepared to change directory entry
43787+ at @from_coord to point to @to_inode.
43788+
43789+ @to_inode is just about to get new name, so bump its link
43790+ counter.
43791+
43792+ */
43793+ result = reiser4_add_nlink(to_inode, from_dir, 0);
43794+ if (result != 0) {
43795+ /* Don't issue warning: this may be plain -EMLINK */
43796+ zrelse(node);
43797+ return result;
43798+ }
43799+
43800+ result =
43801+ from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43802+ if (result != 0) {
43803+ reiser4_del_nlink(to_inode, from_dir, 0);
43804+ zrelse(node);
43805+ return result;
43806+ }
43807+
43808+ /* @from_inode just lost its name, he-he.
43809+
43810+ If @from_inode was directory, it contained dotdot pointing
43811+ to @from_dir. @from_dir i_nlink will be decreased when
43812+ iput() will be called on @from_inode.
43813+
43814+ If file-system is not ADG (hard-links are
43815+ supported on directories), iput(from_inode) will not remove
43816+ @from_inode, and thus above is incorrect, but hard-links on
43817+ directories are problematic in many other respects.
43818+ */
43819+ result = reiser4_del_nlink(from_inode, from_dir, 0);
43820+ if (result != 0) {
43821+ warning("nikita-2330",
43822+ "Cannot remove link from source: %i. %s",
43823+ result, possible_leak);
43824+ }
43825+ /* Has to return success, because entry is already
43826+ * modified. */
43827+ result = 0;
43828+
43829+ /* NOTE-NIKITA consider calling plugin method instead of
43830+ accessing inode fields directly. */
43831+ from_dir->i_mtime = CURRENT_TIME;
43832+ } else {
43833+ warning("nikita-2326", "Unexpected item type");
43834+ result = RETERR(-EIO);
43835+ }
43836+ zrelse(node);
43837+ return result;
43838+}
43839+
43840+/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43841+
43842+ Helper function used by hashed_rename(). */
43843+static int add_name(struct inode *inode, /* inode where @coord is to be
43844+ * re-targeted at */
43845+ struct inode *dir, /* directory where @coord lives */
43846+ struct dentry *name, /* new name */
43847+ coord_t * coord, /* where directory entry is in the tree */
43848+ lock_handle * lh, /* lock handle on @coord */
43849+ int is_dir /* true, if @inode is directory */ )
43850+{
43851+ int result;
43852+ reiser4_dir_entry_desc entry;
43853+
43854+ assert("nikita-2333", lh->node == coord->node);
43855+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43856+
43857+ memset(&entry, 0, sizeof entry);
43858+ entry.obj = inode;
43859+ /* build key of directory entry description */
43860+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43861+
43862+ /* ext2 does this in different order: first inserts new entry,
43863+	   then increases directory nlink. We don't want to do this,
43864+	   because reiser4_add_nlink() calls ->add_link() plugin
43865+	   method that can fail for whatever reason, leaving us with
43866+ cleanup problems.
43867+ */
43868+ /* @inode is getting new name */
43869+ reiser4_add_nlink(inode, dir, 0);
43870+ /* create @new_name in @new_dir pointing to
43871+ @old_inode */
43872+ result = WITH_COORD(coord,
43873+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43874+ coord,
43875+ lh,
43876+ name,
43877+ &entry));
43878+ if (result != 0) {
43879+ int result2;
43880+ result2 = reiser4_del_nlink(inode, dir, 0);
43881+ if (result2 != 0) {
43882+ warning("nikita-2327",
43883+ "Cannot drop link on %lli %i. %s",
43884+ (unsigned long long)get_inode_oid(inode),
43885+ result2, possible_leak);
43886+ }
43887+ } else
43888+ INODE_INC_FIELD(dir, i_size);
43889+ return result;
43890+}
43891+
43892+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43893+ struct dentry *old_name, /* old name */
43894+ struct inode *new_dir, /* directory where @new is located */
43895+ struct dentry *new_name /* new name */ )
43896+{
43897+ reiser4_block_nr res1, res2;
43898+ dir_plugin *p_parent_old, *p_parent_new;
43899+ file_plugin *p_child_old, *p_child_new;
43900+
43901+ assert("vpf-311", old_dir != NULL);
43902+ assert("vpf-312", new_dir != NULL);
43903+ assert("vpf-313", old_name != NULL);
43904+ assert("vpf-314", new_name != NULL);
43905+
43906+ p_parent_old = inode_dir_plugin(old_dir);
43907+ p_parent_new = inode_dir_plugin(new_dir);
43908+ p_child_old = inode_file_plugin(old_name->d_inode);
43909+ if (new_name->d_inode)
43910+ p_child_new = inode_file_plugin(new_name->d_inode);
43911+ else
43912+ p_child_new = NULL;
43913+
43914+ /* find_entry - can insert one leaf. */
43915+ res1 = res2 = 1;
43916+
43917+ /* replace_name */
43918+ {
43919+ /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43920+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43921+ /* update key */
43922+ res1 += 1;
43923+ /* reiser4_del_nlink(p_child_new) */
43924+ if (p_child_new)
43925+ res1 += p_child_new->estimate.update(new_name->d_inode);
43926+ }
43927+
43928+ /* else add_name */
43929+ {
43930+ /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43931+ res2 +=
43932+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43933+ /* reiser4_add_nlink(p_parent_old) */
43934+ res2 += p_child_old->estimate.update(old_name->d_inode);
43935+ /* add_entry(p_parent_new) */
43936+ res2 += p_parent_new->estimate.add_entry(new_dir);
43937+ /* reiser4_del_nlink(p_parent_old) */
43938+ res2 += p_child_old->estimate.update(old_name->d_inode);
43939+ }
43940+
43941+ res1 = res1 < res2 ? res2 : res1;
43942+
43943+ /* reiser4_write_sd(p_parent_new) */
43944+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43945+
43946+ /* reiser4_write_sd(p_child_new) */
43947+ if (p_child_new)
43948+ res1 += p_child_new->estimate.update(new_name->d_inode);
43949+
43950+ /* hashed_rem_entry(p_parent_old) */
43951+ res1 += p_parent_old->estimate.rem_entry(old_dir);
43952+
43953+ /* reiser4_del_nlink(p_child_old) */
43954+ res1 += p_child_old->estimate.update(old_name->d_inode);
43955+
43956+ /* replace_name */
43957+ {
43958+ /* reiser4_add_nlink(p_parent_dir_new) */
43959+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43960+ /* update_key */
43961+ res1 += 1;
43962+ /* reiser4_del_nlink(p_parent_new) */
43963+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43964+ /* reiser4_del_nlink(p_parent_old) */
43965+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43966+ }
43967+
43968+ /* reiser4_write_sd(p_parent_old) */
43969+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43970+
43971+ /* reiser4_write_sd(p_child_old) */
43972+ res1 += p_child_old->estimate.update(old_name->d_inode);
43973+
43974+ return res1;
43975+}
43976+
43977+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43978+ struct dentry *old_name, /* old name */
43979+ struct inode *new_dir, /* directory where @new is located */
43980+ struct dentry *new_name
43981+ /* new name */ )
43982+{
43983+ reiser4_block_nr reserve;
43984+
43985+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43986+
43987+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43988+ return RETERR(-ENOSPC);
43989+
43990+ return 0;
43991+}
43992+
43993+/* check whether @old_inode and @new_inode can be moved within file system
43994+ * tree. This singles out attempts to rename pseudo-files, for example. */
43995+static int can_rename(struct inode *old_dir, struct inode *old_inode,
43996+ struct inode *new_dir, struct inode *new_inode)
43997+{
43998+ file_plugin *fplug;
43999+ dir_plugin *dplug;
44000+
44001+ assert("nikita-3370", old_inode != NULL);
44002+
44003+ dplug = inode_dir_plugin(new_dir);
44004+ fplug = inode_file_plugin(old_inode);
44005+
44006+ if (dplug == NULL)
44007+ return RETERR(-ENOTDIR);
44008+ else if (new_dir->i_op->create == NULL)
44009+ return RETERR(-EPERM);
44010+ else if (!fplug->can_add_link(old_inode))
44011+ return RETERR(-EMLINK);
44012+ else if (new_inode != NULL) {
44013+ fplug = inode_file_plugin(new_inode);
44014+ if (fplug->can_rem_link != NULL &&
44015+ !fplug->can_rem_link(new_inode))
44016+ return RETERR(-EBUSY);
44017+ }
44018+ return 0;
44019+}
44020+
44021+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
44022+ znode_lock_mode, reiser4_dir_entry_desc *);
44023+int reiser4_update_dir(struct inode *);
44024+
44025+/* this is common implementation of vfs's rename method of struct
44026+ inode_operations
44027+ See comments in the body.
44028+
44029+ It is arguable that this function can be made generic so, that it
44030+ will be applicable to any kind of directory plugin that deals with
44031+ directories composed out of directory entries. The only obstacle
44032+ here is that we don't have any data-type to represent directory
44033+ entry. This should be re-considered when more than one different
44034+ directory plugin will be implemented.
44035+*/
44036+int reiser4_rename_common(struct inode *old_dir /* directory where @old
44037+ * is located */ ,
44038+ struct dentry *old_name /* old name */ ,
44039+ struct inode *new_dir /* directory where @new
44040+ * is located */ ,
44041+ struct dentry *new_name /* new name */ )
44042+{
44043+ /* From `The Open Group Base Specifications Issue 6'
44044+
44045+ If either the old or new argument names a symbolic link, rename()
44046+ shall operate on the symbolic link itself, and shall not resolve
44047+ the last component of the argument. If the old argument and the new
44048+ argument resolve to the same existing file, rename() shall return
44049+ successfully and perform no other action.
44050+
44051+ [this is done by VFS: vfs_rename()]
44052+
44053+ If the old argument points to the pathname of a file that is not a
44054+ directory, the new argument shall not point to the pathname of a
44055+ directory.
44056+
44057+ [checked by VFS: vfs_rename->may_delete()]
44058+
44059+ If the link named by the new argument exists, it shall
44060+ be removed and old renamed to new. In this case, a link named new
44061+ shall remain visible to other processes throughout the renaming
44062+ operation and refer either to the file referred to by new or old
44063+ before the operation began.
44064+
44065+ [we should assure this]
44066+
44067+ Write access permission is required for
44068+ both the directory containing old and the directory containing new.
44069+
44070+ [checked by VFS: vfs_rename->may_delete(), may_create()]
44071+
44072+ If the old argument points to the pathname of a directory, the new
44073+ argument shall not point to the pathname of a file that is not a
44074+ directory.
44075+
44076+ [checked by VFS: vfs_rename->may_delete()]
44077+
44078+ If the directory named by the new argument exists, it
44079+ shall be removed and old renamed to new. In this case, a link named
44080+ new shall exist throughout the renaming operation and shall refer
44081+ either to the directory referred to by new or old before the
44082+ operation began.
44083+
44084+ [we should assure this]
44085+
44086+ If new names an existing directory, it shall be
44087+ required to be an empty directory.
44088+
44089+ [we should check this]
44090+
44091+ If the old argument points to a pathname of a symbolic link, the
44092+ symbolic link shall be renamed. If the new argument points to a
44093+ pathname of a symbolic link, the symbolic link shall be removed.
44094+
44095+ The new pathname shall not contain a path prefix that names
44096+ old. Write access permission is required for the directory
44097+ containing old and the directory containing new. If the old
44098+ argument points to the pathname of a directory, write access
44099+ permission may be required for the directory named by old, and, if
44100+ it exists, the directory named by new.
44101+
44102+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
44103+
44104+ If the link named by the new argument exists and the file's link
44105+ count becomes 0 when it is removed and no process has the file
44106+ open, the space occupied by the file shall be freed and the file
44107+ shall no longer be accessible. If one or more processes have the
44108+ file open when the last link is removed, the link shall be removed
44109+ before rename() returns, but the removal of the file contents shall
44110+ be postponed until all references to the file are closed.
44111+
44112+ [iput() handles this, but we can do this manually, a la
44113+ reiser4_unlink()]
44114+
44115+ Upon successful completion, rename() shall mark for update the
44116+ st_ctime and st_mtime fields of the parent directory of each file.
44117+
44118+ [N/A]
44119+
44120+ */
44121+ reiser4_context *ctx;
44122+ int result;
44123+ int is_dir; /* is @old_name directory */
44124+
44125+ struct inode *old_inode;
44126+ struct inode *new_inode;
44127+ coord_t *new_coord;
44128+
44129+ reiser4_dentry_fsdata *new_fsdata;
44130+ dir_plugin *dplug;
44131+ file_plugin *fplug;
44132+
44133+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
44134+ lock_handle *new_lh, *dotdot_lh;
44135+ struct dentry *dotdot_name;
44136+ reiser4_dentry_fsdata *dataonstack;
44137+
44138+ ctx = reiser4_init_context(old_dir->i_sb);
44139+ if (IS_ERR(ctx))
44140+ return PTR_ERR(ctx);
44141+
44142+ old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
44143+ sizeof(*dotdot_name) + sizeof(*dataonstack),
44144+ reiser4_ctx_gfp_mask_get());
44145+ if (old_entry == NULL) {
44146+ context_set_commit_async(ctx);
44147+ reiser4_exit_context(ctx);
44148+ return RETERR(-ENOMEM);
44149+ }
44150+ memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
44151+ sizeof(*dotdot_name) + sizeof(*dataonstack));
44152+
44153+ new_entry = old_entry + 1;
44154+ dotdot_entry = old_entry + 2;
44155+ new_lh = (lock_handle *)(old_entry + 3);
44156+ dotdot_lh = new_lh + 1;
44157+ dotdot_name = (struct dentry *)(new_lh + 2);
44158+ dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1);
44159+
44160+ assert("nikita-2318", old_dir != NULL);
44161+ assert("nikita-2319", new_dir != NULL);
44162+ assert("nikita-2320", old_name != NULL);
44163+ assert("nikita-2321", new_name != NULL);
44164+
44165+ old_inode = old_name->d_inode;
44166+ new_inode = new_name->d_inode;
44167+
44168+ dplug = inode_dir_plugin(old_dir);
44169+ fplug = NULL;
44170+
44171+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
44172+ if (IS_ERR(new_fsdata)) {
44173+ kfree(old_entry);
44174+ context_set_commit_async(ctx);
44175+ reiser4_exit_context(ctx);
44176+ return PTR_ERR(new_fsdata);
44177+ }
44178+
44179+ new_coord = &new_fsdata->dec.entry_coord;
44180+ coord_clear_iplug(new_coord);
44181+
44182+ is_dir = S_ISDIR(old_inode->i_mode);
44183+
44184+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
44185+
44186+ /* if target is existing directory and it's not empty---return error.
44187+
44188+ This check is done specifically, because is_dir_empty() requires
44189+	   tree traversal and has to be done before locks are taken.
44190+ */
44191+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
44192+ kfree(old_entry);
44193+ context_set_commit_async(ctx);
44194+ reiser4_exit_context(ctx);
44195+ return RETERR(-ENOTEMPTY);
44196+ }
44197+
44198+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
44199+ if (result != 0) {
44200+ kfree(old_entry);
44201+ context_set_commit_async(ctx);
44202+ reiser4_exit_context(ctx);
44203+ return result;
44204+ }
44205+
44206+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
44207+ new_dir, new_name);
44208+ if (result != 0) {
44209+ kfree(old_entry);
44210+ context_set_commit_async(ctx);
44211+ reiser4_exit_context(ctx);
44212+ return result;
44213+ }
44214+
44215+ init_lh(new_lh);
44216+
44217+ /* find entry for @new_name */
44218+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
44219+ new_entry);
44220+
44221+ if (IS_CBKERR(result)) {
44222+ done_lh(new_lh);
44223+ kfree(old_entry);
44224+ context_set_commit_async(ctx);
44225+ reiser4_exit_context(ctx);
44226+ return result;
44227+ }
44228+
44229+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
44230+
44231+ /* add or replace name for @old_inode as @new_name */
44232+ if (new_inode != NULL) {
44233+ /* target (@new_name) exists. */
44234+ /* Not clear what to do with objects that are
44235+ both directories and files at the same time. */
44236+ if (result == CBK_COORD_FOUND) {
44237+ result = replace_name(old_inode,
44238+ new_dir,
44239+ new_inode, new_coord, new_lh);
44240+ if (result == 0)
44241+ fplug = inode_file_plugin(new_inode);
44242+ } else if (result == CBK_COORD_NOTFOUND) {
44243+ /* VFS told us that @new_name is bound to existing
44244+ inode, but we failed to find directory entry. */
44245+ warning("nikita-2324", "Target not found");
44246+ result = RETERR(-ENOENT);
44247+ }
44248+ } else {
44249+		/* target (@new_name) doesn't exist. */
44250+ if (result == CBK_COORD_NOTFOUND)
44251+ result = add_name(old_inode,
44252+ new_dir,
44253+ new_name, new_coord, new_lh, is_dir);
44254+ else if (result == CBK_COORD_FOUND) {
44255+ /* VFS told us that @new_name is "negative" dentry,
44256+ but we found directory entry. */
44257+ warning("nikita-2331", "Target found unexpectedly");
44258+ result = RETERR(-EIO);
44259+ }
44260+ }
44261+
44262+ assert("nikita-3462", ergo(result == 0,
44263+ old_inode->i_nlink >= 2 + !!is_dir));
44264+
44265+ /* We are done with all modifications to the @new_dir, release lock on
44266+ node. */
44267+ done_lh(new_lh);
44268+
44269+ if (fplug != NULL) {
44270+ /* detach @new_inode from name-space */
44271+ result = fplug->detach(new_inode, new_dir);
44272+ if (result != 0)
44273+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
44274+ (unsigned long long)get_inode_oid(new_inode),
44275+ result, possible_leak);
44276+ }
44277+
44278+ if (new_inode != NULL)
44279+ reiser4_update_sd(new_inode);
44280+
44281+ if (result == 0) {
44282+ old_entry->obj = old_inode;
44283+
44284+ dplug->build_entry_key(old_dir,
44285+ &old_name->d_name, &old_entry->key);
44286+
44287+ /* At this stage new name was introduced for
44288+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
44289+ counters were updated.
44290+
44291+ We want to remove @old_name now. If @old_inode wasn't
44292+ directory this is simple.
44293+ */
44294+ result = dplug->rem_entry(old_dir, old_name, old_entry);
44295+ if (result != 0 && result != -ENOMEM) {
44296+ warning("nikita-2335",
44297+ "Cannot remove old name: %i", result);
44298+ } else {
44299+ result = reiser4_del_nlink(old_inode, old_dir, 0);
44300+ if (result != 0 && result != -ENOMEM) {
44301+ warning("nikita-2337",
44302+ "Cannot drop link on old: %i", result);
44303+ }
44304+ }
44305+
44306+ if (result == 0 && is_dir) {
44307+ /* @old_inode is directory. We also have to update
44308+ dotdot entry. */
44309+ coord_t *dotdot_coord;
44310+
44311+			memset(dataonstack, 0, sizeof *dataonstack);
44312+			memset(dotdot_entry, 0, sizeof *dotdot_entry);
44313+			dotdot_entry->obj = old_dir;
44314+			memset(dotdot_name, 0, sizeof *dotdot_name);
44315+ dotdot_name->d_name.name = "..";
44316+ dotdot_name->d_name.len = 2;
44317+ /*
44318+ * allocate ->d_fsdata on the stack to avoid using
44319+ * reiser4_get_dentry_fsdata(). Locking is not needed,
44320+ * because dentry is private to the current thread.
44321+ */
44322+ dotdot_name->d_fsdata = dataonstack;
44323+ init_lh(dotdot_lh);
44324+
44325+ dotdot_coord = &dataonstack->dec.entry_coord;
44326+ coord_clear_iplug(dotdot_coord);
44327+
44328+ result = reiser4_find_entry(old_inode, dotdot_name,
44329+ dotdot_lh, ZNODE_WRITE_LOCK,
44330+ dotdot_entry);
44331+ if (result == 0) {
44332+ /* replace_name() decreases i_nlink on
44333+ * @old_dir */
44334+ result = replace_name(new_dir,
44335+ old_inode,
44336+ old_dir,
44337+ dotdot_coord, dotdot_lh);
44338+ } else
44339+ result = RETERR(-EIO);
44340+ done_lh(dotdot_lh);
44341+ }
44342+ }
44343+ reiser4_update_dir(new_dir);
44344+ reiser4_update_dir(old_dir);
44345+ reiser4_update_sd(old_inode);
44346+ if (result == 0) {
44347+ file_plugin *fplug;
44348+
44349+ if (new_inode != NULL) {
44350+ /* add safe-link for target file (in case we removed
44351+			 * last reference to the poor fellow) */
44352+ fplug = inode_file_plugin(new_inode);
44353+ if (new_inode->i_nlink == 0)
44354+ result = safe_link_add(new_inode, SAFE_UNLINK);
44355+ }
44356+ }
44357+ kfree(old_entry);
44358+ context_set_commit_async(ctx);
44359+ reiser4_exit_context(ctx);
44360+ return result;
44361+}
44362+
44363+#if 0
44364+int reiser4_rename_common(struct inode *old_dir /* directory where @old
44365+ * is located */ ,
44366+ struct dentry *old_name /* old name */ ,
44367+ struct inode *new_dir /* directory where @new
44368+ * is located */ ,
44369+ struct dentry *new_name /* new name */ )
44370+{
44371+ /* From `The Open Group Base Specifications Issue 6'
44372+
44373+ If either the old or new argument names a symbolic link, rename()
44374+ shall operate on the symbolic link itself, and shall not resolve
44375+ the last component of the argument. If the old argument and the new
44376+ argument resolve to the same existing file, rename() shall return
44377+ successfully and perform no other action.
44378+
44379+ [this is done by VFS: vfs_rename()]
44380+
44381+ If the old argument points to the pathname of a file that is not a
44382+ directory, the new argument shall not point to the pathname of a
44383+ directory.
44384+
44385+ [checked by VFS: vfs_rename->may_delete()]
44386+
44387+ If the link named by the new argument exists, it shall
44388+ be removed and old renamed to new. In this case, a link named new
44389+ shall remain visible to other processes throughout the renaming
44390+ operation and refer either to the file referred to by new or old
44391+ before the operation began.
44392+
44393+ [we should assure this]
44394+
44395+ Write access permission is required for
44396+ both the directory containing old and the directory containing new.
44397+
44398+ [checked by VFS: vfs_rename->may_delete(), may_create()]
44399+
44400+ If the old argument points to the pathname of a directory, the new
44401+ argument shall not point to the pathname of a file that is not a
44402+ directory.
44403+
44404+ [checked by VFS: vfs_rename->may_delete()]
44405+
44406+ If the directory named by the new argument exists, it
44407+ shall be removed and old renamed to new. In this case, a link named
44408+ new shall exist throughout the renaming operation and shall refer
44409+ either to the directory referred to by new or old before the
44410+ operation began.
44411+
44412+ [we should assure this]
44413+
44414+ If new names an existing directory, it shall be
44415+ required to be an empty directory.
44416+
44417+ [we should check this]
44418+
44419+ If the old argument points to a pathname of a symbolic link, the
44420+ symbolic link shall be renamed. If the new argument points to a
44421+ pathname of a symbolic link, the symbolic link shall be removed.
44422+
44423+ The new pathname shall not contain a path prefix that names
44424+ old. Write access permission is required for the directory
44425+ containing old and the directory containing new. If the old
44426+ argument points to the pathname of a directory, write access
44427+ permission may be required for the directory named by old, and, if
44428+ it exists, the directory named by new.
44429+
44430+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
44431+
44432+ If the link named by the new argument exists and the file's link
44433+ count becomes 0 when it is removed and no process has the file
44434+ open, the space occupied by the file shall be freed and the file
44435+ shall no longer be accessible. If one or more processes have the
44436+ file open when the last link is removed, the link shall be removed
44437+ before rename() returns, but the removal of the file contents shall
44438+ be postponed until all references to the file are closed.
44439+
44440+ [iput() handles this, but we can do this manually, a la
44441+ reiser4_unlink()]
44442+
44443+ Upon successful completion, rename() shall mark for update the
44444+ st_ctime and st_mtime fields of the parent directory of each file.
44445+
44446+ [N/A]
44447+
44448+ */
44449+ reiser4_context *ctx;
44450+ int result;
44451+ int is_dir; /* is @old_name directory */
44452+ struct inode *old_inode;
44453+ struct inode *new_inode;
44454+ reiser4_dir_entry_desc old_entry;
44455+ reiser4_dir_entry_desc new_entry;
44456+ coord_t *new_coord;
44457+ reiser4_dentry_fsdata *new_fsdata;
44458+ lock_handle new_lh;
44459+ dir_plugin *dplug;
44460+ file_plugin *fplug;
44461+
44462+ ctx = reiser4_init_context(old_dir->i_sb);
44463+ if (IS_ERR(ctx))
44464+ return PTR_ERR(ctx);
44465+
44466+ assert("nikita-2318", old_dir != NULL);
44467+ assert("nikita-2319", new_dir != NULL);
44468+ assert("nikita-2320", old_name != NULL);
44469+ assert("nikita-2321", new_name != NULL);
44470+
44471+ old_inode = old_name->d_inode;
44472+ new_inode = new_name->d_inode;
44473+
44474+ dplug = inode_dir_plugin(old_dir);
44475+ fplug = NULL;
44476+
44477+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
44478+ if (IS_ERR(new_fsdata)) {
44479+ result = PTR_ERR(new_fsdata);
44480+ goto exit;
44481+ }
44482+
44483+ new_coord = &new_fsdata->dec.entry_coord;
44484+ coord_clear_iplug(new_coord);
44485+
44486+ is_dir = S_ISDIR(old_inode->i_mode);
44487+
44488+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
44489+
44490+ /* if target is existing directory and it's not empty---return error.
44491+
44492+ This check is done specifically, because is_dir_empty() requires
44493+	   tree traversal and has to be done before locks are taken.
44494+ */
44495+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
44496+ return RETERR(-ENOTEMPTY);
44497+
44498+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
44499+ if (result != 0)
44500+ goto exit;
44501+
44502+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
44503+ new_dir, new_name);
44504+ if (result != 0)
44505+ goto exit;
44506+
44507+ init_lh(&new_lh);
44508+
44509+ /* find entry for @new_name */
44510+ result = reiser4_find_entry(new_dir, new_name, &new_lh,
44511+ ZNODE_WRITE_LOCK, &new_entry);
44512+
44513+ if (IS_CBKERR(result)) {
44514+ done_lh(&new_lh);
44515+ goto exit;
44516+ }
44517+
44518+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
44519+
44520+ /* add or replace name for @old_inode as @new_name */
44521+ if (new_inode != NULL) {
44522+ /* target (@new_name) exists. */
44523+ /* Not clear what to do with objects that are
44524+ both directories and files at the same time. */
44525+ if (result == CBK_COORD_FOUND) {
44526+ result = replace_name(old_inode,
44527+ new_dir,
44528+ new_inode, new_coord, &new_lh);
44529+ if (result == 0)
44530+ fplug = inode_file_plugin(new_inode);
44531+ } else if (result == CBK_COORD_NOTFOUND) {
44532+ /* VFS told us that @new_name is bound to existing
44533+ inode, but we failed to find directory entry. */
44534+ warning("nikita-2324", "Target not found");
44535+ result = RETERR(-ENOENT);
44536+ }
44537+ } else {
44538+		/* target (@new_name) doesn't exist. */
44539+ if (result == CBK_COORD_NOTFOUND)
44540+ result = add_name(old_inode,
44541+ new_dir,
44542+ new_name, new_coord, &new_lh, is_dir);
44543+ else if (result == CBK_COORD_FOUND) {
44544+ /* VFS told us that @new_name is "negative" dentry,
44545+ but we found directory entry. */
44546+ warning("nikita-2331", "Target found unexpectedly");
44547+ result = RETERR(-EIO);
44548+ }
44549+ }
44550+
44551+ assert("nikita-3462", ergo(result == 0,
44552+ old_inode->i_nlink >= 2 + !!is_dir));
44553+
44554+ /* We are done with all modifications to the @new_dir, release lock on
44555+ node. */
44556+ done_lh(&new_lh);
44557+
44558+ if (fplug != NULL) {
44559+ /* detach @new_inode from name-space */
44560+ result = fplug->detach(new_inode, new_dir);
44561+ if (result != 0)
44562+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
44563+ (unsigned long long)get_inode_oid(new_inode),
44564+ result, possible_leak);
44565+ }
44566+
44567+ if (new_inode != NULL)
44568+ reiser4_update_sd(new_inode);
44569+
44570+ if (result == 0) {
44571+ memset(&old_entry, 0, sizeof old_entry);
44572+ old_entry.obj = old_inode;
44573+
44574+ dplug->build_entry_key(old_dir,
44575+ &old_name->d_name, &old_entry.key);
44576+
44577+ /* At this stage new name was introduced for
44578+ @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
44579+ counters were updated.
44580+
44581+ We want to remove @old_name now. If @old_inode wasn't
44582+ directory this is simple.
44583+ */
44584+ result = dplug->rem_entry(old_dir, old_name, &old_entry);
44585+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
44586+ if (result != 0 && result != -ENOMEM) {
44587+ warning("nikita-2335",
44588+ "Cannot remove old name: %i", result);
44589+ } else {
44590+ result = reiser4_del_nlink(old_inode, old_dir, 0);
44591+ if (result != 0 && result != -ENOMEM) {
44592+ warning("nikita-2337",
44593+ "Cannot drop link on old: %i", result);
44594+ }
44595+ }
44596+
44597+ if (result == 0 && is_dir) {
44598+ /* @old_inode is directory. We also have to update
44599+ dotdot entry. */
44600+ coord_t *dotdot_coord;
44601+ lock_handle dotdot_lh;
44602+ struct dentry dotdot_name;
44603+ reiser4_dir_entry_desc dotdot_entry;
44604+ reiser4_dentry_fsdata dataonstack;
44605+ reiser4_dentry_fsdata *fsdata;
44606+
44607+ memset(&dataonstack, 0, sizeof dataonstack);
44608+ memset(&dotdot_entry, 0, sizeof dotdot_entry);
44609+ dotdot_entry.obj = old_dir;
44610+ memset(&dotdot_name, 0, sizeof dotdot_name);
44611+ dotdot_name.d_name.name = "..";
44612+ dotdot_name.d_name.len = 2;
44613+ /*
44614+ * allocate ->d_fsdata on the stack to avoid using
44615+ * reiser4_get_dentry_fsdata(). Locking is not needed,
44616+ * because dentry is private to the current thread.
44617+ */
44618+ dotdot_name.d_fsdata = &dataonstack;
44619+ init_lh(&dotdot_lh);
44620+
44621+ fsdata = &dataonstack;
44622+ dotdot_coord = &fsdata->dec.entry_coord;
44623+ coord_clear_iplug(dotdot_coord);
44624+
44625+ result = reiser4_find_entry(old_inode,
44626+ &dotdot_name,
44627+ &dotdot_lh,
44628+ ZNODE_WRITE_LOCK,
44629+ &dotdot_entry);
44630+ if (result == 0) {
44631+ /* replace_name() decreases i_nlink on
44632+ * @old_dir */
44633+ result = replace_name(new_dir,
44634+ old_inode,
44635+ old_dir,
44636+ dotdot_coord, &dotdot_lh);
44637+ } else
44638+ result = RETERR(-EIO);
44639+ done_lh(&dotdot_lh);
44640+ }
44641+ }
44642+ reiser4_update_dir(new_dir);
44643+ reiser4_update_dir(old_dir);
44644+ reiser4_update_sd(old_inode);
44645+ if (result == 0) {
44646+ file_plugin *fplug;
44647+
44648+ if (new_inode != NULL) {
44649+ /* add safe-link for target file (in case we removed
44650+			 * last reference to the poor fellow) */
44651+ fplug = inode_file_plugin(new_inode);
44652+ if (new_inode->i_nlink == 0)
44653+ result = safe_link_add(new_inode, SAFE_UNLINK);
44654+ }
44655+ }
44656+ exit:
44657+ context_set_commit_async(ctx);
44658+ reiser4_exit_context(ctx);
44659+ return result;
44660+}
44661+#endif
44662diff --git a/fs/reiser4/plugin/item/Makefile b/fs/reiser4/plugin/item/Makefile
44663new file mode 100644
44664index 0000000..1bae623
44665--- /dev/null
44666+++ b/fs/reiser4/plugin/item/Makefile
44667@@ -0,0 +1,18 @@
44668+obj-$(CONFIG_REISER4_FS) += item_plugins.o
44669+
44670+item_plugins-objs := \
44671+ item.o \
44672+ static_stat.o \
44673+ sde.o \
44674+ cde.o \
44675+ blackbox.o \
44676+ internal.o \
44677+ tail.o \
44678+ ctail.o \
44679+ extent.o \
44680+ extent_item_ops.o \
44681+ extent_file_ops.o \
44682+ extent_flush_ops.o
44683+
44684+
44685+
44686diff --git a/fs/reiser4/plugin/item/acl.h b/fs/reiser4/plugin/item/acl.h
44687new file mode 100644
44688index 0000000..f26762a
44689--- /dev/null
44690+++ b/fs/reiser4/plugin/item/acl.h
44691@@ -0,0 +1,66 @@
44692+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44693+
44694+/* Directory entry. */
44695+
44696+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
44697+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
44698+
44699+#include "../../forward.h"
44700+#include "../../dformat.h"
44701+#include "../../kassign.h"
44702+#include "../../key.h"
44703+
44704+#include <linux/fs.h>
44705+#include <linux/dcache.h> /* for struct dentry */
44706+
44707+typedef struct directory_entry_format {
44708+ /* key of object stat-data. It's not necessary to store whole
44709+ key here, because it's always key of stat-data, so minor
44710+ packing locality and offset can be omitted here. But this
44711+ relies on particular key allocation scheme for stat-data, so,
44712+ for extensibility sake, whole key can be stored here.
44713+
44714+ We store key as array of bytes, because we don't want 8-byte
44715+ alignment of dir entries.
44716+ */
44717+ obj_key_id id;
44718+ /* file name. Null terminated string. */
44719+ d8 name[0];
44720+} directory_entry_format;
44721+
44722+void print_de(const char *prefix, coord_t * coord);
44723+int extract_key_de(const coord_t * coord, reiser4_key * key);
44724+int update_key_de(const coord_t * coord, const reiser4_key * key,
44725+ lock_handle * lh);
44726+char *extract_name_de(const coord_t * coord, char *buf);
44727+unsigned extract_file_type_de(const coord_t * coord);
44728+int add_entry_de(struct inode *dir, coord_t * coord,
44729+ lock_handle * lh, const struct dentry *name,
44730+ reiser4_dir_entry_desc * entry);
44731+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
44732+ lock_handle * lh, reiser4_dir_entry_desc * entry);
44733+int max_name_len_de(const struct inode *dir);
44734+
44735+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
44736+
44737+char *extract_dent_name(const coord_t * coord,
44738+ directory_entry_format * dent, char *buf);
44739+
44740+#if REISER4_LARGE_KEY
44741+#define DE_NAME_BUF_LEN (24)
44742+#else
44743+#define DE_NAME_BUF_LEN (16)
44744+#endif
44745+
44746+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
44747+#endif
44748+
44749+/* Make Linus happy.
44750+ Local variables:
44751+ c-indentation-style: "K&R"
44752+ mode-name: "LC"
44753+ c-basic-offset: 8
44754+ tab-width: 8
44755+ fill-column: 120
44756+ End:
44757+*/
44758diff --git a/fs/reiser4/plugin/item/blackbox.c b/fs/reiser4/plugin/item/blackbox.c
44759new file mode 100644
44760index 0000000..f13ff64
44761--- /dev/null
44762+++ b/fs/reiser4/plugin/item/blackbox.c
44763@@ -0,0 +1,142 @@
44764+/* Copyright 2003 by Hans Reiser, licensing governed by
44765+ * reiser4/README */
44766+
44767+/* Black box item implementation */
44768+
44769+#include "../../forward.h"
44770+#include "../../debug.h"
44771+#include "../../dformat.h"
44772+#include "../../kassign.h"
44773+#include "../../coord.h"
44774+#include "../../tree.h"
44775+#include "../../lock.h"
44776+
44777+#include "blackbox.h"
44778+#include "item.h"
44779+#include "../plugin.h"
44780+
44781+int
44782+store_black_box(reiser4_tree * tree,
44783+ const reiser4_key * key, void *data, int length)
44784+{
44785+ int result;
44786+ reiser4_item_data idata;
44787+ coord_t coord;
44788+ lock_handle lh;
44789+
44790+ memset(&idata, 0, sizeof idata);
44791+
44792+ idata.data = data;
44793+ idata.user = 0;
44794+ idata.length = length;
44795+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
44796+
44797+ init_lh(&lh);
44798+ result = insert_by_key(tree, key,
44799+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
44800+
44801+ assert("nikita-3413",
44802+ ergo(result == 0,
44803+ WITH_COORD(&coord,
44804+ item_length_by_coord(&coord) == length)));
44805+
44806+ done_lh(&lh);
44807+ return result;
44808+}
44809+
44810+int
44811+load_black_box(reiser4_tree * tree,
44812+ reiser4_key * key, void *data, int length, int exact)
44813+{
44814+ int result;
44815+ coord_t coord;
44816+ lock_handle lh;
44817+
44818+ init_lh(&lh);
44819+ result = coord_by_key(tree, key,
44820+ &coord, &lh, ZNODE_READ_LOCK,
44821+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44822+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44823+
44824+ if (result == 0) {
44825+ int ilen;
44826+
44827+ result = zload(coord.node);
44828+ if (result == 0) {
44829+ ilen = item_length_by_coord(&coord);
44830+ if (ilen <= length) {
44831+ memcpy(data, item_body_by_coord(&coord), ilen);
44832+ unit_key_by_coord(&coord, key);
44833+ } else if (exact) {
44834+ /*
44835+ * item is larger than buffer provided by the
44836+ * user. Only issue a warning if @exact is
44837+ * set. If @exact is false, we are iterating
44838+ * over all safe-links and here we are reaching
44839+ * the end of the iteration.
44840+ */
44841+ warning("nikita-3415",
44842+ "Wrong black box length: %i > %i",
44843+ ilen, length);
44844+ result = RETERR(-EIO);
44845+ }
44846+ zrelse(coord.node);
44847+ }
44848+ }
44849+
44850+ done_lh(&lh);
44851+ return result;
44852+
44853+}
44854+
44855+int
44856+update_black_box(reiser4_tree * tree,
44857+ const reiser4_key * key, void *data, int length)
44858+{
44859+ int result;
44860+ coord_t coord;
44861+ lock_handle lh;
44862+
44863+ init_lh(&lh);
44864+ result = coord_by_key(tree, key,
44865+ &coord, &lh, ZNODE_READ_LOCK,
44866+ FIND_EXACT,
44867+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44868+ if (result == 0) {
44869+ int ilen;
44870+
44871+ result = zload(coord.node);
44872+ if (result == 0) {
44873+ ilen = item_length_by_coord(&coord);
44874+ if (length <= ilen) {
44875+ memcpy(item_body_by_coord(&coord), data,
44876+ length);
44877+ } else {
44878+ warning("nikita-3437",
44879+ "Wrong black box length: %i < %i",
44880+ ilen, length);
44881+ result = RETERR(-EIO);
44882+ }
44883+ zrelse(coord.node);
44884+ }
44885+ }
44886+
44887+ done_lh(&lh);
44888+ return result;
44889+
44890+}
44891+
44892+int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44893+{
44894+ return reiser4_cut_tree(tree, key, key, NULL, 1);
44895+}
44896+
44897+/* Make Linus happy.
44898+ Local variables:
44899+ c-indentation-style: "K&R"
44900+ mode-name: "LC"
44901+ c-basic-offset: 8
44902+ tab-width: 8
44903+ fill-column: 120
44904+ End:
44905+*/
44906diff --git a/fs/reiser4/plugin/item/blackbox.h b/fs/reiser4/plugin/item/blackbox.h
44907new file mode 100644
44908index 0000000..f5b7af3
44909--- /dev/null
44910+++ b/fs/reiser4/plugin/item/blackbox.h
44911@@ -0,0 +1,33 @@
44912+/* Copyright 2003 by Hans Reiser, licensing governed by
44913+ * reiser4/README */
44914+
44915+/* "Black box" entry to fixed-width contain user supplied data */
44916+
44917+#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44918+#define __FS_REISER4_BLACK_BOX_H__
44919+
44920+#include "../../forward.h"
44921+#include "../../dformat.h"
44922+#include "../../kassign.h"
44923+#include "../../key.h"
44924+
44925+extern int store_black_box(reiser4_tree * tree,
44926+ const reiser4_key * key, void *data, int length);
44927+extern int load_black_box(reiser4_tree * tree,
44928+ reiser4_key * key, void *data, int length, int exact);
44929+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44930+extern int update_black_box(reiser4_tree * tree,
44931+ const reiser4_key * key, void *data, int length);
44932+
44933+/* __FS_REISER4_BLACK_BOX_H__ */
44934+#endif
44935+
44936+/* Make Linus happy.
44937+ Local variables:
44938+ c-indentation-style: "K&R"
44939+ mode-name: "LC"
44940+ c-basic-offset: 8
44941+ tab-width: 8
44942+ fill-column: 120
44943+ End:
44944+*/
44945diff --git a/fs/reiser4/plugin/item/cde.c b/fs/reiser4/plugin/item/cde.c
44946new file mode 100644
44947index 0000000..05374ac
44948--- /dev/null
44949+++ b/fs/reiser4/plugin/item/cde.c
44950@@ -0,0 +1,1008 @@
44951+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44952+
44953+/* Directory entry implementation */
44954+
44955+/* DESCRIPTION:
44956+
44957+ This is "compound" directory item plugin implementation. This directory
44958+ item type is compound (as opposed to the "simple directory item" in
44959+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44960+ entries.
44961+
44962+ The reason behind this decision is disk space efficiency: all directory
44963+ entries inside the same directory have identical fragment in their
44964+ keys. This, of course, depends on key assignment policy. In our default key
44965+ assignment policy, all directory entries have the same locality which is
44966+ equal to the object id of their directory.
44967+
44968+ Composing directory item out of several directory entries for the same
44969+ directory allows us to store said key fragment only once. That is, this is
44970+ some ad hoc form of key compression (stem compression) that is implemented
44971+ here, because general key compression is not supposed to be implemented in
44972+ v4.0.
44973+
44974+ Another decision that was made regarding all directory item plugins, is
44975+ that they will store entry keys unaligned. This is for that sake of disk
44976+ space efficiency again.
44977+
44978+ In should be noted, that storing keys unaligned increases CPU consumption,
44979+ at least on some architectures.
44980+
44981+ Internal on-disk structure of the compound directory item is the following:
44982+
44983+ HEADER cde_item_format. Here number of entries is stored.
44984+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44985+ ENTRY_HEADER_1 offset of entry body are stored.
44986+ ENTRY_HEADER_2 (basically two last parts of key)
44987+ ...
44988+ ENTRY_HEADER_N
44989+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
44990+ ENTRY_BODY_1 NUL-terminated name are stored.
44991+ ENTRY_BODY_2 (part of statadta key in the
44992+ sence that since all SDs have
44993+ zero offset, this offset is not
44994+ stored on disk).
44995+ ...
44996+ ENTRY_BODY_N
44997+
44998+ When it comes to the balancing, each directory entry in compound directory
44999+ item is unit, that is, something that can be cut from one item and pasted
45000+ into another item of the same type. Handling of unit cut and paste is major
45001+ reason for the complexity of code below.
45002+
45003+*/
45004+
45005+#include "../../forward.h"
45006+#include "../../debug.h"
45007+#include "../../dformat.h"
45008+#include "../../kassign.h"
45009+#include "../../key.h"
45010+#include "../../coord.h"
45011+#include "sde.h"
45012+#include "cde.h"
45013+#include "item.h"
45014+#include "../node/node.h"
45015+#include "../plugin.h"
45016+#include "../../znode.h"
45017+#include "../../carry.h"
45018+#include "../../tree.h"
45019+#include "../../inode.h"
45020+
45021+#include <linux/fs.h> /* for struct inode */
45022+#include <linux/dcache.h> /* for struct dentry */
45023+#include <linux/quotaops.h>
45024+
45025+#if 0
45026+#define CHECKME(coord) \
45027+({ \
45028+ const char *message; \
45029+ coord_t dup; \
45030+ \
45031+ coord_dup_nocheck(&dup, (coord)); \
45032+ dup.unit_pos = 0; \
45033+ assert("nikita-2871", cde_check(&dup, &message) == 0); \
45034+})
45035+#else
45036+#define CHECKME(coord) noop
45037+#endif
45038+
45039+/* return body of compound directory item at @coord */
45040+static inline cde_item_format *formatted_at(const coord_t * coord)
45041+{
45042+ assert("nikita-1282", coord != NULL);
45043+ return item_body_by_coord(coord);
45044+}
45045+
45046+/* return entry header at @coord */
45047+static inline cde_unit_header *header_at(const coord_t *
45048+ coord /* coord of item */ ,
45049+ int idx /* index of unit */ )
45050+{
45051+ assert("nikita-1283", coord != NULL);
45052+ return &formatted_at(coord)->entry[idx];
45053+}
45054+
45055+/* return number of units in compound directory item at @coord */
45056+static int units(const coord_t * coord /* coord of item */ )
45057+{
45058+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
45059+}
45060+
45061+/* return offset of the body of @idx-th entry in @coord */
45062+static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
45063+ int idx /* index of unit */ )
45064+{
45065+ if (idx < units(coord))
45066+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
45067+ else if (idx == units(coord))
45068+ return item_length_by_coord(coord);
45069+ else
45070+ impossible("nikita-1308", "Wrong idx");
45071+ return 0;
45072+}
45073+
45074+/* set offset of the body of @idx-th entry in @coord */
45075+static void set_offset(const coord_t * coord /* coord of item */ ,
45076+ int idx /* index of unit */ ,
45077+ unsigned int offset /* new offset */ )
45078+{
45079+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
45080+}
45081+
45082+static void adj_offset(const coord_t * coord /* coord of item */ ,
45083+ int idx /* index of unit */ ,
45084+ int delta /* offset change */ )
45085+{
45086+ d16 *doffset;
45087+ __u16 offset;
45088+
45089+ doffset = &header_at(coord, idx)->offset;
45090+ offset = le16_to_cpu(get_unaligned(doffset));
45091+ offset += delta;
45092+ put_unaligned(cpu_to_le16((__u16) offset), doffset);
45093+}
45094+
45095+/* return pointer to @offset-th byte from the beginning of @coord */
45096+static char *address(const coord_t * coord /* coord of item */ ,
45097+ int offset)
45098+{
45099+ return ((char *)item_body_by_coord(coord)) + offset;
45100+}
45101+
45102+/* return pointer to the body of @idx-th entry in @coord */
45103+static directory_entry_format *entry_at(const coord_t * coord /* coord of
45104+ * item */ ,
45105+ int idx /* index of unit */ )
45106+{
45107+ return (directory_entry_format *) address(coord,
45108+ (int)offset_of(coord, idx));
45109+}
45110+
45111+/* return number of unit referenced by @coord */
45112+static int idx_of(const coord_t * coord /* coord of item */ )
45113+{
45114+ assert("nikita-1285", coord != NULL);
45115+ return coord->unit_pos;
45116+}
45117+
45118+/* find position where entry with @entry_key would be inserted into @coord */
45119+static int find(const coord_t * coord /* coord of item */ ,
45120+ const reiser4_key * entry_key /* key to look for */ ,
45121+ cmp_t * last /* result of last comparison */ )
45122+{
45123+ int entries;
45124+
45125+ int left;
45126+ int right;
45127+
45128+ cde_unit_header *header;
45129+
45130+ assert("nikita-1295", coord != NULL);
45131+ assert("nikita-1296", entry_key != NULL);
45132+ assert("nikita-1297", last != NULL);
45133+
45134+ entries = units(coord);
45135+ left = 0;
45136+ right = entries - 1;
45137+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
45138+ int median;
45139+
45140+ median = (left + right) >> 1;
45141+
45142+ header = header_at(coord, median);
45143+ *last = de_id_key_cmp(&header->hash, entry_key);
45144+ switch (*last) {
45145+ case LESS_THAN:
45146+ left = median;
45147+ break;
45148+ case GREATER_THAN:
45149+ right = median;
45150+ break;
45151+ case EQUAL_TO:{
45152+ do {
45153+ median--;
45154+ header--;
45155+ } while (median >= 0 &&
45156+ de_id_key_cmp(&header->hash,
45157+ entry_key) == EQUAL_TO);
45158+ return median + 1;
45159+ }
45160+ }
45161+ }
45162+ header = header_at(coord, left);
45163+ for (; left < entries; ++left, ++header) {
45164+ prefetch(header + 1);
45165+ *last = de_id_key_cmp(&header->hash, entry_key);
45166+ if (*last != LESS_THAN)
45167+ break;
45168+ }
45169+ if (left < entries)
45170+ return left;
45171+ else
45172+ return RETERR(-ENOENT);
45173+
45174+}
45175+
45176+/* expand @coord as to accommodate for insertion of @no new entries starting
45177+ from @pos, with total bodies size @size. */
45178+static int expand_item(const coord_t * coord /* coord of item */ ,
45179+ int pos /* unit position */ , int no /* number of new
45180+ * units*/ ,
45181+ int size /* total size of new units' data */ ,
45182+ unsigned int data_size /* free space already reserved
45183+ * in the item for insertion */ )
45184+{
45185+ int entries;
45186+ cde_unit_header *header;
45187+ char *dent;
45188+ int i;
45189+
45190+ assert("nikita-1310", coord != NULL);
45191+ assert("nikita-1311", pos >= 0);
45192+ assert("nikita-1312", no > 0);
45193+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
45194+ assert("nikita-1343",
45195+ item_length_by_coord(coord) >=
45196+ (int)(size + data_size + no * sizeof *header));
45197+
45198+ entries = units(coord);
45199+
45200+ if (pos == entries)
45201+ dent = address(coord, size);
45202+ else
45203+ dent = (char *)entry_at(coord, pos);
45204+ /* place where new header will be in */
45205+ header = header_at(coord, pos);
45206+ /* free space for new entry headers */
45207+ memmove(header + no, header,
45208+ (unsigned)(address(coord, size) - (char *)header));
45209+ /* if adding to the end initialise first new header */
45210+ if (pos == entries) {
45211+ set_offset(coord, pos, (unsigned)size);
45212+ }
45213+
45214+ /* adjust entry pointer and size */
45215+ dent = dent + no * sizeof *header;
45216+ size += no * sizeof *header;
45217+ /* free space for new entries */
45218+ memmove(dent + data_size, dent,
45219+ (unsigned)(address(coord, size) - dent));
45220+
45221+ /* increase counter */
45222+ entries += no;
45223+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
45224+
45225+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
45226+ bytes. */
45227+ for (i = 0; i <= pos; ++i)
45228+ adj_offset(coord, i, no * sizeof *header);
45229+ /* [ pos + no ... +\infty ) entries were shifted by ( no *
45230+ sizeof *header + data_size ) bytes */
45231+ for (i = pos + no; i < entries; ++i)
45232+ adj_offset(coord, i, no * sizeof *header + data_size);
45233+ return 0;
45234+}
45235+
45236+/* insert new @entry into item */
45237+static int expand(const coord_t * coord /* coord of item */ ,
45238+ cde_entry * entry /* entry to insert */ ,
45239+ int len /* length of @entry data */ ,
45240+ int *pos /* position to insert */ ,
45241+ reiser4_dir_entry_desc * dir_entry /* parameters for new
45242+ * entry */ )
45243+{
45244+ cmp_t cmp_res;
45245+ int datasize;
45246+
45247+ *pos = find(coord, &dir_entry->key, &cmp_res);
45248+ if (*pos < 0)
45249+ *pos = units(coord);
45250+
45251+ datasize = sizeof(directory_entry_format);
45252+ if (is_longname(entry->name->name, entry->name->len))
45253+ datasize += entry->name->len + 1;
45254+
45255+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
45256+ datasize);
45257+ return 0;
45258+}
45259+
45260+/* paste body of @entry into item */
45261+static int paste_entry(const coord_t * coord /* coord of item */ ,
45262+ cde_entry * entry /* new entry */ ,
45263+ int pos /* position to insert */ ,
45264+ reiser4_dir_entry_desc * dir_entry /* parameters for
45265+ * new entry */ )
45266+{
45267+ cde_unit_header *header;
45268+ directory_entry_format *dent;
45269+ const char *name;
45270+ int len;
45271+
45272+ header = header_at(coord, pos);
45273+ dent = entry_at(coord, pos);
45274+
45275+ build_de_id_by_key(&dir_entry->key, &header->hash);
45276+ build_inode_key_id(entry->obj, &dent->id);
45277+ /* AUDIT unsafe strcpy() operation! It should be replaced with
45278+ much less CPU hungry
45279+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
45280+
45281+ Also a more major thing is that there should be a way to figure out
45282+ amount of space in dent -> name and be able to check that we are
45283+ not going to overwrite more than we supposed to */
45284+ name = entry->name->name;
45285+ len = entry->name->len;
45286+ if (is_longname(name, len)) {
45287+ strcpy((unsigned char *)dent->name, name);
45288+ put_unaligned(0, &dent->name[len]);
45289+ }
45290+ return 0;
45291+}
45292+
45293+/* estimate how much space is necessary in item to insert/paste set of entries
45294+ described in @data. */
45295+int estimate_cde(const coord_t * coord /* coord of item */ ,
45296+ const reiser4_item_data * data /* parameters for new item */ )
45297+{
45298+ cde_entry_data *e;
45299+ int result;
45300+ int i;
45301+
45302+ e = (cde_entry_data *) data->data;
45303+
45304+ assert("nikita-1288", e != NULL);
45305+ assert("nikita-1289", e->num_of_entries >= 0);
45306+
45307+ if (coord == NULL)
45308+ /* insert */
45309+ result = sizeof(cde_item_format);
45310+ else
45311+ /* paste */
45312+ result = 0;
45313+
45314+ result += e->num_of_entries *
45315+ (sizeof(cde_unit_header) + sizeof(directory_entry_format));
45316+ for (i = 0; i < e->num_of_entries; ++i) {
45317+ const char *name;
45318+ int len;
45319+
45320+ name = e->entry[i].name->name;
45321+ len = e->entry[i].name->len;
45322+ assert("nikita-2054", strlen(name) == len);
45323+ if (is_longname(name, len))
45324+ result += len + 1;
45325+ }
45326+ ((reiser4_item_data *) data)->length = result;
45327+ return result;
45328+}
45329+
45330+/* ->nr_units() method for this item plugin. */
45331+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
45332+{
45333+ return units(coord);
45334+}
45335+
45336+/* ->unit_key() method for this item plugin. */
45337+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
45338+ reiser4_key * key /* resulting key */ )
45339+{
45340+ assert("nikita-1452", coord != NULL);
45341+ assert("nikita-1345", idx_of(coord) < units(coord));
45342+ assert("nikita-1346", key != NULL);
45343+
45344+ item_key_by_coord(coord, key);
45345+ extract_key_from_de_id(extract_dir_id_from_key(key),
45346+ &header_at(coord, idx_of(coord))->hash, key);
45347+ return key;
45348+}
45349+
45350+/* mergeable_cde(): implementation of ->mergeable() item method.
45351+
45352+ Two directory items are mergeable iff they are from the same
45353+ directory. That simple.
45354+
45355+*/
45356+int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
45357+ const coord_t * p2 /* coord of second item */ )
45358+{
45359+ reiser4_key k1;
45360+ reiser4_key k2;
45361+
45362+ assert("nikita-1339", p1 != NULL);
45363+ assert("nikita-1340", p2 != NULL);
45364+
45365+ return
45366+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
45367+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
45368+ extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
45369+
45370+}
45371+
45372+/* ->max_key_inside() method for this item plugin. */
45373+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
45374+ reiser4_key * result /* resulting key */ )
45375+{
45376+ assert("nikita-1342", coord != NULL);
45377+
45378+ item_key_by_coord(coord, result);
45379+ set_key_ordering(result, get_key_ordering(reiser4_max_key()));
45380+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
45381+ set_key_offset(result, get_key_offset(reiser4_max_key()));
45382+ return result;
45383+}
45384+
45385+/* @data contains data which are to be put into tree */
45386+int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
45387+ const reiser4_key * key /* key to check */ ,
45388+ const reiser4_item_data * data /* parameters of new
45389+ * item/unit being
45390+ * created */ )
45391+{
45392+ reiser4_key item_key;
45393+
45394+ /* FIXME-VS: do not rely on anything but iplug field of @data. Only
45395+ data->iplug is initialized */
45396+ assert("vs-457", data && data->iplug);
45397+/* assert( "vs-553", data -> user == 0 );*/
45398+ item_key_by_coord(coord, &item_key);
45399+
45400+ return (item_plugin_by_coord(coord) == data->iplug) &&
45401+ (extract_dir_id_from_key(&item_key) ==
45402+ extract_dir_id_from_key(key));
45403+}
45404+
45405+#if REISER4_DEBUG
45406+/* cde_check ->check() method for compressed directory items
45407+
45408+ used for debugging, every item should have here the most complete
45409+ possible check of the consistency of the item that the inventor can
45410+ construct
45411+*/
45412+int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
45413+ const char **error /* where to store error message */)
45414+{
45415+ int i;
45416+ int result;
45417+ char *item_start;
45418+ char *item_end;
45419+ reiser4_key key;
45420+
45421+ coord_t c;
45422+
45423+ assert("nikita-1357", coord != NULL);
45424+ assert("nikita-1358", error != NULL);
45425+
45426+ if (!ergo(coord->item_pos != 0,
45427+ is_dot_key(item_key_by_coord(coord, &key)))) {
45428+ *error = "CDE doesn't start with dot";
45429+ return -1;
45430+ }
45431+ item_start = item_body_by_coord(coord);
45432+ item_end = item_start + item_length_by_coord(coord);
45433+
45434+ coord_dup(&c, coord);
45435+ result = 0;
45436+ for (i = 0; i < units(coord); ++i) {
45437+ directory_entry_format *entry;
45438+
45439+ if ((char *)(header_at(coord, i) + 1) >
45440+ item_end - units(coord) * sizeof *entry) {
45441+ *error = "CDE header is out of bounds";
45442+ result = -1;
45443+ break;
45444+ }
45445+ entry = entry_at(coord, i);
45446+ if ((char *)entry < item_start + sizeof(cde_item_format)) {
45447+ *error = "CDE header is too low";
45448+ result = -1;
45449+ break;
45450+ }
45451+ if ((char *)(entry + 1) > item_end) {
45452+ *error = "CDE header is too high";
45453+ result = -1;
45454+ break;
45455+ }
45456+ }
45457+
45458+ return result;
45459+}
45460+#endif
45461+
45462+/* ->init() method for this item plugin. */
45463+int init_cde(coord_t * coord /* coord of item */ ,
45464+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
45465+ UNUSED_ARG)
45466+{
45467+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
45468+ return 0;
45469+}
45470+
45471+/* ->lookup() method for this item plugin. */
45472+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
45473+ lookup_bias bias /* search bias */ ,
45474+ coord_t * coord /* coord of item to lookup in */ )
45475+{
45476+ cmp_t last_comp;
45477+ int pos;
45478+
45479+ reiser4_key utmost_key;
45480+
45481+ assert("nikita-1293", coord != NULL);
45482+ assert("nikita-1294", key != NULL);
45483+
45484+ CHECKME(coord);
45485+
45486+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
45487+ coord->unit_pos = 0;
45488+ coord->between = BEFORE_UNIT;
45489+ return CBK_COORD_NOTFOUND;
45490+ }
45491+ pos = find(coord, key, &last_comp);
45492+ if (pos >= 0) {
45493+ coord->unit_pos = (int)pos;
45494+ switch (last_comp) {
45495+ case EQUAL_TO:
45496+ coord->between = AT_UNIT;
45497+ return CBK_COORD_FOUND;
45498+ case GREATER_THAN:
45499+ coord->between = BEFORE_UNIT;
45500+ return RETERR(-ENOENT);
45501+ case LESS_THAN:
45502+ default:
45503+ impossible("nikita-1298", "Broken find");
45504+ return RETERR(-EIO);
45505+ }
45506+ } else {
45507+ coord->unit_pos = units(coord) - 1;
45508+ coord->between = AFTER_UNIT;
45509+ return (bias ==
45510+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
45511+ CBK_COORD_NOTFOUND;
45512+ }
45513+}
45514+
45515+/* ->paste() method for this item plugin. */
45516+int paste_cde(coord_t * coord /* coord of item */ ,
45517+ reiser4_item_data * data /* parameters of new unit being
45518+ * inserted */ ,
45519+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
45520+{
45521+ cde_entry_data *e;
45522+ int result;
45523+ int i;
45524+
45525+ CHECKME(coord);
45526+ e = (cde_entry_data *) data->data;
45527+
45528+ result = 0;
45529+ for (i = 0; i < e->num_of_entries; ++i) {
45530+ int pos;
45531+ int phantom_size;
45532+
45533+ phantom_size = data->length;
45534+ if (units(coord) == 0)
45535+ phantom_size -= sizeof(cde_item_format);
45536+
45537+ result =
45538+ expand(coord, e->entry + i, phantom_size, &pos, data->arg);
45539+ if (result != 0)
45540+ break;
45541+ result = paste_entry(coord, e->entry + i, pos, data->arg);
45542+ if (result != 0)
45543+ break;
45544+ }
45545+ CHECKME(coord);
45546+ return result;
45547+}
45548+
45549+/* amount of space occupied by all entries starting from @idx both headers and
45550+ bodies. */
45551+static unsigned int part_size(const coord_t * coord /* coord of item */ ,
45552+ int idx /* index of unit */ )
45553+{
45554+ assert("nikita-1299", coord != NULL);
45555+ assert("nikita-1300", idx < (int)units(coord));
45556+
45557+ return sizeof(cde_item_format) +
45558+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
45559+ idx + 1) -
45560+ offset_of(coord, 0);
45561+}
45562+
45563+/* how many but not more than @want units of @source can be merged with
45564+ item in @target node. If pend == append - we try to append last item
45565+ of @target by first units of @source. If pend == prepend - we try to
45566+ "prepend" first item in @target by last units of @source. @target
45567+ node has @free_space bytes of free space. Total size of those units
45568+ are returned via @size */
45569+int can_shift_cde(unsigned free_space /* free space in item */ ,
45570+ coord_t * coord /* coord of source item */ ,
45571+ znode * target /* target node */ ,
45572+ shift_direction pend /* shift direction */ ,
45573+ unsigned *size /* resulting number of shifted bytes */ ,
45574+ unsigned want /* maximal number of bytes to shift */ )
45575+{
45576+ int shift;
45577+
45578+ CHECKME(coord);
45579+ if (want == 0) {
45580+ *size = 0;
45581+ return 0;
45582+ }
45583+
45584+ /* pend == SHIFT_LEFT <==> shifting to the left */
45585+ if (pend == SHIFT_LEFT) {
45586+ for (shift = min((int)want - 1, units(coord)); shift >= 0;
45587+ --shift) {
45588+ *size = part_size(coord, shift);
45589+ if (target != NULL)
45590+ *size -= sizeof(cde_item_format);
45591+ if (*size <= free_space)
45592+ break;
45593+ }
45594+ shift = shift + 1;
45595+ } else {
45596+ int total_size;
45597+
45598+ assert("nikita-1301", pend == SHIFT_RIGHT);
45599+
45600+ total_size = item_length_by_coord(coord);
45601+ for (shift = units(coord) - want - 1; shift < units(coord) - 1;
45602+ ++shift) {
45603+ *size = total_size - part_size(coord, shift);
45604+ if (target == NULL)
45605+ *size += sizeof(cde_item_format);
45606+ if (*size <= free_space)
45607+ break;
45608+ }
45609+ shift = units(coord) - shift - 1;
45610+ }
45611+ if (shift == 0)
45612+ *size = 0;
45613+ CHECKME(coord);
45614+ return shift;
45615+}
45616+
45617+/* ->copy_units() method for this item plugin. */
45618+void copy_units_cde(coord_t * target /* coord of target item */ ,
45619+ coord_t * source /* coord of source item */ ,
45620+ unsigned from /* starting unit */ ,
45621+ unsigned count /* how many units to copy */ ,
45622+ shift_direction where_is_free_space /* shift direction */ ,
45623+ unsigned free_space /* free space in item */ )
45624+{
45625+ char *header_from;
45626+ char *header_to;
45627+
45628+ char *entry_from;
45629+ char *entry_to;
45630+
45631+ int pos_in_target;
45632+ int data_size;
45633+ int data_delta;
45634+ int i;
45635+
45636+ assert("nikita-1303", target != NULL);
45637+ assert("nikita-1304", source != NULL);
45638+ assert("nikita-1305", (int)from < units(source));
45639+ assert("nikita-1307", (int)(from + count) <= units(source));
45640+
45641+ if (where_is_free_space == SHIFT_LEFT) {
45642+ assert("nikita-1453", from == 0);
45643+ pos_in_target = units(target);
45644+ } else {
45645+ assert("nikita-1309", (int)(from + count) == units(source));
45646+ pos_in_target = 0;
45647+ memmove(item_body_by_coord(target),
45648+ (char *)item_body_by_coord(target) + free_space,
45649+ item_length_by_coord(target) - free_space);
45650+ }
45651+
45652+ CHECKME(target);
45653+ CHECKME(source);
45654+
45655+ /* expand @target */
45656+ data_size =
45657+ offset_of(source, (int)(from + count)) - offset_of(source,
45658+ (int)from);
45659+
45660+ if (units(target) == 0)
45661+ free_space -= sizeof(cde_item_format);
45662+
45663+ expand_item(target, pos_in_target, (int)count,
45664+ (int)(item_length_by_coord(target) - free_space),
45665+ (unsigned)data_size);
45666+
45667+ /* copy first @count units of @source into @target */
45668+ data_delta =
45669+ offset_of(target, pos_in_target) - offset_of(source, (int)from);
45670+
45671+ /* copy entries */
45672+ entry_from = (char *)entry_at(source, (int)from);
45673+ entry_to = (char *)entry_at(source, (int)(from + count));
45674+ memmove(entry_at(target, pos_in_target), entry_from,
45675+ (unsigned)(entry_to - entry_from));
45676+
45677+ /* copy headers */
45678+ header_from = (char *)header_at(source, (int)from);
45679+ header_to = (char *)header_at(source, (int)(from + count));
45680+ memmove(header_at(target, pos_in_target), header_from,
45681+ (unsigned)(header_to - header_from));
45682+
45683+ /* update offsets */
45684+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
45685+ adj_offset(target, i, data_delta);
45686+ CHECKME(target);
45687+ CHECKME(source);
45688+}
45689+
45690+/* ->cut_units() method for this item plugin. */
45691+int cut_units_cde(coord_t * coord /* coord of item */ ,
45692+ pos_in_node_t from /* start unit pos */ ,
45693+ pos_in_node_t to /* stop unit pos */ ,
45694+ struct carry_cut_data *cdata UNUSED_ARG,
45695+ reiser4_key * smallest_removed, reiser4_key * new_first)
45696+{
45697+ char *header_from;
45698+ char *header_to;
45699+
45700+ char *entry_from;
45701+ char *entry_to;
45702+
45703+ int size;
45704+ int entry_delta;
45705+ int header_delta;
45706+ int i;
45707+
45708+ unsigned count;
45709+
45710+ CHECKME(coord);
45711+
45712+ count = to - from + 1;
45713+
45714+ assert("nikita-1454", coord != NULL);
45715+ assert("nikita-1455", (int)(from + count) <= units(coord));
45716+
45717+ if (smallest_removed)
45718+ unit_key_by_coord(coord, smallest_removed);
45719+
45720+ if (new_first) {
45721+ coord_t next;
45722+
45723+ /* not everything is cut from item head */
45724+ assert("vs-1527", from == 0);
45725+ assert("vs-1528", to < units(coord) - 1);
45726+
45727+ coord_dup(&next, coord);
45728+ next.unit_pos++;
45729+ unit_key_by_coord(&next, new_first);
45730+ }
45731+
45732+ size = item_length_by_coord(coord);
45733+ if (count == (unsigned)units(coord)) {
45734+ return size;
45735+ }
45736+
45737+ header_from = (char *)header_at(coord, (int)from);
45738+ header_to = (char *)header_at(coord, (int)(from + count));
45739+
45740+ entry_from = (char *)entry_at(coord, (int)from);
45741+ entry_to = (char *)entry_at(coord, (int)(from + count));
45742+
45743+ /* move headers */
45744+ memmove(header_from, header_to,
45745+ (unsigned)(address(coord, size) - header_to));
45746+
45747+ header_delta = header_to - header_from;
45748+
45749+ entry_from -= header_delta;
45750+ entry_to -= header_delta;
45751+ size -= header_delta;
45752+
45753+ /* copy entries */
45754+ memmove(entry_from, entry_to,
45755+ (unsigned)(address(coord, size) - entry_to));
45756+
45757+ entry_delta = entry_to - entry_from;
45758+ size -= entry_delta;
45759+
45760+ /* update offsets */
45761+
45762+ for (i = 0; i < (int)from; ++i)
45763+ adj_offset(coord, i, -header_delta);
45764+
45765+ for (i = from; i < units(coord) - (int)count; ++i)
45766+ adj_offset(coord, i, -header_delta - entry_delta);
45767+
45768+ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
45769+ &formatted_at(coord)->num_of_entries);
45770+
45771+ if (from == 0) {
45772+ /* entries from head was removed - move remaining to right */
45773+ memmove((char *)item_body_by_coord(coord) +
45774+ header_delta + entry_delta, item_body_by_coord(coord),
45775+ (unsigned)size);
45776+ if (REISER4_DEBUG)
45777+ memset(item_body_by_coord(coord), 0,
45778+ (unsigned)header_delta + entry_delta);
45779+ } else {
45780+ /* freed space is already at the end of item */
45781+ if (REISER4_DEBUG)
45782+ memset((char *)item_body_by_coord(coord) + size, 0,
45783+ (unsigned)header_delta + entry_delta);
45784+ }
45785+
45786+ return header_delta + entry_delta;
45787+}
45788+
45789+int kill_units_cde(coord_t * coord /* coord of item */ ,
45790+ pos_in_node_t from /* start unit pos */ ,
45791+ pos_in_node_t to /* stop unit pos */ ,
45792+ struct carry_kill_data *kdata UNUSED_ARG,
45793+ reiser4_key * smallest_removed, reiser4_key * new_first)
45794+{
45795+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
45796+}
45797+
45798+/* ->s.dir.extract_key() method for this item plugin. */
45799+int extract_key_cde(const coord_t * coord /* coord of item */ ,
45800+ reiser4_key * key /* resulting key */ )
45801+{
45802+ directory_entry_format *dent;
45803+
45804+ assert("nikita-1155", coord != NULL);
45805+ assert("nikita-1156", key != NULL);
45806+
45807+ dent = entry_at(coord, idx_of(coord));
45808+ return extract_key_from_id(&dent->id, key);
45809+}
45810+
45811+int
45812+update_key_cde(const coord_t * coord, const reiser4_key * key,
45813+ lock_handle * lh UNUSED_ARG)
45814+{
45815+ directory_entry_format *dent;
45816+ obj_key_id obj_id;
45817+ int result;
45818+
45819+ assert("nikita-2344", coord != NULL);
45820+ assert("nikita-2345", key != NULL);
45821+
45822+ dent = entry_at(coord, idx_of(coord));
45823+ result = build_obj_key_id(key, &obj_id);
45824+ if (result == 0) {
45825+ dent->id = obj_id;
45826+ znode_make_dirty(coord->node);
45827+ }
45828+ return 0;
45829+}
45830+
45831+/* ->s.dir.extract_name() method for this item plugin. */
45832+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45833+{
45834+ directory_entry_format *dent;
45835+
45836+ assert("nikita-1157", coord != NULL);
45837+
45838+ dent = entry_at(coord, idx_of(coord));
45839+ return extract_dent_name(coord, dent, buf);
45840+}
45841+
45842+static int cde_bytes(int pasting, const reiser4_item_data * data)
45843+{
45844+ int result;
45845+
45846+ result = data->length;
45847+ if (!pasting)
45848+ result -= sizeof(cde_item_format);
45849+ return result;
45850+}
45851+
45852+/* ->s.dir.add_entry() method for this item plugin */
45853+int add_entry_cde(struct inode *dir /* directory object */ ,
45854+ coord_t * coord /* coord of item */ ,
45855+ lock_handle * lh /* lock handle for insertion */ ,
45856+ const struct dentry *name /* name to insert */ ,
45857+ reiser4_dir_entry_desc * dir_entry /* parameters of new
45858+ * directory entry */ )
45859+{
45860+ reiser4_item_data data;
45861+ cde_entry entry;
45862+ cde_entry_data edata;
45863+ int result;
45864+
45865+ assert("nikita-1656", coord->node == lh->node);
45866+ assert("nikita-1657", znode_is_write_locked(coord->node));
45867+
45868+ edata.num_of_entries = 1;
45869+ edata.entry = &entry;
45870+
45871+ entry.dir = dir;
45872+ entry.obj = dir_entry->obj;
45873+ entry.name = &name->d_name;
45874+
45875+ data.data = (char *)&edata;
45876+ data.user = 0; /* &edata is not user space */
45877+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45878+ data.arg = dir_entry;
45879+ assert("nikita-1302", data.iplug != NULL);
45880+
45881+ result = is_dot_key(&dir_entry->key);
45882+ data.length = estimate_cde(result ? coord : NULL, &data);
45883+
45884+ /* NOTE-NIKITA quota plugin? */
45885+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45886+ return RETERR(-EDQUOT);
45887+
45888+ if (result)
45889+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45890+ else
45891+ result = reiser4_resize_item(coord, &data, &dir_entry->key,
45892+ lh, 0);
45893+ return result;
45894+}
45895+
45896+/* ->s.dir.rem_entry() */
45897+int rem_entry_cde(struct inode *dir /* directory of item */ ,
45898+ const struct qstr *name, coord_t * coord /* coord of item */ ,
45899+ lock_handle * lh UNUSED_ARG /* lock handle for
45900+ * removal */ ,
45901+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45902+ * directory entry
45903+ * being removed */ )
45904+{
45905+ coord_t shadow;
45906+ int result;
45907+ int length;
45908+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45909+
45910+ assert("nikita-2870", strlen(name->name) == name->len);
45911+ assert("nikita-2869",
45912+ !strcmp(name->name, extract_name_cde(coord, buf)));
45913+
45914+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45915+ if (is_longname(name->name, name->len))
45916+ length += name->len + 1;
45917+
45918+ if (inode_get_bytes(dir) < length) {
45919+ warning("nikita-2628", "Dir is broke: %llu: %llu",
45920+ (unsigned long long)get_inode_oid(dir),
45921+ inode_get_bytes(dir));
45922+
45923+ return RETERR(-EIO);
45924+ }
45925+
45926+ /* cut_node() is supposed to take pointers to _different_
45927+ coords, because it will modify them without respect to
45928+ possible aliasing. To work around this, create temporary copy
45929+ of @coord.
45930+ */
45931+ coord_dup(&shadow, coord);
45932+ result =
45933+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45934+ if (result == 0) {
45935+ /* NOTE-NIKITA quota plugin? */
45936+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
45937+ }
45938+ return result;
45939+}
45940+
45941+/* ->s.dir.max_name_len() method for this item plugin */
45942+int max_name_len_cde(const struct inode *dir /* directory */ )
45943+{
45944+ return
45945+ reiser4_tree_by_inode(dir)->nplug->max_item_size() -
45946+ sizeof(directory_entry_format) - sizeof(cde_item_format) -
45947+ sizeof(cde_unit_header) - 2;
45948+}
45949+
45950+/* Make Linus happy.
45951+ Local variables:
45952+ c-indentation-style: "K&R"
45953+ mode-name: "LC"
45954+ c-basic-offset: 8
45955+ tab-width: 8
45956+ fill-column: 120
45957+ End:
45958+*/
45959diff --git a/fs/reiser4/plugin/item/cde.h b/fs/reiser4/plugin/item/cde.h
45960new file mode 100644
45961index 0000000..73a30d5
45962--- /dev/null
45963+++ b/fs/reiser4/plugin/item/cde.h
45964@@ -0,0 +1,87 @@
45965+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45966+
45967+/* Compound directory item. See cde.c for description. */
45968+
45969+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45970+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45971+
45972+#include "../../forward.h"
45973+#include "../../kassign.h"
45974+#include "../../dformat.h"
45975+
45976+#include <linux/fs.h> /* for struct inode */
45977+#include <linux/dcache.h> /* for struct dentry, etc */
45978+
45979+typedef struct cde_unit_header {
45980+ de_id hash;
45981+ d16 offset;
45982+} cde_unit_header;
45983+
45984+typedef struct cde_item_format {
45985+ d16 num_of_entries;
45986+ cde_unit_header entry[0];
45987+} cde_item_format;
45988+
45989+typedef struct cde_entry {
45990+ const struct inode *dir;
45991+ const struct inode *obj;
45992+ const struct qstr *name;
45993+} cde_entry;
45994+
45995+typedef struct cde_entry_data {
45996+ int num_of_entries;
45997+ cde_entry *entry;
45998+} cde_entry_data;
45999+
46000+/* plugin->item.b.* */
46001+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
46002+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
46003+ const reiser4_item_data *);
46004+int mergeable_cde(const coord_t * p1, const coord_t * p2);
46005+pos_in_node_t nr_units_cde(const coord_t * coord);
46006+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
46007+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
46008+void print_cde(const char *prefix, coord_t * coord);
46009+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
46010+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
46011+ coord_t * coord);
46012+int paste_cde(coord_t * coord, reiser4_item_data * data,
46013+ carry_plugin_info * info UNUSED_ARG);
46014+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
46015+ shift_direction pend, unsigned *size, unsigned want);
46016+void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
46017+ unsigned count, shift_direction where_is_free_space,
46018+ unsigned free_space);
46019+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46020+ struct carry_cut_data *, reiser4_key * smallest_removed,
46021+ reiser4_key * new_first);
46022+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46023+ struct carry_kill_data *, reiser4_key * smallest_removed,
46024+ reiser4_key * new_first);
46025+void print_cde(const char *prefix, coord_t * coord);
46026+int reiser4_check_cde(const coord_t * coord, const char **error);
46027+
46028+/* plugin->u.item.s.dir.* */
46029+int extract_key_cde(const coord_t * coord, reiser4_key * key);
46030+int update_key_cde(const coord_t * coord, const reiser4_key * key,
46031+ lock_handle * lh);
46032+char *extract_name_cde(const coord_t * coord, char *buf);
46033+int add_entry_cde(struct inode *dir, coord_t * coord,
46034+ lock_handle * lh, const struct dentry *name,
46035+ reiser4_dir_entry_desc * entry);
46036+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
46037+ lock_handle * lh, reiser4_dir_entry_desc * entry);
46038+int max_name_len_cde(const struct inode *dir);
46039+
46040+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
46041+#endif
46042+
46043+/* Make Linus happy.
46044+ Local variables:
46045+ c-indentation-style: "K&R"
46046+ mode-name: "LC"
46047+ c-basic-offset: 8
46048+ tab-width: 8
46049+ fill-column: 120
46050+ End:
46051+*/
46052diff --git a/fs/reiser4/plugin/item/ctail.c b/fs/reiser4/plugin/item/ctail.c
46053new file mode 100644
46054index 0000000..9cb8eca
46055--- /dev/null
46056+++ b/fs/reiser4/plugin/item/ctail.c
46057@@ -0,0 +1,1570 @@
46058+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46059+
46060+/* ctails (aka "clustered tails") are items for cryptcompress objects */
46061+
46062+/* DESCRIPTION:
46063+
46064+Each cryptcompress object is stored on disk as a set of clusters sliced
46065+into ctails.
46066+
46067+Internal on-disk structure:
46068+
46069+ HEADER (1) Here stored disk cluster shift
46070+ BODY
46071+*/
46072+
46073+#include "../../forward.h"
46074+#include "../../debug.h"
46075+#include "../../dformat.h"
46076+#include "../../kassign.h"
46077+#include "../../key.h"
46078+#include "../../coord.h"
46079+#include "item.h"
46080+#include "../node/node.h"
46081+#include "../plugin.h"
46082+#include "../object.h"
46083+#include "../../znode.h"
46084+#include "../../carry.h"
46085+#include "../../tree.h"
46086+#include "../../inode.h"
46087+#include "../../super.h"
46088+#include "../../context.h"
46089+#include "../../page_cache.h"
46090+#include "../cluster.h"
46091+#include "../../flush.h"
46092+#include "../../tree_walk.h"
46093+
46094+#include <linux/pagevec.h>
46095+#include <linux/swap.h>
46096+#include <linux/fs.h>
46097+
46098+/* return body of ctail item at @coord */
46099+static ctail_item_format *ctail_formatted_at(const coord_t * coord)
46100+{
46101+ assert("edward-60", coord != NULL);
46102+ return item_body_by_coord(coord);
46103+}
46104+
46105+static int cluster_shift_by_coord(const coord_t * coord)
46106+{
46107+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
46108+}
46109+
46110+static inline void dclust_set_extension_shift(hint_t * hint)
46111+{
46112+ assert("edward-1270",
46113+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
46114+ hint->ext_coord.extension.ctail.shift =
46115+ cluster_shift_by_coord(&hint->ext_coord.coord);
46116+}
46117+
46118+static loff_t off_by_coord(const coord_t * coord)
46119+{
46120+ reiser4_key key;
46121+ return get_key_offset(item_key_by_coord(coord, &key));
46122+}
46123+
46124+int coord_is_unprepped_ctail(const coord_t * coord)
46125+{
46126+ assert("edward-1233", coord != NULL);
46127+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
46128+ assert("edward-1235",
46129+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
46130+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
46131+
46132+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
46133+}
46134+
46135+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
46136+{
46137+ int shift;
46138+
46139+ if (inode != NULL) {
46140+ shift = inode_cluster_shift(inode);
46141+ assert("edward-1236",
46142+ ergo(!coord_is_unprepped_ctail(coord),
46143+ shift == cluster_shift_by_coord(coord)));
46144+ } else {
46145+ assert("edward-1237", !coord_is_unprepped_ctail(coord));
46146+ shift = cluster_shift_by_coord(coord);
46147+ }
46148+ return off_by_coord(coord) >> shift;
46149+}
46150+
46151+static int disk_cluster_size(const coord_t * coord)
46152+{
46153+ assert("edward-1156",
46154+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46155+ /* calculation of disk cluster size
46156+ is meaninless if ctail is unprepped */
46157+ assert("edward-1238", !coord_is_unprepped_ctail(coord));
46158+
46159+ return 1 << cluster_shift_by_coord(coord);
46160+}
46161+
46162+/* true if the key is of first disk cluster item */
46163+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
46164+{
46165+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
46166+
46167+ return coord_is_unprepped_ctail(coord) ||
46168+ ((get_key_offset(key) &
46169+ ((loff_t) disk_cluster_size(coord) - 1)) == 0);
46170+}
46171+
46172+static char *first_unit(coord_t * coord)
46173+{
46174+ /* FIXME: warning: pointer of type `void *' used in arithmetic */
46175+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
46176+}
46177+
46178+/* plugin->u.item.b.max_key_inside :
46179+ tail_max_key_inside */
46180+
46181+/* plugin->u.item.b.can_contain_key */
46182+int
46183+can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
46184+ const reiser4_item_data * data)
46185+{
46186+ reiser4_key item_key;
46187+
46188+ if (item_plugin_by_coord(coord) != data->iplug)
46189+ return 0;
46190+
46191+ item_key_by_coord(coord, &item_key);
46192+ if (get_key_locality(key) != get_key_locality(&item_key) ||
46193+ get_key_objectid(key) != get_key_objectid(&item_key))
46194+ return 0;
46195+ if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
46196+ get_key_offset(key))
46197+ return 0;
46198+ if (is_disk_cluster_key(key, coord))
46199+ return 0;
46200+ return 1;
46201+}
46202+
46203+/* plugin->u.item.b.mergeable
46204+ c-tails of different clusters are not mergeable */
46205+int mergeable_ctail(const coord_t * p1, const coord_t * p2)
46206+{
46207+ reiser4_key key1, key2;
46208+
46209+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
46210+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
46211+ UNIX_FILE_METADATA_ITEM_TYPE));
46212+
46213+ if (item_id_by_coord(p2) != CTAIL_ID) {
46214+ /* second item is of another type */
46215+ return 0;
46216+ }
46217+
46218+ item_key_by_coord(p1, &key1);
46219+ item_key_by_coord(p2, &key2);
46220+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
46221+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
46222+ get_key_type(&key1) != get_key_type(&key2)) {
46223+ /* items of different objects */
46224+ return 0;
46225+ }
46226+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
46227+ /* not adjacent items */
46228+ return 0;
46229+ if (is_disk_cluster_key(&key2, p2))
46230+ return 0;
46231+ return 1;
46232+}
46233+
46234+/* plugin->u.item.b.nr_units */
46235+pos_in_node_t nr_units_ctail(const coord_t * coord)
46236+{
46237+ return (item_length_by_coord(coord) -
46238+ sizeof(ctail_formatted_at(coord)->cluster_shift));
46239+}
46240+
46241+/* plugin->u.item.b.estimate:
46242+ estimate how much space is needed to insert/paste @data->length bytes
46243+ into ctail at @coord */
46244+int estimate_ctail(const coord_t * coord /* coord of item */ ,
46245+ const reiser4_item_data *
46246+ data /* parameters for new item */ )
46247+{
46248+ if (coord == NULL)
46249+ /* insert */
46250+ return (sizeof(ctail_item_format) + data->length);
46251+ else
46252+ /* paste */
46253+ return data->length;
46254+}
46255+
46256+/* ->init() method for this item plugin. */
46257+int init_ctail(coord_t * to /* coord of item */ ,
46258+ coord_t * from /* old_item */ ,
46259+ reiser4_item_data * data /* structure used for insertion */ )
46260+{
46261+ int cluster_shift; /* cpu value to convert */
46262+
46263+ if (data) {
46264+ assert("edward-463", data->length > sizeof(ctail_item_format));
46265+ cluster_shift = *((int *)(data->arg));
46266+ data->length -= sizeof(ctail_item_format);
46267+ } else {
46268+ assert("edward-464", from != NULL);
46269+ assert("edward-855", ctail_ok(from));
46270+ cluster_shift = (int)(cluster_shift_by_coord(from));
46271+ }
46272+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
46273+ assert("edward-856", ctail_ok(to));
46274+ return 0;
46275+}
46276+
46277+/* plugin->u.item.b.lookup:
46278+ NULL: We are looking for item keys only */
46279+
46280+#if REISER4_DEBUG
46281+int ctail_ok(const coord_t * coord)
46282+{
46283+ return coord_is_unprepped_ctail(coord) ||
46284+ cluster_shift_ok(cluster_shift_by_coord(coord));
46285+}
46286+
46287+/* plugin->u.item.b.check */
46288+int check_ctail(const coord_t * coord, const char **error)
46289+{
46290+ if (!ctail_ok(coord)) {
46291+ if (error)
46292+ *error = "bad cluster shift in ctail";
46293+ return 1;
46294+ }
46295+ return 0;
46296+}
46297+#endif
46298+
46299+/* plugin->u.item.b.paste */
46300+int
46301+paste_ctail(coord_t * coord, reiser4_item_data * data,
46302+ carry_plugin_info * info UNUSED_ARG)
46303+{
46304+ unsigned old_nr_units;
46305+
46306+ assert("edward-268", data->data != NULL);
46307+ /* copy only from kernel space */
46308+ assert("edward-66", data->user == 0);
46309+
46310+ old_nr_units =
46311+ item_length_by_coord(coord) - sizeof(ctail_item_format) -
46312+ data->length;
46313+
46314+ /* ctail items never get pasted in the middle */
46315+
46316+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
46317+
46318+ /* paste at the beginning when create new item */
46319+ assert("edward-450",
46320+ item_length_by_coord(coord) ==
46321+ data->length + sizeof(ctail_item_format));
46322+ assert("edward-451", old_nr_units == 0);
46323+ } else if (coord->unit_pos == old_nr_units - 1
46324+ && coord->between == AFTER_UNIT) {
46325+
46326+ /* paste at the end */
46327+ coord->unit_pos++;
46328+ } else
46329+ impossible("edward-453", "bad paste position");
46330+
46331+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
46332+
46333+ assert("edward-857", ctail_ok(coord));
46334+
46335+ return 0;
46336+}
46337+
46338+/* plugin->u.item.b.fast_paste */
46339+
46340+/* plugin->u.item.b.can_shift
46341+ number of units is returned via return value, number of bytes via @size. For
46342+ ctail items they coincide */
46343+int
46344+can_shift_ctail(unsigned free_space, coord_t * source,
46345+ znode * target, shift_direction direction UNUSED_ARG,
46346+ unsigned *size /* number of bytes */ , unsigned want)
46347+{
46348+ /* make sure that that we do not want to shift more than we have */
46349+ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
46350+
46351+ *size = min(want, free_space);
46352+
46353+ if (!target) {
46354+ /* new item will be created */
46355+ if (*size <= sizeof(ctail_item_format)) {
46356+ *size = 0;
46357+ return 0;
46358+ }
46359+ return *size - sizeof(ctail_item_format);
46360+ }
46361+ return *size;
46362+}
46363+
46364+/* plugin->u.item.b.copy_units
46365+ cooperates with ->can_shift() */
46366+void
46367+copy_units_ctail(coord_t * target, coord_t * source,
46368+ unsigned from, unsigned count /* units */ ,
46369+ shift_direction where_is_free_space,
46370+ unsigned free_space /* bytes */ )
46371+{
46372+ /* make sure that item @target is expanded already */
46373+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
46374+ assert("edward-70", free_space == count || free_space == count + 1);
46375+
46376+ assert("edward-858", ctail_ok(source));
46377+
46378+ if (where_is_free_space == SHIFT_LEFT) {
46379+ /* append item @target with @count first bytes of @source:
46380+ this restriction came from ordinary tails */
46381+ assert("edward-71", from == 0);
46382+ assert("edward-860", ctail_ok(target));
46383+
46384+ memcpy(first_unit(target) + nr_units_ctail(target) - count,
46385+ first_unit(source), count);
46386+ } else {
46387+ /* target item is moved to right already */
46388+ reiser4_key key;
46389+
46390+ assert("edward-72", nr_units_ctail(source) == from + count);
46391+
46392+ if (free_space == count) {
46393+ init_ctail(target, source, NULL);
46394+ } else {
46395+ /* new item has been created */
46396+ assert("edward-862", ctail_ok(target));
46397+ }
46398+ memcpy(first_unit(target), first_unit(source) + from, count);
46399+
46400+ assert("edward-863", ctail_ok(target));
46401+
46402+ /* new units are inserted before first unit in an item,
46403+ therefore, we have to update item key */
46404+ item_key_by_coord(source, &key);
46405+ set_key_offset(&key, get_key_offset(&key) + from);
46406+
46407+ node_plugin_by_node(target->node)->update_item_key(target, &key,
46408+ NULL /*info */);
46409+ }
46410+}
46411+
46412+/* plugin->u.item.b.create_hook */
46413+int create_hook_ctail(const coord_t * coord, void *arg)
46414+{
46415+ assert("edward-864", znode_is_loaded(coord->node));
46416+
46417+ znode_set_convertible(coord->node);
46418+ return 0;
46419+}
46420+
46421+/* plugin->u.item.b.kill_hook */
46422+int
46423+kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
46424+ carry_kill_data * kdata)
46425+{
46426+ struct inode *inode;
46427+
46428+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
46429+ assert("edward-291", znode_is_write_locked(coord->node));
46430+
46431+ inode = kdata->inode;
46432+ if (inode) {
46433+ reiser4_key key;
46434+ item_key_by_coord(coord, &key);
46435+
46436+ if (from == 0 && is_disk_cluster_key(&key, coord)) {
46437+ /* disk cluster is killed */
46438+ cloff_t start =
46439+ off_to_clust(get_key_offset(&key), inode);
46440+ truncate_page_cluster_cryptcompress(inode, start,
46441+ kdata->params.truncate);
46442+ inode_sub_bytes(inode, inode_cluster_size(inode));
46443+ }
46444+ }
46445+ return 0;
46446+}
46447+
46448+/* for shift_hook_ctail(),
46449+ return true if the first disk cluster item has dirty child
46450+*/
46451+static int ctail_convertible(const coord_t * coord)
46452+{
46453+ int result;
46454+ reiser4_key key;
46455+ jnode *child = NULL;
46456+
46457+ assert("edward-477", coord != NULL);
46458+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
46459+
46460+ if (coord_is_unprepped_ctail(coord))
46461+ /* unprepped ctail should be converted */
46462+ return 1;
46463+
46464+ item_key_by_coord(coord, &key);
46465+ child = jlookup(current_tree,
46466+ get_key_objectid(&key),
46467+ off_to_pg(off_by_coord(coord)));
46468+ if (!child)
46469+ return 0;
46470+ result = JF_ISSET(child, JNODE_DIRTY);
46471+ jput(child);
46472+ return result;
46473+}
46474+
46475+/* FIXME-EDWARD */
46476+/* plugin->u.item.b.shift_hook */
46477+int shift_hook_ctail(const coord_t * item /* coord of item */ ,
46478+ unsigned from UNUSED_ARG /* start unit */ ,
46479+ unsigned count UNUSED_ARG /* stop unit */ ,
46480+ znode * old_node /* old parent */ )
46481+{
46482+ assert("edward-479", item != NULL);
46483+ assert("edward-480", item->node != old_node);
46484+
46485+ if (!znode_convertible(old_node) || znode_convertible(item->node))
46486+ return 0;
46487+ if (ctail_convertible(item))
46488+ znode_set_convertible(item->node);
46489+ return 0;
46490+}
46491+
46492+static int
46493+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46494+ int cut, void *p, reiser4_key * smallest_removed,
46495+ reiser4_key * new_first)
46496+{
46497+ pos_in_node_t count; /* number of units to cut */
46498+ char *item;
46499+
46500+ count = to - from + 1;
46501+ item = item_body_by_coord(coord);
46502+
46503+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
46504+
46505+ if (smallest_removed) {
46506+ /* store smallest key removed */
46507+ item_key_by_coord(coord, smallest_removed);
46508+ set_key_offset(smallest_removed,
46509+ get_key_offset(smallest_removed) + from);
46510+ }
46511+
46512+ if (new_first) {
46513+ assert("vs-1531", from == 0);
46514+
46515+ item_key_by_coord(coord, new_first);
46516+ set_key_offset(new_first,
46517+ get_key_offset(new_first) + from + count);
46518+ }
46519+
46520+ if (!cut)
46521+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
46522+
46523+ if (from == 0) {
46524+ if (count != nr_units_ctail(coord)) {
46525+ /* part of item is removed, so move free space at the beginning
46526+ of the item and update item key */
46527+ reiser4_key key;
46528+ memcpy(item + to + 1, item, sizeof(ctail_item_format));
46529+ item_key_by_coord(coord, &key);
46530+ set_key_offset(&key, get_key_offset(&key) + count);
46531+ node_plugin_by_node(coord->node)->update_item_key(coord,
46532+ &key,
46533+ NULL);
46534+ } else {
46535+ /* cut_units should not be called to cut evrything */
46536+ assert("vs-1532", ergo(cut, 0));
46537+ /* whole item is cut, so more then amount of space occupied
46538+ by units got freed */
46539+ count += sizeof(ctail_item_format);
46540+ }
46541+ if (REISER4_DEBUG)
46542+ memset(item, 0, count);
46543+ } else if (REISER4_DEBUG)
46544+ memset(item + sizeof(ctail_item_format) + from, 0, count);
46545+ return count;
46546+}
46547+
46548+/* plugin->u.item.b.cut_units */
46549+int
46550+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
46551+ carry_cut_data * cdata, reiser4_key * smallest_removed,
46552+ reiser4_key * new_first)
46553+{
46554+ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
46555+ smallest_removed, new_first);
46556+}
46557+
46558+/* plugin->u.item.b.kill_units */
46559+int
46560+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
46561+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
46562+ reiser4_key * new_first)
46563+{
46564+ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
46565+ smallest_removed, new_first);
46566+}
46567+
46568+/* plugin->u.item.s.file.read */
46569+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
46570+{
46571+ uf_coord_t *uf_coord;
46572+ coord_t *coord;
46573+
46574+ uf_coord = &hint->ext_coord;
46575+ coord = &uf_coord->coord;
46576+ assert("edward-127", f->user == 0);
46577+ assert("edward-129", coord && coord->node);
46578+ assert("edward-130", coord_is_existing_unit(coord));
46579+ assert("edward-132", znode_is_loaded(coord->node));
46580+
46581+ /* start read only from the beginning of ctail */
46582+ assert("edward-133", coord->unit_pos == 0);
46583+ /* read only whole ctails */
46584+ assert("edward-135", nr_units_ctail(coord) <= f->length);
46585+
46586+ assert("edward-136", reiser4_schedulable());
46587+ assert("edward-886", ctail_ok(coord));
46588+
46589+ if (f->data)
46590+ memcpy(f->data, (char *)first_unit(coord),
46591+ (size_t) nr_units_ctail(coord));
46592+
46593+ dclust_set_extension_shift(hint);
46594+ mark_page_accessed(znode_page(coord->node));
46595+ move_flow_forward(f, nr_units_ctail(coord));
46596+
46597+ return 0;
46598+}
46599+
46600+/* Reads a disk cluster consists of ctail items,
46601+ attaches a transform stream with plain text */
46602+int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode,
46603+ znode_lock_mode mode)
46604+{
46605+ int result;
46606+ assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK);
46607+ assert("edward-671", clust->hint != NULL);
46608+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
46609+ assert("edward-672", cryptcompress_inode_ok(inode));
46610+
46611+ /* set input stream */
46612+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
46613+ if (result)
46614+ return result;
46615+
46616+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
46617+ assert("edward-1340", !result);
46618+ if (result)
46619+ return result;
46620+ if (mode == ZNODE_READ_LOCK)
46621+ /* write still need the lock to insert unprepped
46622+ items, etc... */
46623+ put_hint_cluster(clust, inode, ZNODE_READ_LOCK);
46624+
46625+ if (clust->dstat == FAKE_DISK_CLUSTER ||
46626+ clust->dstat == UNPR_DISK_CLUSTER) {
46627+ tfm_cluster_set_uptodate(&clust->tc);
46628+ return 0;
46629+ }
46630+ result = grab_coa(&clust->tc, inode_compression_plugin(inode));
46631+ if (result)
46632+ return result;
46633+ result = reiser4_inflate_cluster(clust, inode);
46634+ if (result)
46635+ return result;
46636+ tfm_cluster_set_uptodate(&clust->tc);
46637+ return 0;
46638+}
46639+
46640+/* read one locked page */
46641+int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust,
46642+ struct page *page, znode_lock_mode mode)
46643+{
46644+ int ret;
46645+ unsigned cloff;
46646+ char *data;
46647+ size_t pgcnt;
46648+ tfm_cluster_t *tc = &clust->tc;
46649+
46650+ assert("edward-212", PageLocked(page));
46651+
46652+ if (PageUptodate(page))
46653+ goto exit;
46654+
46655+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
46656+ clust->index = pg_to_clust(page->index, inode);
46657+ unlock_page(page);
46658+ ret = ctail_read_disk_cluster(clust, inode, mode);
46659+ lock_page(page);
46660+ if (ret)
46661+ return ret;
46662+ }
46663+ if (PageUptodate(page))
46664+ /* races with another read/write */
46665+ goto exit;
46666+
46667+ /* bytes in the page */
46668+ pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index);
46669+
46670+ if (pgcnt == 0) {
46671+ assert("edward-1290", 0);
46672+ return RETERR(-EINVAL);
46673+ }
46674+ assert("edward-119", tfm_cluster_is_uptodate(tc));
46675+
46676+ switch (clust->dstat) {
46677+ case UNPR_DISK_CLUSTER:
46678+ assert("edward-1285", 0);
46679+#if REISER4_DEBUG
46680+ warning("edward-1168",
46681+ "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n",
46682+ page->index, clust->index,
46683+ (unsigned long long)get_inode_oid(inode));
46684+#endif
46685+ case FAKE_DISK_CLUSTER:
46686+ /* fill the page by zeroes */
46687+ data = kmap_atomic(page, KM_USER0);
46688+
46689+ memset(data, 0, PAGE_CACHE_SIZE);
46690+ flush_dcache_page(page);
46691+ kunmap_atomic(data, KM_USER0);
46692+ SetPageUptodate(page);
46693+ break;
46694+ case PREP_DISK_CLUSTER:
46695+ /* fill the page by transformed data */
46696+ assert("edward-1058", !PageUptodate(page));
46697+ assert("edward-120", tc->len <= inode_cluster_size(inode));
46698+
46699+ /* start page offset in the cluster */
46700+ cloff = pg_to_off_to_cloff(page->index, inode);
46701+
46702+ data = kmap(page);
46703+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt);
46704+ memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt);
46705+ flush_dcache_page(page);
46706+ kunmap(page);
46707+ SetPageUptodate(page);
46708+ break;
46709+ default:
46710+ impossible("edward-1169", "bad disk cluster state");
46711+ }
46712+ exit:
46713+ return 0;
46714+}
46715+
46716+/* plugin->u.item.s.file.readpage */
46717+int readpage_ctail(void *vp, struct page *page)
46718+{
46719+ int result;
46720+ hint_t *hint;
46721+ reiser4_cluster_t *clust = vp;
46722+
46723+ assert("edward-114", clust != NULL);
46724+ assert("edward-115", PageLocked(page));
46725+ assert("edward-116", !PageUptodate(page));
46726+ assert("edward-117", !jprivate(page) && !PagePrivate(page));
46727+ assert("edward-118", page->mapping && page->mapping->host);
46728+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
46729+
46730+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46731+ if (hint == NULL) {
46732+ unlock_page(page);
46733+ return RETERR(-ENOMEM);
46734+ }
46735+ clust->hint = hint;
46736+ result = load_file_hint(clust->file, hint);
46737+ if (result) {
46738+ kfree(hint);
46739+ unlock_page(page);
46740+ return result;
46741+ }
46742+ assert("vs-25", hint->ext_coord.lh == &hint->lh);
46743+ result = do_readpage_ctail(page->mapping->host, clust, page,
46744+ ZNODE_READ_LOCK);
46745+
46746+ assert("edward-213", PageLocked(page));
46747+ assert("edward-1163", ergo(!result, PageUptodate(page)));
46748+ assert("edward-868",
46749+ ergo(!result, tfm_cluster_is_uptodate(&clust->tc)));
46750+
46751+ unlock_page(page);
46752+ done_lh(&hint->lh);
46753+ hint->ext_coord.valid = 0;
46754+ save_file_hint(clust->file, hint);
46755+ kfree(hint);
46756+ tfm_cluster_clr_uptodate(&clust->tc);
46757+
46758+ return result;
46759+}
46760+
46761+/* Helper function for ->readpages() */
46762+static int
46763+ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
46764+{
46765+ int i;
46766+ int result;
46767+ assert("edward-779", clust != NULL);
46768+ assert("edward-1059", clust->win == NULL);
46769+ assert("edward-780", inode != NULL);
46770+
46771+ result = prepare_page_cluster(inode, clust, 0 /* do not capture */ );
46772+ if (result)
46773+ return result;
46774+ result = ctail_read_disk_cluster(clust, inode, ZNODE_READ_LOCK);
46775+ if (result)
46776+ goto out;
46777+ /* at this point stream with valid plain text is attached */
46778+ assert("edward-781", tfm_cluster_is_uptodate(&clust->tc));
46779+
46780+ for (i = 0; i < clust->nr_pages; i++) {
46781+ struct page *page = clust->pages[i];
46782+ lock_page(page);
46783+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46784+ unlock_page(page);
46785+ if (result)
46786+ break;
46787+ }
46788+ tfm_cluster_clr_uptodate(&clust->tc);
46789+ out:
46790+ reiser4_release_cluster_pages(clust);
46791+ return result;
46792+}
46793+
46794+/* filler for read_cache_pages() */
46795+static int ctail_readpages_filler(void * data, struct page * page)
46796+{
46797+ int ret = 0;
46798+ reiser4_cluster_t * clust = data;
46799+ struct inode * inode = clust->file->f_dentry->d_inode;
46800+
46801+ if (PageUptodate(page)) {
46802+ unlock_page(page);
46803+ return 0;
46804+ }
46805+ unlock_page(page);
46806+ move_cluster_forward(clust, inode, page->index);
46807+ ret = ctail_read_page_cluster(clust, inode);
46808+ if (ret)
46809+ return ret;
46810+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
46811+
46812+ lock_page(page);
46813+ ret = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46814+ assert("edward-1061", ergo(!ret, PageUptodate(page)));
46815+ unlock_page(page);
46816+
46817+ return ret;
46818+}
46819+
46820+/* We populate a bit more than upper readahead suggests:
46821+ with each nominated page we read the whole page cluster
46822+ this page belongs to. */
46823+int readpages_ctail(struct file *file, struct address_space *mapping,
46824+ struct list_head *pages)
46825+{
46826+ int ret = 0;
46827+ hint_t *hint;
46828+ reiser4_cluster_t clust;
46829+ struct inode *inode = mapping->host;
46830+
46831+ assert("edward-1521", inode == file->f_dentry->d_inode);
46832+
46833+ cluster_init_read(&clust, NULL);
46834+ clust.file = file;
46835+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46836+ if (hint == NULL) {
46837+ warning("vs-28", "failed to allocate hint");
46838+ ret = RETERR(-ENOMEM);
46839+ goto exit1;
46840+ }
46841+ clust.hint = hint;
46842+ ret = load_file_hint(clust.file, hint);
46843+ if (ret) {
46844+ warning("edward-1522", "failed to load hint");
46845+ goto exit2;
46846+ }
46847+ assert("vs-26", hint->ext_coord.lh == &hint->lh);
46848+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46849+ if (ret) {
46850+ warning("edward-1523", "failed to alloc pgset");
46851+ goto exit3;
46852+ }
46853+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
46854+
46855+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46856+ exit3:
46857+ done_lh(&hint->lh);
46858+ save_file_hint(file, hint);
46859+ hint->ext_coord.valid = 0;
46860+ exit2:
46861+ kfree(hint);
46862+ exit1:
46863+ put_cluster_handle(&clust);
46864+ return ret;
46865+}
46866+
46867+/*
46868+ plugin->u.item.s.file.append_key
46869+ key of the first item of the next disk cluster
46870+*/
46871+reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46872+{
46873+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46874+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46875+
46876+ item_key_by_coord(coord, key);
46877+ set_key_offset(key,
46878+ ((__u64) (clust_by_coord(coord, NULL)) +
46879+ 1) << cluster_shift_by_coord(coord));
46880+ return key;
46881+}
46882+
46883+static int
46884+insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode)
46885+{
46886+ int result;
46887+ char buf[UCTAIL_NR_UNITS];
46888+ reiser4_item_data data;
46889+ reiser4_key key;
46890+ int shift = (int)UCTAIL_SHIFT;
46891+
46892+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46893+ result = key_by_inode_cryptcompress(inode,
46894+ clust_to_off(clust->index, inode),
46895+ &key);
46896+ if (result)
46897+ return result;
46898+ data.user = 0;
46899+ data.iplug = item_plugin_by_id(CTAIL_ID);
46900+ data.arg = &shift;
46901+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46902+ data.data = buf;
46903+
46904+ result = insert_by_coord(&clust->hint->ext_coord.coord,
46905+ &data, &key, clust->hint->ext_coord.lh, 0);
46906+ return result;
46907+}
46908+
46909+static int
46910+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46911+ struct inode *inode)
46912+{
46913+ int result;
46914+ carry_pool *pool;
46915+ carry_level *lowest_level;
46916+ reiser4_item_data *data;
46917+ carry_op *op;
46918+ int cluster_shift = inode_cluster_shift(inode);
46919+
46920+ pool =
46921+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46922+ sizeof(*data));
46923+ if (IS_ERR(pool))
46924+ return PTR_ERR(pool);
46925+ lowest_level = (carry_level *) (pool + 1);
46926+ init_carry_level(lowest_level, pool);
46927+ data = (reiser4_item_data *) (lowest_level + 3);
46928+
46929+ assert("edward-466", coord->between == AFTER_ITEM
46930+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46931+ || coord->between == EMPTY_NODE
46932+ || coord->between == BEFORE_UNIT);
46933+
46934+ if (coord->between == AFTER_UNIT) {
46935+ coord->unit_pos = 0;
46936+ coord->between = AFTER_ITEM;
46937+ }
46938+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46939+ 0 /* operate directly on coord -> node */);
46940+ if (IS_ERR(op) || (op == NULL)) {
46941+ done_carry_pool(pool);
46942+ return RETERR(op ? PTR_ERR(op) : -EIO);
46943+ }
46944+ data->user = 0;
46945+ data->iplug = item_plugin_by_id(CTAIL_ID);
46946+ data->arg = &cluster_shift;
46947+
46948+ data->length = 0;
46949+ data->data = NULL;
46950+
46951+ op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46952+ op->u.insert_flow.insert_point = coord;
46953+ op->u.insert_flow.flow = f;
46954+ op->u.insert_flow.data = data;
46955+ op->u.insert_flow.new_nodes = 0;
46956+
46957+ lowest_level->track_type = CARRY_TRACK_CHANGE;
46958+ lowest_level->tracked = lh;
46959+
46960+ result = reiser4_carry(lowest_level, NULL);
46961+ done_carry_pool(pool);
46962+
46963+ return result;
46964+}
46965+
46966+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46967+static int insert_cryptcompress_flow_in_place(coord_t * coord,
46968+ lock_handle * lh, flow_t * f,
46969+ struct inode *inode)
46970+{
46971+ int ret;
46972+ coord_t pos;
46973+ lock_handle lock;
46974+
46975+ assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46976+ assert("edward-484", coord->between == AT_UNIT
46977+ || coord->between == AFTER_ITEM);
46978+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46979+
46980+ coord_dup(&pos, coord);
46981+ pos.unit_pos = 0;
46982+ pos.between = AFTER_ITEM;
46983+
46984+ init_lh(&lock);
46985+ copy_lh(&lock, lh);
46986+
46987+ ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
46988+ done_lh(&lock);
46989+ assert("edward-1347", znode_is_write_locked(lh->node));
46990+ assert("edward-1228", !ret);
46991+ return ret;
46992+}
46993+
46994+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46995+static int overwrite_ctail(coord_t * coord, flow_t * f)
46996+{
46997+ unsigned count;
46998+
46999+ assert("edward-269", f->user == 0);
47000+ assert("edward-270", f->data != NULL);
47001+ assert("edward-271", f->length > 0);
47002+ assert("edward-272", coord_is_existing_unit(coord));
47003+ assert("edward-273", coord->unit_pos == 0);
47004+ assert("edward-274", znode_is_write_locked(coord->node));
47005+ assert("edward-275", reiser4_schedulable());
47006+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
47007+ assert("edward-1243", ctail_ok(coord));
47008+
47009+ count = nr_units_ctail(coord);
47010+
47011+ if (count > f->length)
47012+ count = f->length;
47013+ memcpy(first_unit(coord), f->data, count);
47014+ move_flow_forward(f, count);
47015+ coord->unit_pos += count;
47016+ return 0;
47017+}
47018+
47019+/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
47020+ cut ctail (part or whole) starting from next unit position */
47021+static int cut_ctail(coord_t * coord)
47022+{
47023+ coord_t stop;
47024+
47025+ assert("edward-435", coord->between == AT_UNIT &&
47026+ coord->item_pos < coord_num_items(coord) &&
47027+ coord->unit_pos <= coord_num_units(coord));
47028+
47029+ if (coord->unit_pos == coord_num_units(coord))
47030+ /* nothing to cut */
47031+ return 0;
47032+ coord_dup(&stop, coord);
47033+ stop.unit_pos = coord_last_unit_pos(coord);
47034+
47035+ return cut_node_content(coord, &stop, NULL, NULL, NULL);
47036+}
47037+
47038+int
47039+ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
47040+{
47041+ int result;
47042+ assert("edward-1244", inode != NULL);
47043+ assert("edward-1245", clust->hint != NULL);
47044+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
47045+ assert("edward-1247", clust->reserved == 1);
47046+
47047+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
47048+ if (cbk_errored(result))
47049+ return result;
47050+ assert("edward-1249", result == CBK_COORD_NOTFOUND);
47051+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
47052+
47053+ assert("edward-1295",
47054+ clust->hint->ext_coord.lh->node ==
47055+ clust->hint->ext_coord.coord.node);
47056+
47057+ coord_set_between_clusters(&clust->hint->ext_coord.coord);
47058+
47059+ result = insert_unprepped_ctail(clust, inode);
47060+ all_grabbed2free();
47061+
47062+ assert("edward-1251", !result);
47063+ assert("edward-1252", cryptcompress_inode_ok(inode));
47064+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
47065+ assert("edward-1254",
47066+ reiser4_clustered_blocks(reiser4_get_current_sb()));
47067+ assert("edward-1255",
47068+ znode_convertible(clust->hint->ext_coord.coord.node));
47069+
47070+ return result;
47071+}
47072+
47073+static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
47074+{
47075+ int result = 0;
47076+ convert_item_info_t *info;
47077+
47078+ assert("edward-468", pos != NULL);
47079+ assert("edward-469", pos->sq != NULL);
47080+ assert("edward-845", item_convert_data(pos) != NULL);
47081+
47082+ info = item_convert_data(pos);
47083+ assert("edward-679", info->flow.data != NULL);
47084+
47085+ switch (mode) {
47086+ case CRC_APPEND_ITEM:
47087+ assert("edward-1229", info->flow.length != 0);
47088+ assert("edward-1256",
47089+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
47090+ result =
47091+ insert_cryptcompress_flow_in_place(&pos->coord,
47092+ &pos->lock,
47093+ &info->flow,
47094+ info->inode);
47095+ break;
47096+ case CRC_OVERWRITE_ITEM:
47097+ assert("edward-1230", info->flow.length != 0);
47098+ overwrite_ctail(&pos->coord, &info->flow);
47099+ if (info->flow.length != 0)
47100+ break;
47101+ case CRC_CUT_ITEM:
47102+ assert("edward-1231", info->flow.length == 0);
47103+ result = cut_ctail(&pos->coord);
47104+ break;
47105+ default:
47106+ result = RETERR(-EIO);
47107+ impossible("edward-244", "bad convert mode");
47108+ }
47109+ return result;
47110+}
47111+
47112+/* plugin->u.item.f.scan */
47113+int scan_ctail(flush_scan * scan)
47114+{
47115+ int result = 0;
47116+ struct page *page;
47117+ struct inode *inode;
47118+ jnode *node = scan->node;
47119+
47120+ assert("edward-227", scan->node != NULL);
47121+ assert("edward-228", jnode_is_cluster_page(scan->node));
47122+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
47123+
47124+ page = jnode_page(node);
47125+ inode = page->mapping->host;
47126+
47127+ if (!reiser4_scanning_left(scan))
47128+ return result;
47129+ if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
47130+ znode_make_dirty(scan->parent_lock.node);
47131+
47132+ if (!znode_convertible(scan->parent_lock.node)) {
47133+ if (JF_ISSET(scan->node, JNODE_DIRTY))
47134+ znode_set_convertible(scan->parent_lock.node);
47135+ else {
47136+ warning("edward-681",
47137+ "cluster page is already processed");
47138+ return -EAGAIN;
47139+ }
47140+ }
47141+ return result;
47142+}
47143+
47144+/* If true, this function attaches children */
47145+static int should_attach_convert_idata(flush_pos_t * pos)
47146+{
47147+ int result;
47148+ assert("edward-431", pos != NULL);
47149+ assert("edward-432", pos->child == NULL);
47150+ assert("edward-619", znode_is_write_locked(pos->coord.node));
47151+ assert("edward-470",
47152+ item_plugin_by_coord(&pos->coord) ==
47153+ item_plugin_by_id(CTAIL_ID));
47154+
47155+ /* check for leftmost child */
47156+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
47157+
47158+ if (!pos->child)
47159+ return 0;
47160+ spin_lock_jnode(pos->child);
47161+ result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
47162+ pos->child->atom == ZJNODE(pos->coord.node)->atom);
47163+ spin_unlock_jnode(pos->child);
47164+ if (!result && pos->child) {
47165+		/* existing child should not be attached; release it */
47166+ jput(pos->child);
47167+ pos->child = NULL;
47168+ }
47169+ return result;
47170+}
47171+
47172+/* plugin->init_convert_data() */
47173+static int
47174+init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode)
47175+{
47176+ assert("edward-813", idata != NULL);
47177+ assert("edward-814", inode != NULL);
47178+
47179+ idata->inode = inode;
47180+ idata->d_cur = DC_FIRST_ITEM;
47181+ idata->d_next = DC_INVALID_STATE;
47182+
47183+ return 0;
47184+}
47185+
47186+static int alloc_item_convert_data(convert_info_t * sq)
47187+{
47188+ assert("edward-816", sq != NULL);
47189+ assert("edward-817", sq->itm == NULL);
47190+
47191+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
47192+ if (sq->itm == NULL)
47193+ return RETERR(-ENOMEM);
47194+ return 0;
47195+}
47196+
47197+static void free_item_convert_data(convert_info_t * sq)
47198+{
47199+ assert("edward-818", sq != NULL);
47200+ assert("edward-819", sq->itm != NULL);
47201+ assert("edward-820", sq->iplug != NULL);
47202+
47203+ kfree(sq->itm);
47204+ sq->itm = NULL;
47205+ return;
47206+}
47207+
47208+static int alloc_convert_data(flush_pos_t * pos)
47209+{
47210+ assert("edward-821", pos != NULL);
47211+ assert("edward-822", pos->sq == NULL);
47212+
47213+ pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
47214+ if (!pos->sq)
47215+ return RETERR(-ENOMEM);
47216+ memset(pos->sq, 0, sizeof(*pos->sq));
47217+ cluster_init_write(&pos->sq->clust, NULL);
47218+ return 0;
47219+}
47220+
47221+void free_convert_data(flush_pos_t * pos)
47222+{
47223+ convert_info_t *sq;
47224+
47225+ assert("edward-823", pos != NULL);
47226+ assert("edward-824", pos->sq != NULL);
47227+
47228+ sq = pos->sq;
47229+ if (sq->itm)
47230+ free_item_convert_data(sq);
47231+ put_cluster_handle(&sq->clust);
47232+ kfree(pos->sq);
47233+ pos->sq = NULL;
47234+ return;
47235+}
47236+
47237+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
47238+{
47239+ convert_info_t *sq;
47240+
47241+ assert("edward-825", pos != NULL);
47242+ assert("edward-826", pos->sq != NULL);
47243+ assert("edward-827", item_convert_data(pos) != NULL);
47244+ assert("edward-828", inode != NULL);
47245+
47246+ sq = pos->sq;
47247+
47248+ memset(sq->itm, 0, sizeof(*sq->itm));
47249+
47250+ /* iplug->init_convert_data() */
47251+ return init_convert_data_ctail(sq->itm, inode);
47252+}
47253+
47254+/* create and attach disk cluster info used by 'convert' phase of the flush
47255+ squalloc() */
47256+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
47257+{
47258+ int ret = 0;
47259+ convert_item_info_t *info;
47260+ reiser4_cluster_t *clust;
47261+ file_plugin *fplug = inode_file_plugin(inode);
47262+ compression_plugin *cplug = inode_compression_plugin(inode);
47263+
47264+ assert("edward-248", pos != NULL);
47265+ assert("edward-249", pos->child != NULL);
47266+ assert("edward-251", inode != NULL);
47267+ assert("edward-682", cryptcompress_inode_ok(inode));
47268+ assert("edward-252",
47269+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
47270+ assert("edward-473",
47271+ item_plugin_by_coord(&pos->coord) ==
47272+ item_plugin_by_id(CTAIL_ID));
47273+
47274+ if (!pos->sq) {
47275+ ret = alloc_convert_data(pos);
47276+ if (ret)
47277+ return ret;
47278+ }
47279+ clust = &pos->sq->clust;
47280+ ret = grab_coa(&clust->tc, cplug);
47281+ if (ret)
47282+ goto err;
47283+ ret = set_cluster_by_page(clust,
47284+ jnode_page(pos->child),
47285+ MAX_CLUSTER_NRPAGES);
47286+ if (ret)
47287+ goto err;
47288+
47289+ assert("edward-829", pos->sq != NULL);
47290+ assert("edward-250", item_convert_data(pos) == NULL);
47291+
47292+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
47293+
47294+ ret = alloc_item_convert_data(pos->sq);
47295+ if (ret)
47296+ goto err;
47297+ ret = init_item_convert_data(pos, inode);
47298+ if (ret)
47299+ goto err;
47300+ info = item_convert_data(pos);
47301+
47302+ ret = flush_cluster_pages(clust, pos->child, inode);
47303+ if (ret)
47304+ goto err;
47305+
47306+ reiser4_deflate_cluster(clust, inode);
47307+ inc_item_convert_count(pos);
47308+
47309+ /* make flow by transformed stream */
47310+ fplug->flow_by_inode(info->inode,
47311+ (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
47312+ 0 /* kernel space */ ,
47313+ clust->tc.len,
47314+ clust_to_off(clust->index, inode),
47315+ WRITE_OP, &info->flow);
47316+ jput(pos->child);
47317+
47318+ assert("edward-683", cryptcompress_inode_ok(inode));
47319+ return 0;
47320+ err:
47321+ jput(pos->child);
47322+ free_convert_data(pos);
47323+ return ret;
47324+}
47325+
47326+/* clear up disk cluster info */
47327+static void detach_convert_idata(convert_info_t * sq)
47328+{
47329+ convert_item_info_t *info;
47330+
47331+ assert("edward-253", sq != NULL);
47332+ assert("edward-840", sq->itm != NULL);
47333+
47334+ info = sq->itm;
47335+ assert("edward-255", info->inode != NULL);
47336+ assert("edward-1212", info->flow.length == 0);
47337+
47338+ free_item_convert_data(sq);
47339+ return;
47340+}
47341+
47342+/* plugin->u.item.f.utmost_child */
47343+
47344+/* This function sets leftmost child for a first cluster item,
47345+ if the child exists, and NULL in other cases.
47346+ NOTE-EDWARD: Do not call this for RIGHT_SIDE */
47347+
47348+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
47349+{
47350+ reiser4_key key;
47351+
47352+ item_key_by_coord(coord, &key);
47353+
47354+ assert("edward-257", coord != NULL);
47355+ assert("edward-258", child != NULL);
47356+ assert("edward-259", side == LEFT_SIDE);
47357+ assert("edward-260",
47358+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
47359+
47360+ if (!is_disk_cluster_key(&key, coord))
47361+ *child = NULL;
47362+ else
47363+ *child = jlookup(current_tree,
47364+ get_key_objectid(item_key_by_coord
47365+ (coord, &key)),
47366+ off_to_pg(get_key_offset(&key)));
47367+ return 0;
47368+}
47369+
47370+/* Returns true if @p2 is the next item to @p1
47371+ in the _same_ disk cluster.
47372+ Disk cluster is a set of items. If ->clustered() != NULL,
47373+ with each item the whole disk cluster should be read/modified
47374+*/
47375+static int clustered_ctail(const coord_t * p1, const coord_t * p2)
47376+{
47377+ return mergeable_ctail(p1, p2);
47378+}
47379+
47380+/* Go rightward and check for next disk cluster item, set
47381+ d_next to DC_CHAINED_ITEM, if the last one exists.
47382+ If the current position is last item, go to right neighbor.
47383+   Skip empty nodes. Note that right neighbors may not be in
47384+ the slum because of races. If so, make it dirty and
47385+ convertible.
47386+*/
47387+static int next_item_dc_stat(flush_pos_t * pos)
47388+{
47389+ int ret = 0;
47390+ int stop = 0;
47391+ znode *cur;
47392+ coord_t coord;
47393+ lock_handle lh;
47394+ lock_handle right_lock;
47395+
47396+ assert("edward-1232", !node_is_empty(pos->coord.node));
47397+ assert("edward-1014",
47398+ pos->coord.item_pos < coord_num_items(&pos->coord));
47399+ assert("edward-1015", chaining_data_present(pos));
47400+ assert("edward-1017",
47401+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
47402+
47403+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
47404+
47405+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
47406+ return ret;
47407+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
47408+ return ret;
47409+
47410+ /* check next slum item */
47411+ init_lh(&right_lock);
47412+ cur = pos->coord.node;
47413+
47414+ while (!stop) {
47415+ init_lh(&lh);
47416+ ret = reiser4_get_right_neighbor(&lh,
47417+ cur,
47418+ ZNODE_WRITE_LOCK,
47419+ GN_CAN_USE_UPPER_LEVELS);
47420+ if (ret)
47421+ break;
47422+ ret = zload(lh.node);
47423+ if (ret) {
47424+ done_lh(&lh);
47425+ break;
47426+ }
47427+ coord_init_before_first_item(&coord, lh.node);
47428+
47429+ if (node_is_empty(lh.node)) {
47430+ znode_make_dirty(lh.node);
47431+ znode_set_convertible(lh.node);
47432+ stop = 0;
47433+ } else if (clustered_ctail(&pos->coord, &coord)) {
47434+
47435+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
47436+
47437+ if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
47438+ /*
47439+ warning("edward-1024",
47440+ "next slum item mergeable, "
47441+ "but znode %p isn't dirty\n",
47442+ lh.node);
47443+ */
47444+ znode_make_dirty(lh.node);
47445+ }
47446+ if (!znode_convertible(lh.node)) {
47447+ /*
47448+ warning("edward-1272",
47449+ "next slum item mergeable, "
47450+ "but znode %p isn't convertible\n",
47451+ lh.node);
47452+ */
47453+ znode_set_convertible(lh.node);
47454+ }
47455+ stop = 1;
47456+ } else
47457+ stop = 1;
47458+ zrelse(lh.node);
47459+ done_lh(&right_lock);
47460+ copy_lh(&right_lock, &lh);
47461+ done_lh(&lh);
47462+ cur = right_lock.node;
47463+ }
47464+ done_lh(&right_lock);
47465+
47466+ if (ret == -E_NO_NEIGHBOR)
47467+ ret = 0;
47468+ return ret;
47469+}
47470+
47471+static int
47472+assign_convert_mode(convert_item_info_t * idata,
47473+ cryptcompress_write_mode_t * mode)
47474+{
47475+ int result = 0;
47476+
47477+ assert("edward-1025", idata != NULL);
47478+
47479+ if (idata->flow.length) {
47480+ /* append or overwrite */
47481+ switch (idata->d_cur) {
47482+ case DC_FIRST_ITEM:
47483+ case DC_CHAINED_ITEM:
47484+ *mode = CRC_OVERWRITE_ITEM;
47485+ break;
47486+ case DC_AFTER_CLUSTER:
47487+ *mode = CRC_APPEND_ITEM;
47488+ break;
47489+ default:
47490+ impossible("edward-1018", "wrong current item state");
47491+ }
47492+ } else {
47493+ /* cut or invalidate */
47494+ switch (idata->d_cur) {
47495+ case DC_FIRST_ITEM:
47496+ case DC_CHAINED_ITEM:
47497+ *mode = CRC_CUT_ITEM;
47498+ break;
47499+ case DC_AFTER_CLUSTER:
47500+ result = 1;
47501+ break;
47502+ default:
47503+ impossible("edward-1019", "wrong current item state");
47504+ }
47505+ }
47506+ return result;
47507+}
47508+
47509+/* plugin->u.item.f.convert */
47510+/* write ctail in guessed mode */
47511+int convert_ctail(flush_pos_t * pos)
47512+{
47513+ int result;
47514+ int nr_items;
47515+ cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
47516+
47517+ assert("edward-1020", pos != NULL);
47518+ assert("edward-1213", coord_num_items(&pos->coord) != 0);
47519+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
47520+ assert("edward-1258", ctail_ok(&pos->coord));
47521+ assert("edward-261", pos->coord.node != NULL);
47522+
47523+ nr_items = coord_num_items(&pos->coord);
47524+ if (!chaining_data_present(pos)) {
47525+ if (should_attach_convert_idata(pos)) {
47526+ /* attach convert item info */
47527+ struct inode *inode;
47528+
47529+ assert("edward-264", pos->child != NULL);
47530+ assert("edward-265", jnode_page(pos->child) != NULL);
47531+ assert("edward-266",
47532+ jnode_page(pos->child)->mapping != NULL);
47533+
47534+ inode = jnode_page(pos->child)->mapping->host;
47535+
47536+ assert("edward-267", inode != NULL);
47537+
47538+			/* attach item convert info by child, then release (jput) the child */
47539+ result = attach_convert_idata(pos, inode);
47540+ pos->child = NULL;
47541+ if (result == -E_REPEAT) {
47542+ /* jnode became clean, or there is no dirty
47543+ pages (nothing to update in disk cluster) */
47544+ warning("edward-1021",
47545+ "convert_ctail: nothing to attach");
47546+ return 0;
47547+ }
47548+ if (result != 0)
47549+ return result;
47550+ } else
47551+ /* unconvertible */
47552+ return 0;
47553+ } else {
47554+ /* use old convert info */
47555+
47556+ convert_item_info_t *idata;
47557+
47558+ idata = item_convert_data(pos);
47559+
47560+ result = assign_convert_mode(idata, &mode);
47561+ if (result) {
47562+ /* disk cluster is over,
47563+ nothing to update anymore */
47564+ detach_convert_idata(pos->sq);
47565+ return 0;
47566+ }
47567+ }
47568+
47569+ assert("edward-433", chaining_data_present(pos));
47570+ assert("edward-1022",
47571+ pos->coord.item_pos < coord_num_items(&pos->coord));
47572+
47573+ result = next_item_dc_stat(pos);
47574+ if (result) {
47575+ detach_convert_idata(pos->sq);
47576+ return result;
47577+ }
47578+ result = do_convert_ctail(pos, mode);
47579+ if (result) {
47580+ detach_convert_idata(pos->sq);
47581+ return result;
47582+ }
47583+ switch (mode) {
47584+ case CRC_CUT_ITEM:
47585+ assert("edward-1214", item_convert_data(pos)->flow.length == 0);
47586+ assert("edward-1215",
47587+ coord_num_items(&pos->coord) == nr_items ||
47588+ coord_num_items(&pos->coord) == nr_items - 1);
47589+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
47590+ break;
47591+ if (coord_num_items(&pos->coord) != nr_items) {
47592+ /* the item was killed, no more chained items */
47593+ detach_convert_idata(pos->sq);
47594+ if (!node_is_empty(pos->coord.node))
47595+ /* make sure the next item will be scanned */
47596+ coord_init_before_item(&pos->coord);
47597+ break;
47598+ }
47599+ case CRC_APPEND_ITEM:
47600+ assert("edward-434", item_convert_data(pos)->flow.length == 0);
47601+ detach_convert_idata(pos->sq);
47602+ break;
47603+ case CRC_OVERWRITE_ITEM:
47604+ if (coord_is_unprepped_ctail(&pos->coord)) {
47605+			/* convert unprepped ctail to prepped one */
47606+ int shift;
47607+ shift =
47608+ inode_cluster_shift(item_convert_data(pos)->inode);
47609+ assert("edward-1259", cluster_shift_ok(shift));
47610+ put_unaligned((d8)shift,
47611+ &ctail_formatted_at(&pos->coord)->
47612+ cluster_shift);
47613+ }
47614+ break;
47615+ }
47616+ return result;
47617+}
47618+
47619+/* Make Linus happy.
47620+ Local variables:
47621+ c-indentation-style: "K&R"
47622+ mode-name: "LC"
47623+ c-basic-offset: 8
47624+ tab-width: 8
47625+ fill-column: 120
47626+ End:
47627+*/
47628diff --git a/fs/reiser4/plugin/item/ctail.h b/fs/reiser4/plugin/item/ctail.h
47629new file mode 100644
47630index 0000000..ead4418
47631--- /dev/null
47632+++ b/fs/reiser4/plugin/item/ctail.h
47633@@ -0,0 +1,97 @@
47634+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47635+
47636+#if !defined( __FS_REISER4_CTAIL_H__ )
47637+#define __FS_REISER4_CTAIL_H__
47638+
47639+/* Disk format of ctail item */
47640+typedef struct ctail_item_format {
47641+ /* packed shift; size of (prepped) disk cluster
47642+ is calculated as (1 << cluster_shift) */
47643+ d8 cluster_shift;
47644+ /* ctail body */
47645+ d8 body[0];
47646+} __attribute__ ((packed)) ctail_item_format;
47647+
47648+/* Unprepped disk cluster is represented by a single ctail item
47649+ with the following "magic" attributes: */
47650+/* "magic" cluster_shift */
47651+#define UCTAIL_SHIFT 0xff
47652+/* How many units unprepped ctail item has */
47653+#define UCTAIL_NR_UNITS 1
47654+
47655+/* The following is a set of various item states in a disk cluster.
47656+ Disk cluster is a set of items whose keys belong to the interval
47657+ [dc_key , dc_key + disk_cluster_size - 1] */
47658+typedef enum {
47659+ DC_INVALID_STATE = 0,
47660+ DC_FIRST_ITEM = 1,
47661+ DC_CHAINED_ITEM = 2,
47662+ DC_AFTER_CLUSTER = 3
47663+} dc_item_stat;
47664+
47665+/* ctail-specific extension.
47666+ In particular this describes parameters of disk cluster an item belongs to */
47667+typedef struct {
47668+ int shift; /* this contains cluster_shift extracted from
47669+ ctail_item_format (above), or UCTAIL_SHIFT
47670+ (the last one is the "magic" of unprepped disk clusters)*/
47671+ int dsize; /* size of a prepped disk cluster */
47672+ int ncount; /* count of nodes occupied by a disk cluster */
47673+} ctail_coord_extension_t;
47674+
47675+struct cut_list;
47676+
47677+/* plugin->item.b.* */
47678+int can_contain_key_ctail(const coord_t *, const reiser4_key *,
47679+ const reiser4_item_data *);
47680+int mergeable_ctail(const coord_t * p1, const coord_t * p2);
47681+pos_in_node_t nr_units_ctail(const coord_t * coord);
47682+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
47683+void print_ctail(const char *prefix, coord_t * coord);
47684+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
47685+
47686+int paste_ctail(coord_t * coord, reiser4_item_data * data,
47687+ carry_plugin_info * info UNUSED_ARG);
47688+int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
47689+int can_shift_ctail(unsigned free_space, coord_t * coord,
47690+ znode * target, shift_direction pend, unsigned *size,
47691+ unsigned want);
47692+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
47693+ unsigned count, shift_direction where_is_free_space,
47694+ unsigned free_space);
47695+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47696+ carry_cut_data *, reiser4_key * smallest_removed,
47697+ reiser4_key * new_first);
47698+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47699+ carry_kill_data *, reiser4_key * smallest_removed,
47700+ reiser4_key * new_first);
47701+int ctail_ok(const coord_t * coord);
47702+int check_ctail(const coord_t * coord, const char **error);
47703+
47704+/* plugin->u.item.s.* */
47705+int read_ctail(struct file *, flow_t *, hint_t *);
47706+int readpage_ctail(void *, struct page *);
47707+int readpages_ctail(struct file *, struct address_space *, struct list_head *);
47708+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
47709+int create_hook_ctail(const coord_t * coord, void *arg);
47710+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
47711+ carry_kill_data *);
47712+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
47713+
47714+/* plugin->u.item.f */
47715+int utmost_child_ctail(const coord_t *, sideof, jnode **);
47716+int scan_ctail(flush_scan *);
47717+int convert_ctail(flush_pos_t *);
47718+size_t inode_scaled_cluster_size(struct inode *);
47719+
47720+#endif /* __FS_REISER4_CTAIL_H__ */
47721+
47722+/* Make Linus happy.
47723+ Local variables:
47724+ c-indentation-style: "K&R"
47725+ mode-name: "LC"
47726+ c-basic-offset: 8
47727+ tab-width: 8
47728+ fill-column: 120
47729+ End:
47730+*/
47731diff --git a/fs/reiser4/plugin/item/extent.c b/fs/reiser4/plugin/item/extent.c
47732new file mode 100644
47733index 0000000..e35a4d5
47734--- /dev/null
47735+++ b/fs/reiser4/plugin/item/extent.c
47736@@ -0,0 +1,197 @@
47737+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47738+
47739+#include "item.h"
47740+#include "../../key.h"
47741+#include "../../super.h"
47742+#include "../../carry.h"
47743+#include "../../inode.h"
47744+#include "../../page_cache.h"
47745+#include "../../flush.h"
47746+#include "../object.h"
47747+
47748+/* prepare structure reiser4_item_data. It is used to put one extent unit into tree */
47749+/* Audited by: green(2002.06.13) */
47750+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47751+ int nr_extents)
47752+{
47753+ data->data = ext_unit;
47754+ /* data->data is kernel space */
47755+ data->user = 0;
47756+ data->length = sizeof(reiser4_extent) * nr_extents;
47757+ data->arg = NULL;
47758+ data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47759+ return data;
47760+}
47761+
47762+/* how many bytes are addressed by @nr first extents of the extent item */
47763+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
47764+{
47765+ pos_in_node_t i;
47766+ reiser4_block_nr blocks;
47767+ reiser4_extent *ext;
47768+
47769+ ext = item_body_by_coord(coord);
47770+ assert("vs-263", nr <= nr_units_extent(coord));
47771+
47772+ blocks = 0;
47773+ for (i = 0; i < nr; i++, ext++) {
47774+ blocks += extent_get_width(ext);
47775+ }
47776+
47777+ return blocks * current_blocksize;
47778+}
47779+
47780+extent_state state_of_extent(reiser4_extent * ext)
47781+{
47782+ switch ((int)extent_get_start(ext)) {
47783+ case 0:
47784+ return HOLE_EXTENT;
47785+ case 1:
47786+ return UNALLOCATED_EXTENT;
47787+ default:
47788+ break;
47789+ }
47790+ return ALLOCATED_EXTENT;
47791+}
47792+
47793+int extent_is_unallocated(const coord_t * item)
47794+{
47795+ assert("jmacd-5133", item_is_extent(item));
47796+
47797+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47798+}
47799+
47800+/* set extent's start and width */
47801+void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
47802+ reiser4_block_nr width)
47803+{
47804+ extent_set_start(ext, start);
47805+ extent_set_width(ext, width);
47806+}
47807+
47808+/**
47809+ * reiser4_replace_extent - replace extent and paste 1 or 2 after it
47810+ * @h: replace handle set up by the caller: h->coord and h->lh point at
47811+ *     the extent unit to overwrite, h->overwrite is the replacement
47812+ *     extent, h->new_extents/h->nr_new_extents describe one or two
47813+ *     extents to paste after it, h->paste_key is the key to paste at,
47814+ *     and h->flags are passed to insert_into_item()
47815+ * @return_inserted_position: if 1, return h->coord and h->lh set to the
47816+ *     first inserted unit; if 0, set them to the overwritten extent
47817+ *
47818+ * Overwrites one extent, pastes 1 or 2 more ones after overwritten one. If
47819+ * @return_inserted_position is 1 - @un_extent and @lh are returned set to
47820+ * first of newly inserted units, if it is 0 - @un_extent and @lh are returned
47821+ * set to extent which was overwritten.
47822+ */
47823+int reiser4_replace_extent(struct replace_handle *h,
47824+ int return_inserted_position)
47825+{
47826+ int result;
47827+ znode *orig_znode;
47828+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47829+
47830+ assert("vs-990", coord_is_existing_unit(h->coord));
47831+ assert("vs-1375", znode_is_write_locked(h->coord->node));
47832+ assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47833+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47834+ assert("vs-1427", ergo(h->nr_new_extents == 2,
47835+ extent_get_width(&h->new_extents[1]) != 0));
47836+
47837+ /* compose structure for paste */
47838+ init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47839+
47840+ coord_dup(&h->coord_after, h->coord);
47841+ init_lh(&h->lh_after);
47842+ copy_lh(&h->lh_after, h->lh);
47843+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47844+ reiser4_tap_monitor(&h->watch);
47845+
47846+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47847+ orig_znode = h->coord->node;
47848+
47849+#if REISER4_DEBUG
47850+ /* make sure that key is set properly */
47851+ unit_key_by_coord(h->coord, &h->tmp);
47852+ set_key_offset(&h->tmp,
47853+ get_key_offset(&h->tmp) +
47854+ extent_get_width(&h->overwrite) * current_blocksize);
47855+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47856+#endif
47857+
47858+ /* set insert point after unit to be replaced */
47859+ h->coord->between = AFTER_UNIT;
47860+
47861+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47862+ &h->paste_key, &h->item, h->flags);
47863+ if (!result) {
47864+ /* now we have to replace the unit after which new units were
47865+ inserted. Its position is tracked by @watch */
47866+ reiser4_extent *ext;
47867+ znode *node;
47868+
47869+ node = h->coord_after.node;
47870+ if (node != orig_znode) {
47871+ coord_clear_iplug(&h->coord_after);
47872+ result = zload(node);
47873+ }
47874+
47875+ if (likely(!result)) {
47876+ ext = extent_by_coord(&h->coord_after);
47877+
47878+ assert("vs-987", znode_is_loaded(node));
47879+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47880+
47881+ /* overwrite extent unit */
47882+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47883+ znode_make_dirty(node);
47884+
47885+ if (node != orig_znode)
47886+ zrelse(node);
47887+
47888+ if (return_inserted_position == 0) {
47889+ /* coord and lh are to be set to overwritten
47890+ extent */
47891+ assert("vs-1662",
47892+ WITH_DATA(node, !memcmp(&h->overwrite,
47893+ extent_by_coord(
47894+ &h->coord_after),
47895+ sizeof(reiser4_extent))));
47896+
47897+ *h->coord = h->coord_after;
47898+ done_lh(h->lh);
47899+ copy_lh(h->lh, &h->lh_after);
47900+ } else {
47901+ /* h->coord and h->lh are to be set to first of
47902+ inserted units */
47903+ assert("vs-1663",
47904+ WITH_DATA(h->coord->node,
47905+ !memcmp(&h->new_extents[0],
47906+ extent_by_coord(h->coord),
47907+ sizeof(reiser4_extent))));
47908+ assert("vs-1664", h->lh->node == h->coord->node);
47909+ }
47910+ }
47911+ }
47912+ reiser4_tap_done(&h->watch);
47913+
47914+ return result;
47915+}
47916+
47917+lock_handle *znode_lh(znode *node)
47918+{
47919+ assert("vs-1371", znode_is_write_locked(node));
47920+ assert("vs-1372", znode_is_wlocked_once(node));
47921+ return list_entry(node->lock.owners.next, lock_handle, owners_link);
47922+}
47923+
47924+/*
47925+ * Local variables:
47926+ * c-indentation-style: "K&R"
47927+ * mode-name: "LC"
47928+ * c-basic-offset: 8
47929+ * tab-width: 8
47930+ * fill-column: 79
47931+ * scroll-step: 1
47932+ * End:
47933+ */
47934diff --git a/fs/reiser4/plugin/item/extent.h b/fs/reiser4/plugin/item/extent.h
47935new file mode 100644
47936index 0000000..d817d1b
47937--- /dev/null
47938+++ b/fs/reiser4/plugin/item/extent.h
47939@@ -0,0 +1,231 @@
47940+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47941+
47942+#ifndef __REISER4_EXTENT_H__
47943+#define __REISER4_EXTENT_H__
47944+
47945+/* on disk extent */
47946+typedef struct {
47947+ reiser4_dblock_nr start;
47948+ reiser4_dblock_nr width;
47949+} reiser4_extent;
47950+
47951+typedef struct extent_stat {
47952+ int unallocated_units;
47953+ int unallocated_blocks;
47954+ int allocated_units;
47955+ int allocated_blocks;
47956+ int hole_units;
47957+ int hole_blocks;
47958+} extent_stat;
47959+
47960+/* extents in an extent item can be either holes, or unallocated or allocated
47961+ extents */
47962+typedef enum {
47963+ HOLE_EXTENT,
47964+ UNALLOCATED_EXTENT,
47965+ ALLOCATED_EXTENT
47966+} extent_state;
47967+
47968+#define HOLE_EXTENT_START 0
47969+#define UNALLOCATED_EXTENT_START 1
47970+#define UNALLOCATED_EXTENT_START2 2
47971+
47972+typedef struct {
47973+ reiser4_block_nr pos_in_unit;
47974+ reiser4_block_nr width; /* width of current unit */
47975+ pos_in_node_t nr_units; /* number of units */
47976+ int ext_offset; /* offset from the beginning of zdata() */
47977+ unsigned long expected_page;
47978+#if REISER4_DEBUG
47979+ reiser4_extent extent;
47980+#endif
47981+} extent_coord_extension_t;
47982+
47983+/* macros to set/get fields of on-disk extent */
47984+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47985+{
47986+ return le64_to_cpu(ext->start);
47987+}
47988+
47989+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47990+{
47991+ return le64_to_cpu(ext->width);
47992+}
47993+
47994+extern __u64 reiser4_current_block_count(void);
47995+
47996+static inline void
47997+extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47998+{
47999+ cassert(sizeof(ext->start) == 8);
48000+ assert("nikita-2510",
48001+ ergo(start > 1, start < reiser4_current_block_count()));
48002+ put_unaligned(cpu_to_le64(start), &ext->start);
48003+}
48004+
48005+static inline void
48006+extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
48007+{
48008+ cassert(sizeof(ext->width) == 8);
48009+ assert("", width > 0);
48010+ put_unaligned(cpu_to_le64(width), &ext->width);
48011+ assert("nikita-2511",
48012+ ergo(extent_get_start(ext) > 1,
48013+ extent_get_start(ext) + width <=
48014+ reiser4_current_block_count()));
48015+}
48016+
48017+#define extent_item(coord) \
48018+({ \
48019+ assert("nikita-3143", item_is_extent(coord)); \
48020+ ((reiser4_extent *)item_body_by_coord (coord)); \
48021+})
48022+
48023+#define extent_by_coord(coord) \
48024+({ \
48025+ assert("nikita-3144", item_is_extent(coord)); \
48026+ (extent_item (coord) + (coord)->unit_pos); \
48027+})
48028+
48029+#define width_by_coord(coord) \
48030+({ \
48031+ assert("nikita-3145", item_is_extent(coord)); \
48032+ extent_get_width (extent_by_coord(coord)); \
48033+})
48034+
48035+struct carry_cut_data;
48036+struct carry_kill_data;
48037+
48038+/* plugin->u.item.b.* */
48039+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
48040+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
48041+ const reiser4_item_data *);
48042+int mergeable_extent(const coord_t * p1, const coord_t * p2);
48043+pos_in_node_t nr_units_extent(const coord_t *);
48044+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
48045+void init_coord_extent(coord_t *);
48046+int init_extent(coord_t *, reiser4_item_data *);
48047+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
48048+int can_shift_extent(unsigned free_space,
48049+ coord_t * source, znode * target, shift_direction,
48050+ unsigned *size, unsigned want);
48051+void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
48052+ unsigned count, shift_direction where_is_free_space,
48053+ unsigned free_space);
48054+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
48055+ struct carry_kill_data *);
48056+int create_hook_extent(const coord_t * coord, void *arg);
48057+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48058+ struct carry_cut_data *, reiser4_key * smallest_removed,
48059+ reiser4_key * new_first);
48060+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48061+ struct carry_kill_data *, reiser4_key * smallest_removed,
48062+ reiser4_key * new_first);
48063+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
48064+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
48065+void print_extent(const char *, coord_t *);
48066+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
48067+int utmost_child_real_block_extent(const coord_t * coord, sideof side,
48068+ reiser4_block_nr * block);
48069+void item_stat_extent(const coord_t * coord, void *vp);
48070+int reiser4_check_extent(const coord_t * coord, const char **error);
48071+
48072+/* plugin->u.item.s.file.* */
48073+ssize_t reiser4_write_extent(struct file *, const char __user *,
48074+ size_t, loff_t *);
48075+int reiser4_read_extent(struct file *, flow_t *, hint_t *);
48076+int reiser4_readpage_extent(void *, struct page *);
48077+int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
48078+reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
48079+void init_coord_extension_extent(uf_coord_t *, loff_t offset);
48080+int get_block_address_extent(const coord_t *, sector_t block,
48081+ sector_t * result);
48082+
48083+/* these are used in flush.c
48084+ FIXME-VS: should they be somewhere in item_plugin? */
48085+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
48086+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
48087+ reiser4_key * stop_key);
48088+
48089+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
48090+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
48091+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
48092+
48093+/* plugin->u.item.f. */
48094+int reiser4_scan_extent(flush_scan * scan);
48095+extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
48096+
48097+reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
48098+ int nr_extents);
48099+reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
48100+extent_state state_of_extent(reiser4_extent * ext);
48101+void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
48102+ reiser4_block_nr width);
48103+int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
48104+ int *plugged_hole);
48105+
48106+#include "../../coord.h"
48107+#include "../../lock.h"
48108+#include "../../tap.h"
48109+
48110+struct replace_handle {
48111+ /* these are to be set before calling reiser4_replace_extent */
48112+ coord_t *coord;
48113+ lock_handle *lh;
48114+ reiser4_key key;
48115+ reiser4_key *pkey;
48116+ reiser4_extent overwrite;
48117+ reiser4_extent new_extents[2];
48118+ int nr_new_extents;
48119+ unsigned flags;
48120+
48121+ /* these are used by reiser4_replace_extent */
48122+ reiser4_item_data item;
48123+ coord_t coord_after;
48124+ lock_handle lh_after;
48125+ tap_t watch;
48126+ reiser4_key paste_key;
48127+#if REISER4_DEBUG
48128+ reiser4_extent orig_ext;
48129+ reiser4_key tmp;
48130+#endif
48131+};
48132+
48133+/* this structure is kmalloced before calling make_extent to avoid excessive
48134+ stack consumption on plug_hole->reiser4_replace_extent */
48135+struct make_extent_handle {
48136+ uf_coord_t *uf_coord;
48137+ reiser4_block_nr blocknr;
48138+ int created;
48139+ struct inode *inode;
48140+ union {
48141+ struct {
48142+ } append;
48143+ struct replace_handle replace;
48144+ } u;
48145+};
48146+
48147+int reiser4_replace_extent(struct replace_handle *,
48148+ int return_inserted_position);
48149+lock_handle *znode_lh(znode *);
48150+
48151+/* the reiser4 repacker support */
48152+struct repacker_cursor;
48153+extern int process_extent_backward_for_repacking(tap_t *,
48154+ struct repacker_cursor *);
48155+extern int mark_extent_for_repacking(tap_t *, int);
48156+
48157+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
48158+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
48159+
48160+/* __REISER4_EXTENT_H__ */
48161+#endif
48162+/*
48163+ Local variables:
48164+ c-indentation-style: "K&R"
48165+ mode-name: "LC"
48166+ c-basic-offset: 8
48167+ tab-width: 8
48168+ fill-column: 120
48169+ End:
48170+*/
48171diff --git a/fs/reiser4/plugin/item/extent_file_ops.c b/fs/reiser4/plugin/item/extent_file_ops.c
48172new file mode 100644
48173index 0000000..cf337c4
48174--- /dev/null
48175+++ b/fs/reiser4/plugin/item/extent_file_ops.c
48176@@ -0,0 +1,1435 @@
48177+/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48178+
48179+#include "item.h"
48180+#include "../../inode.h"
48181+#include "../../page_cache.h"
48182+#include "../object.h"
48183+
48184+#include <linux/quotaops.h>
48185+#include <linux/swap.h>
48186+#include "../../../../mm/filemap.h"
48187+
48188+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
48189+{
48190+ reiser4_extent *ext;
48191+
48192+ ext = (reiser4_extent *) (zdata(node) + offset);
48193+ return ext;
48194+}
48195+
48196+/**
48197+ * check_uf_coord - verify coord extension
48198+ * @uf_coord:
48199+ * @key:
48200+ *
48201+ * Makes sure that all fields of @uf_coord are set properly. If @key is
48202+ * specified - check whether @uf_coord is set correspondingly.
48203+ */
48204+static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
48205+{
48206+#if REISER4_DEBUG
48207+ const coord_t *coord;
48208+ const extent_coord_extension_t *ext_coord;
48209+ reiser4_extent *ext;
48210+
48211+ coord = &uf_coord->coord;
48212+ ext_coord = &uf_coord->extension.extent;
48213+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
48214+
48215+ assert("",
48216+ WITH_DATA(coord->node,
48217+ (uf_coord->valid == 1 &&
48218+ coord_is_iplug_set(coord) &&
48219+ item_is_extent(coord) &&
48220+ ext_coord->nr_units == nr_units_extent(coord) &&
48221+ ext == extent_by_coord(coord) &&
48222+ ext_coord->width == extent_get_width(ext) &&
48223+ coord->unit_pos < ext_coord->nr_units &&
48224+ ext_coord->pos_in_unit < ext_coord->width &&
48225+ memcmp(ext, &ext_coord->extent,
48226+ sizeof(reiser4_extent)) == 0)));
48227+ if (key) {
48228+ reiser4_key coord_key;
48229+
48230+ unit_key_by_coord(&uf_coord->coord, &coord_key);
48231+ set_key_offset(&coord_key,
48232+ get_key_offset(&coord_key) +
48233+ (uf_coord->extension.extent.
48234+ pos_in_unit << PAGE_CACHE_SHIFT));
48235+ assert("", keyeq(key, &coord_key));
48236+ }
48237+#endif
48238+}
48239+
48240+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
48241+{
48242+ check_uf_coord(uf_coord, NULL);
48243+
48244+ return ext_by_offset(uf_coord->coord.node,
48245+ uf_coord->extension.extent.ext_offset);
48246+}
48247+
48248+#if REISER4_DEBUG
48249+
48250+/**
48251+ * offset_is_in_unit - check whether offset @off is inside the unit at @coord
48252+ * @coord: coord of an extent unit
48253+ * @off: file offset in bytes
48254+ * Returns 1 if @off is addressed by the unit, 0 otherwise.
48255+ */
48256+/* return 1 if offset @off is inside of extent unit pointed to by @coord. Set
48257+ pos_in_unit inside of unit correspondingly */
48258+static int offset_is_in_unit(const coord_t *coord, loff_t off)
48259+{
48260+ reiser4_key unit_key;
48261+ __u64 unit_off;
48262+ reiser4_extent *ext;
48263+
48264+ ext = extent_by_coord(coord);
48265+
48266+ unit_key_extent(coord, &unit_key);
48267+ unit_off = get_key_offset(&unit_key);
48268+ if (off < unit_off)
48269+ return 0;
48270+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
48271+ return 0;
48272+ return 1;
48273+}
48274+
48275+static int
48276+coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
48277+{
48278+ reiser4_key item_key;
48279+
48280+ assert("vs-771", coord_is_existing_unit(coord));
48281+ assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
48282+ assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
48283+
48284+ return offset_is_in_unit(coord, get_key_offset(key));
48285+}
48286+
48287+#endif
48288+
48289+/**
48290+ * can_append -
48291+ * @key: key to check
48292+ * @coord: coord of an extent item
48293+ *
48294+ * Returns 1 if @key is equal to an append key of item @coord is set to
48295+ */
48296+static int can_append(const reiser4_key *key, const coord_t *coord)
48297+{
48298+ reiser4_key append_key;
48299+
48300+ return keyeq(key, append_key_extent(coord, &append_key));
48301+}
48302+
48303+/**
48304+ * append_hole - append a hole extent to the last item of a file
48305+ * @coord: coord of the last extent item of the file (twig level)
48306+ * @lh: lock handle for @coord's node
48307+ * @key: key up to which the file is extended with a hole
48308+ *
48309+ */
48310+static int append_hole(coord_t *coord, lock_handle *lh,
48311+ const reiser4_key *key)
48312+{
48313+ reiser4_key append_key;
48314+ reiser4_block_nr hole_width;
48315+ reiser4_extent *ext, new_ext;
48316+ reiser4_item_data idata;
48317+
48318+ /* last item of file may have to be appended with hole */
48319+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
48320+ assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
48321+
48322+ /* key of first byte which is not addressed by this extent */
48323+ append_key_extent(coord, &append_key);
48324+
48325+ assert("", keyle(&append_key, key));
48326+
48327+ /*
48328+ * extent item has to be appended with hole. Calculate length of that
48329+ * hole
48330+ */
48331+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
48332+ current_blocksize - 1) >> current_blocksize_bits);
48333+ assert("vs-954", hole_width > 0);
48334+
48335+ /* set coord after last unit */
48336+ coord_init_after_item_end(coord);
48337+
48338+ /* get last extent in the item */
48339+ ext = extent_by_coord(coord);
48340+ if (state_of_extent(ext) == HOLE_EXTENT) {
48341+ /*
48342+ * last extent of a file is hole extent. Widen that extent by
48343+ * @hole_width blocks. Note that we do not worry about
48344+ * overflowing - extent width is 64 bits
48345+ */
48346+ reiser4_set_extent(ext, HOLE_EXTENT_START,
48347+ extent_get_width(ext) + hole_width);
48348+ znode_make_dirty(coord->node);
48349+ return 0;
48350+ }
48351+
48352+ /* append last item of the file with hole extent unit */
48353+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
48354+ state_of_extent(ext) == UNALLOCATED_EXTENT));
48355+
48356+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
48357+ init_new_extent(&idata, &new_ext, 1);
48358+ return insert_into_item(coord, lh, &append_key, &idata, 0);
48359+}
48360+
48361+/**
48362+ * check_jnodes
48363+ * @twig: longterm locked twig node
48364+ * @key:
48365+ *
48366+ */
48367+static void check_jnodes(znode *twig, const reiser4_key *key, int count)
48368+{
48369+#if REISER4_DEBUG
48370+ coord_t c;
48371+ reiser4_key node_key, jnode_key;
48372+
48373+ jnode_key = *key;
48374+
48375+ assert("", twig != NULL);
48376+ assert("", znode_get_level(twig) == TWIG_LEVEL);
48377+ assert("", znode_is_write_locked(twig));
48378+
48379+ zload(twig);
48380+ /* get the smallest key in twig node */
48381+ coord_init_first_unit(&c, twig);
48382+ unit_key_by_coord(&c, &node_key);
48383+ assert("", keyle(&node_key, &jnode_key));
48384+
48385+ coord_init_last_unit(&c, twig);
48386+ unit_key_by_coord(&c, &node_key);
48387+ if (item_plugin_by_coord(&c)->s.file.append_key)
48388+ item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
48389+ set_key_offset(&jnode_key,
48390+ get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
48391+ assert("", keylt(&jnode_key, &node_key));
48392+ zrelse(twig);
48393+#endif
48394+}
48395+
48396+/**
48397+ * append_last_extent - append last file item
48398+ * @uf_coord: coord to start insertion from
48399+ * @jnodes: array of jnodes
48400+ * @count: number of jnodes in the array
48401+ *
48402+ * There is already at least one extent item of file @inode in the tree. Append
48403+ * the last of them with unallocated extent unit of width @count. Assign
48404+ * fake block numbers to jnodes corresponding to the inserted extent.
48405+ */
48406+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48407+ jnode **jnodes, int count)
48408+{
48409+ int result;
48410+ reiser4_extent new_ext;
48411+ reiser4_item_data idata;
48412+ coord_t *coord;
48413+ extent_coord_extension_t *ext_coord;
48414+ reiser4_extent *ext;
48415+ reiser4_block_nr block;
48416+ jnode *node;
48417+ int i;
48418+
48419+ coord = &uf_coord->coord;
48420+ ext_coord = &uf_coord->extension.extent;
48421+ ext = ext_by_ext_coord(uf_coord);
48422+
48423+ /* check correctness of position in the item */
48424+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
48425+ assert("vs-1311", coord->between == AFTER_UNIT);
48426+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
48427+
48428+ if (!can_append(key, coord)) {
48429+ /* hole extent has to be inserted */
48430+ result = append_hole(coord, uf_coord->lh, key);
48431+ uf_coord->valid = 0;
48432+ return result;
48433+ }
48434+
48435+ if (count == 0)
48436+ return 0;
48437+
48438+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
48439+
48440+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
48441+ count);
48442+ BUG_ON(result != 0);
48443+
48444+ switch (state_of_extent(ext)) {
48445+ case UNALLOCATED_EXTENT:
48446+ /*
48447+ * last extent unit of the file is unallocated one. Increase
48448+ * its width by @count
48449+ */
48450+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
48451+ extent_get_width(ext) + count);
48452+ znode_make_dirty(coord->node);
48453+
48454+ /* update coord extension */
48455+ ext_coord->width += count;
48456+ ON_DEBUG(extent_set_width
48457+ (&uf_coord->extension.extent.extent,
48458+ ext_coord->width));
48459+ break;
48460+
48461+ case HOLE_EXTENT:
48462+ case ALLOCATED_EXTENT:
48463+ /*
48464+ * last extent unit of the file is either hole or allocated
48465+ * one. Append one unallocated extent of width @count
48466+ */
48467+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
48468+ init_new_extent(&idata, &new_ext, 1);
48469+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
48470+ uf_coord->valid = 0;
48471+ if (result)
48472+ return result;
48473+ break;
48474+
48475+ default:
48476+ return RETERR(-EIO);
48477+ }
48478+
48479+ /*
48480+ * make sure that we hold long term locked twig node containing all
48481+ * jnodes we are about to capture
48482+ */
48483+ check_jnodes(uf_coord->lh->node, key, count);
48484+
48485+ /*
48486+ * assign fake block numbers to all jnodes. FIXME: make sure whether
48487+ * twig node containing inserted extent item is locked
48488+ */
48489+ block = fake_blocknr_unformatted(count);
48490+ for (i = 0; i < count; i ++, block ++) {
48491+ node = jnodes[i];
48492+ spin_lock_jnode(node);
48493+ JF_SET(node, JNODE_CREATED);
48494+ jnode_set_block(node, &block);
48495+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48496+ BUG_ON(result != 0);
48497+ jnode_make_dirty_locked(node);
48498+ spin_unlock_jnode(node);
48499+ }
48500+ return count;
48501+}
48502+
48503+/**
48504+ * insert_first_hole - insert hole extent into tree
48505+ * @coord:
48506+ * @lh:
48507+ * @key:
48508+ *
48509+ *
48510+ */
48511+static int insert_first_hole(coord_t *coord, lock_handle *lh,
48512+ const reiser4_key *key)
48513+{
48514+ reiser4_extent new_ext;
48515+ reiser4_item_data idata;
48516+ reiser4_key item_key;
48517+ reiser4_block_nr hole_width;
48518+
48519+ /* @coord must be set for inserting of new item */
48520+ assert("vs-711", coord_is_between_items(coord));
48521+
48522+ item_key = *key;
48523+ set_key_offset(&item_key, 0ull);
48524+
48525+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
48526+ current_blocksize_bits);
48527+ assert("vs-710", hole_width > 0);
48528+
48529+ /* compose body of hole extent and insert item into tree */
48530+ reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
48531+ init_new_extent(&idata, &new_ext, 1);
48532+ return insert_extent_by_coord(coord, &idata, &item_key, lh);
48533+}
48534+
48535+
48536+/**
48537+ * insert_first_extent - insert first file item
48538+ * @inode: inode of file
48539+ * @uf_coord: coord to start insertion from
48540+ * @jnodes: array of jnodes
48541+ * @count: number of jnodes in the array
48542+ * @inode:
48543+ *
48544+ * There are no items of file @inode in the tree yet. Insert unallocated extent
48545+ * of width @count into the tree, or a hole extent if the write is not at the
48546+ * beginning. Assign fake block numbers to jnodes corresponding to the inserted
48547+ * unallocated extent. Returns number of jnodes or error code.
48548+ */
48549+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48550+ jnode **jnodes, int count,
48551+ struct inode *inode)
48552+{
48553+ int result;
48554+ int i;
48555+ reiser4_extent new_ext;
48556+ reiser4_item_data idata;
48557+ reiser4_block_nr block;
48558+ unix_file_info_t *uf_info;
48559+ jnode *node;
48560+
48561+ /* first extent insertion starts at leaf level */
48562+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
48563+ assert("vs-711", coord_is_between_items(&uf_coord->coord));
48564+
48565+ if (get_key_offset(key) != 0) {
48566+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
48567+ uf_coord->valid = 0;
48568+ uf_info = unix_file_inode_data(inode);
48569+
48570+ /*
48571+ * first item insertion is only possible when writing to empty
48572+ * file or performing tail conversion
48573+ */
48574+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
48575+ (reiser4_inode_get_flag(inode,
48576+ REISER4_PART_MIXED) &&
48577+ reiser4_inode_get_flag(inode,
48578+ REISER4_PART_IN_CONV))));
48579+ /* if file was empty - update its state */
48580+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
48581+ uf_info->container = UF_CONTAINER_EXTENTS;
48582+ return result;
48583+ }
48584+
48585+ if (count == 0)
48586+ return 0;
48587+
48588+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
48589+ BUG_ON(result != 0);
48590+
48591+ /*
48592+ * prepare for tree modification: compose body of item and item data
48593+ * structure needed for insertion
48594+ */
48595+ reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
48596+ init_new_extent(&idata, &new_ext, 1);
48597+
48598+ /* insert extent item into the tree */
48599+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
48600+ uf_coord->lh);
48601+ if (result)
48602+ return result;
48603+
48604+ /*
48605+ * make sure that we hold long term locked twig node containing all
48606+ * jnodes we are about to capture
48607+ */
48608+ check_jnodes(uf_coord->lh->node, key, count);
48609+ /*
48610+ * assign fake block numbers to all jnodes, capture and mark them dirty
48611+ */
48612+ block = fake_blocknr_unformatted(count);
48613+ for (i = 0; i < count; i ++, block ++) {
48614+ node = jnodes[i];
48615+ spin_lock_jnode(node);
48616+ JF_SET(node, JNODE_CREATED);
48617+ jnode_set_block(node, &block);
48618+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48619+ BUG_ON(result != 0);
48620+ jnode_make_dirty_locked(node);
48621+ spin_unlock_jnode(node);
48622+ }
48623+
48624+ /*
48625+ * invalidate coordinate, research must be performed to continue
48626+ * because write will continue on twig level
48627+ */
48628+ uf_coord->valid = 0;
48629+ return count;
48630+}
48631+
48632+/**
48633+ * plug_hole - replace part of a hole extent with an unallocated extent
48634+ * @uf_coord: coord extension set to the position inside the hole extent
48635+ * @key: key of the position being plugged
48636+ * @how: out parameter recording which case was taken (see the
48637+ * assignments to *how in the body)
48638+ *
48639+ * Creates an unallocated extent of width 1 within a hole. In worst case two
48640+ * additional extents can be created.
48641+ */
48642+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
48643+{
48644+ struct replace_handle rh;
48645+ reiser4_extent *ext;
48646+ reiser4_block_nr width, pos_in_unit;
48647+ coord_t *coord;
48648+ extent_coord_extension_t *ext_coord;
48649+ int return_inserted_position;
48650+
48651+ check_uf_coord(uf_coord, key);
48652+
48653+ rh.coord = coord_by_uf_coord(uf_coord);
48654+ rh.lh = uf_coord->lh;
48655+ rh.flags = 0;
48656+
48657+ coord = coord_by_uf_coord(uf_coord);
48658+ ext_coord = ext_coord_by_uf_coord(uf_coord);
48659+ ext = ext_by_ext_coord(uf_coord);
48660+
48661+ width = ext_coord->width;
48662+ pos_in_unit = ext_coord->pos_in_unit;
48663+
48664+ *how = 0;
48665+ if (width == 1) {
48666+ reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
48667+ znode_make_dirty(coord->node);
48668+ /* update uf_coord */
48669+ ON_DEBUG(ext_coord->extent = *ext);
48670+ *how = 1;
48671+ return 0;
48672+ } else if (pos_in_unit == 0) {
48673+ /* we deal with first element of extent */
48674+ if (coord->unit_pos) {
48675+ /* there is an extent to the left */
48676+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
48677+ /*
48678+ * left neighboring unit is an unallocated
48679+ * extent. Increase its width and decrease
48680+ * width of hole
48681+ */
48682+ extent_set_width(ext - 1,
48683+ extent_get_width(ext - 1) + 1);
48684+ extent_set_width(ext, width - 1);
48685+ znode_make_dirty(coord->node);
48686+
48687+ /* update coord extension */
48688+ coord->unit_pos--;
48689+ ext_coord->width = extent_get_width(ext - 1);
48690+ ext_coord->pos_in_unit = ext_coord->width - 1;
48691+ ext_coord->ext_offset -= sizeof(reiser4_extent);
48692+ ON_DEBUG(ext_coord->extent =
48693+ *extent_by_coord(coord));
48694+ *how = 2;
48695+ return 0;
48696+ }
48697+ }
48698+ /* extent for replace */
48699+ reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
48700+ /* extent to be inserted */
48701+ reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
48702+ width - 1);
48703+ rh.nr_new_extents = 1;
48704+
48705+ /* have reiser4_replace_extent to return with @coord and
48706+ @uf_coord->lh set to unit which was replaced */
48707+ return_inserted_position = 0;
48708+ *how = 3;
48709+ } else if (pos_in_unit == width - 1) {
48710+ /* we deal with last element of extent */
48711+ if (coord->unit_pos < nr_units_extent(coord) - 1) {
48712+ /* there is an extent unit to the right */
48713+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
48714+ /*
48715+ * right neighboring unit is an unallocated
48716+ * extent. Increase its width and decrease
48717+ * width of hole
48718+ */
48719+ extent_set_width(ext + 1,
48720+ extent_get_width(ext + 1) + 1);
48721+ extent_set_width(ext, width - 1);
48722+ znode_make_dirty(coord->node);
48723+
48724+ /* update coord extension */
48725+ coord->unit_pos++;
48726+ ext_coord->width = extent_get_width(ext + 1);
48727+ ext_coord->pos_in_unit = 0;
48728+ ext_coord->ext_offset += sizeof(reiser4_extent);
48729+ ON_DEBUG(ext_coord->extent =
48730+ *extent_by_coord(coord));
48731+ *how = 4;
48732+ return 0;
48733+ }
48734+ }
48735+ /* extent for replace */
48736+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
48737+ /* extent to be inserted */
48738+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
48739+ 1);
48740+ rh.nr_new_extents = 1;
48741+
48742+ /* have reiser4_replace_extent to return with @coord and
48743+ @uf_coord->lh set to unit which was inserted */
48744+ return_inserted_position = 1;
48745+ *how = 5;
48746+ } else {
48747+ /* extent for replace */
48748+ reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
48749+ pos_in_unit);
48750+ /* extents to be inserted */
48751+ reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
48752+ 1);
48753+ reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
48754+ width - pos_in_unit - 1);
48755+ rh.nr_new_extents = 2;
48756+
48757+ /* have reiser4_replace_extent to return with @coord and
48758+ @uf_coord->lh set to first of units which were inserted */
48759+ return_inserted_position = 1;
48760+ *how = 6;
48761+ }
48762+ unit_key_by_coord(coord, &rh.paste_key);
48763+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
48764+ extent_get_width(&rh.overwrite) * current_blocksize);
48765+
48766+ uf_coord->valid = 0;
48767+ return reiser4_replace_extent(&rh, return_inserted_position);
48768+}
48769+
48770+/**
48771+ * overwrite_one_block -
48772+ * @uf_coord:
48773+ * @key:
48774+ * @node:
48775+ *
48776+ * If @node corresponds to hole extent - create unallocated extent for it and
48777+ * assign fake block number. If @node corresponds to allocated extent - assign
48778+ * block number of jnode
48779+ */
48780+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
48781+ jnode *node, int *hole_plugged)
48782+{
48783+ int result;
48784+ extent_coord_extension_t *ext_coord;
48785+ reiser4_extent *ext;
48786+ reiser4_block_nr block;
48787+ int how;
48788+
48789+ assert("vs-1312", uf_coord->coord.between == AT_UNIT);
48790+
48791+ result = 0;
48792+ ext_coord = ext_coord_by_uf_coord(uf_coord);
48793+ ext = ext_by_ext_coord(uf_coord);
48794+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
48795+
48796+ switch (state_of_extent(ext)) {
48797+ case ALLOCATED_EXTENT:
48798+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
48799+ break;
48800+
48801+ case HOLE_EXTENT:
48802+ result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
48803+ BUG_ON(result != 0);
48804+ result = plug_hole(uf_coord, key, &how);
48805+ if (result)
48806+ return result;
48807+ block = fake_blocknr_unformatted(1);
48808+ if (hole_plugged)
48809+ *hole_plugged = 1;
48810+ JF_SET(node, JNODE_CREATED);
48811+ break;
48812+
48813+ default:
48814+ return RETERR(-EIO);
48815+ }
48816+
48817+ jnode_set_block(node, &block);
48818+ return 0;
48819+}
48820+
48821+/**
48822+ * move_coord - move coordinate forward
48823+ * @uf_coord:
48824+ *
48825+ * Move coordinate one data block pointer forward. Return 1 if coord is set to
48826+ * the last one already or is invalid.
48827+ */
48828+static int move_coord(uf_coord_t *uf_coord)
48829+{
48830+ extent_coord_extension_t *ext_coord;
48831+
48832+ if (uf_coord->valid == 0)
48833+ return 1;
48834+ ext_coord = &uf_coord->extension.extent;
48835+ ext_coord->pos_in_unit ++;
48836+ if (ext_coord->pos_in_unit < ext_coord->width)
48837+ /* coordinate moved within the unit */
48838+ return 0;
48839+
48840+ /* end of unit is reached. Try to move to next unit */
48841+ ext_coord->pos_in_unit = 0;
48842+ uf_coord->coord.unit_pos ++;
48843+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
48844+ /* coordinate moved to next unit */
48845+ ext_coord->ext_offset += sizeof(reiser4_extent);
48846+ ext_coord->width =
48847+ extent_get_width(ext_by_offset
48848+ (uf_coord->coord.node,
48849+ ext_coord->ext_offset));
48850+ ON_DEBUG(ext_coord->extent =
48851+ *ext_by_offset(uf_coord->coord.node,
48852+ ext_coord->ext_offset));
48853+ return 0;
48854+ }
48855+ /* end of item is reached */
48856+ uf_coord->valid = 0;
48857+ return 1;
48858+}
48859+
48860+/**
48861+ * overwrite_extent -
48862+ * @inode:
48863+ *
48864+ * Returns number of handled jnodes.
48865+ */
48866+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48867+ jnode **jnodes, int count, int *plugged_hole)
48868+{
48869+ int result;
48870+ reiser4_key k;
48871+ int i;
48872+ jnode *node;
48873+
48874+ k = *key;
48875+ for (i = 0; i < count; i ++) {
48876+ node = jnodes[i];
48877+ if (*jnode_get_block(node) == 0) {
48878+ result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
48879+ if (result)
48880+ return result;
48881+ }
48882+ /*
48883+ * make sure that we hold long term locked twig node containing
48884+ * all jnodes we are about to capture
48885+ */
48886+ check_jnodes(uf_coord->lh->node, &k, 1);
48887+ /*
48888+ * assign fake block numbers to all jnodes, capture and mark
48889+ * them dirty
48890+ */
48891+ spin_lock_jnode(node);
48892+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48893+ BUG_ON(result != 0);
48894+ jnode_make_dirty_locked(node);
48895+ spin_unlock_jnode(node);
48896+
48897+ if (uf_coord->valid == 0)
48898+ return i + 1;
48899+
48900+ check_uf_coord(uf_coord, &k);
48901+
48902+ if (move_coord(uf_coord)) {
48903+ /*
48904+ * failed to move to the next node pointer. Either end
48905+ * of file or end of twig node is reached. In the later
48906+ * case we might go to the right neighbor.
48907+ */
48908+ uf_coord->valid = 0;
48909+ return i + 1;
48910+ }
48911+ set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
48912+ }
48913+
48914+ return count;
48915+}
48916+
48917+/**
48918+ * reiser4_update_extent
48919+ * @file:
48920+ * @jnodes:
48921+ * @count:
48922+ * @off:
48923+ *
48924+ */
48925+int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
48926+ int *plugged_hole)
48927+{
48928+ int result;
48929+ znode *loaded;
48930+ uf_coord_t uf_coord;
48931+ coord_t *coord;
48932+ lock_handle lh;
48933+ reiser4_key key;
48934+
48935+ assert("", reiser4_lock_counters()->d_refs == 0);
48936+
48937+ key_by_inode_and_offset_common(inode, pos, &key);
48938+
48939+ init_uf_coord(&uf_coord, &lh);
48940+ coord = &uf_coord.coord;
48941+ result = find_file_item_nohint(coord, &lh, &key,
48942+ ZNODE_WRITE_LOCK, inode);
48943+ if (IS_CBKERR(result)) {
48944+ assert("", reiser4_lock_counters()->d_refs == 0);
48945+ return result;
48946+ }
48947+
48948+ result = zload(coord->node);
48949+ BUG_ON(result != 0);
48950+ loaded = coord->node;
48951+
48952+ if (coord->between == AFTER_UNIT) {
48953+ /*
48954+ * append existing extent item with unallocated extent of width
48955+ * nr_jnodes
48956+ */
48957+ init_coord_extension_extent(&uf_coord,
48958+ get_key_offset(&key));
48959+ result = append_last_extent(&uf_coord, &key,
48960+ &node, 1);
48961+ } else if (coord->between == AT_UNIT) {
48962+ /*
48963+ * overwrite
48964+ * not optimal yet. Will be optimized if new write will show
48965+ * performance win.
48966+ */
48967+ init_coord_extension_extent(&uf_coord,
48968+ get_key_offset(&key));
48969+ result = overwrite_extent(&uf_coord, &key,
48970+ &node, 1, plugged_hole);
48971+ } else {
48972+ /*
48973+ * there are no items of this file in the tree yet. Create
48974+ * first item of the file inserting one unallocated extent of
48975+ * width nr_jnodes
48976+ */
48977+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
48978+ }
48979+ assert("", result == 1 || result < 0);
48980+ zrelse(loaded);
48981+ done_lh(&lh);
48982+ assert("", reiser4_lock_counters()->d_refs == 0);
48983+ return (result == 1) ? 0 : result;
48984+}
48985+
48986+/**
48987+ * update_extents
48988+ * @file:
48989+ * @jnodes:
48990+ * @count:
48991+ * @off:
48992+ *
48993+ */
48994+static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
48995+{
48996+ struct inode *inode;
48997+ struct hint hint;
48998+ reiser4_key key;
48999+ int result;
49000+ znode *loaded;
49001+
49002+ result = load_file_hint(file, &hint);
49003+ BUG_ON(result != 0);
49004+
49005+ inode = file->f_dentry->d_inode;
49006+ if (count != 0)
49007+ /*
49008+ * count == 0 is special case: expanding truncate
49009+ */
49010+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
49011+ key_by_inode_and_offset_common(inode, pos, &key);
49012+
49013+ assert("", reiser4_lock_counters()->d_refs == 0);
49014+
49015+ do {
49016+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
49017+ if (IS_CBKERR(result)) {
49018+ assert("", reiser4_lock_counters()->d_refs == 0);
49019+ return result;
49020+ }
49021+
49022+ result = zload(hint.ext_coord.coord.node);
49023+ BUG_ON(result != 0);
49024+ loaded = hint.ext_coord.coord.node;
49025+
49026+ if (hint.ext_coord.coord.between == AFTER_UNIT) {
49027+ /*
49028+ * append existing extent item with unallocated extent
49029+ * of width nr_jnodes
49030+ */
49031+ if (hint.ext_coord.valid == 0)
49032+ /* NOTE: get statistics on this */
49033+ init_coord_extension_extent(&hint.ext_coord,
49034+ get_key_offset(&key));
49035+ result = append_last_extent(&hint.ext_coord, &key,
49036+ jnodes, count);
49037+ } else if (hint.ext_coord.coord.between == AT_UNIT) {
49038+ /*
49039+ * overwrite
49040+ * not optimal yet. Will be optimized if new write will
49041+ * show performance win.
49042+ */
49043+ if (hint.ext_coord.valid == 0)
49044+ /* NOTE: get statistics on this */
49045+ init_coord_extension_extent(&hint.ext_coord,
49046+ get_key_offset(&key));
49047+ result = overwrite_extent(&hint.ext_coord, &key,
49048+ jnodes, count, NULL);
49049+ } else {
49050+ /*
49051+ * there are no items of this file in the tree
49052+ * yet. Create first item of the file inserting one
49053+ * unallocated extent of * width nr_jnodes
49054+ */
49055+ result = insert_first_extent(&hint.ext_coord, &key,
49056+ jnodes, count, inode);
49057+ }
49058+ zrelse(loaded);
49059+ if (result < 0) {
49060+ done_lh(hint.ext_coord.lh);
49061+ break;
49062+ }
49063+
49064+ jnodes += result;
49065+ count -= result;
49066+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
49067+
49068+ /* seal and unlock znode */
49069+ if (hint.ext_coord.valid)
49070+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
49071+ else
49072+ reiser4_unset_hint(&hint);
49073+
49074+ } while (count > 0);
49075+
49076+ save_file_hint(file, &hint);
49077+ assert("", reiser4_lock_counters()->d_refs == 0);
49078+ return result;
49079+}
49080+
49081+/**
49082+ * write_extent_reserve_space - reserve space for extent write operation
49083+ * @inode:
49084+ *
49085+ * Estimates and reserves space which may be required for writing
49086+ * WRITE_GRANULARITY pages of file.
49087+ */
49088+static int write_extent_reserve_space(struct inode *inode)
49089+{
49090+ __u64 count;
49091+ reiser4_tree *tree;
49092+
49093+ /*
49094+ * to write WRITE_GRANULARITY pages to a file by extents we have to
49095+ * reserve disk space for:
49096+
49097+ * 1. find_file_item may have to insert empty node to the tree (empty
49098+ * leaf node between two extent items). This requires 1 block and
49099+ * number of blocks which are necessary to perform insertion of an
49100+ * internal item into twig level.
49101+
49102+ * 2. for each of written pages there might be needed 1 block and
49103+ * number of blocks which might be necessary to perform insertion of or
49104+ * paste to an extent item.
49105+
49106+ * 3. stat data update
49107+ */
49108+ tree = reiser4_tree_by_inode(inode);
49109+ count = estimate_one_insert_item(tree) +
49110+ WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
49111+ estimate_one_insert_item(tree);
49112+ grab_space_enable();
49113+ return reiser4_grab_space(count, 0 /* flags */);
49114+}
49115+
49116+/**
49117+ * reiser4_write_extent - write method of extent item plugin
49118+ * @file: file to write to
49119+ * @buf: address of user-space buffer
49120+ * @write_amount: number of bytes to write
49121+ * @off: position in file to write to
49122+ *
49123+ */
49124+ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
49125+ size_t count, loff_t *pos)
49126+{
49127+ int have_to_update_extent;
49128+ int nr_pages;
49129+ struct page *page;
49130+ jnode *jnodes[WRITE_GRANULARITY + 1];
49131+ struct inode *inode;
49132+ unsigned long index;
49133+ unsigned long end;
49134+ int i;
49135+ int to_page, page_off;
49136+ size_t left, written;
49137+ int result;
49138+
49139+ inode = file->f_dentry->d_inode;
49140+ if (write_extent_reserve_space(inode))
49141+ return RETERR(-ENOSPC);
49142+
49143+ if (count == 0) {
49144+ /* truncate case */
49145+ update_extents(file, jnodes, 0, *pos);
49146+ return 0;
49147+ }
49148+
49149+ BUG_ON(get_current_context()->trans->atom != NULL);
49150+
49151+ index = *pos >> PAGE_CACHE_SHIFT;
49152+ /* calculate number of pages which are to be written */
49153+ end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
49154+ nr_pages = end - index + 1;
49155+ assert("", nr_pages <= WRITE_GRANULARITY + 1);
49156+
49157+ /* get pages and jnodes */
49158+ for (i = 0; i < nr_pages; i ++) {
49159+ page = find_or_create_page(inode->i_mapping, index + i,
49160+ reiser4_ctx_gfp_mask_get());
49161+ if (page == NULL) {
49162+ while(i --) {
49163+ unlock_page(jnode_page(jnodes[i]));
49164+ page_cache_release(jnode_page(jnodes[i]));
49165+ }
49166+ return RETERR(-ENOMEM);
49167+ }
49168+
49169+ jnodes[i] = jnode_of_page(page);
49170+ if (IS_ERR(jnodes[i])) {
49171+ unlock_page(page);
49172+ page_cache_release(page);
49173+ while (i --) {
49174+ jput(jnodes[i]);
49175+ page_cache_release(jnode_page(jnodes[i]));
49176+ }
49177+ return RETERR(-ENOMEM);
49178+ }
49179+ /* prevent jnode and page from disconnecting */
49180+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
49181+ unlock_page(page);
49182+ }
49183+
49184+ BUG_ON(get_current_context()->trans->atom != NULL);
49185+
49186+ have_to_update_extent = 0;
49187+
49188+ left = count;
49189+ page_off = (*pos & (PAGE_CACHE_SIZE - 1));
49190+ for (i = 0; i < nr_pages; i ++) {
49191+ to_page = PAGE_CACHE_SIZE - page_off;
49192+ if (to_page > left)
49193+ to_page = left;
49194+ page = jnode_page(jnodes[i]);
49195+ if (page_offset(page) < inode->i_size &&
49196+ !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
49197+ /*
49198+ * the above is not optimal for partial write to last
49199+ * page of file when file size is not at boundary of
49200+ * page
49201+ */
49202+ lock_page(page);
49203+ if (!PageUptodate(page)) {
49204+ result = readpage_unix_file(NULL, page);
49205+ BUG_ON(result != 0);
49206+ /* wait for read completion */
49207+ lock_page(page);
49208+ BUG_ON(!PageUptodate(page));
49209+ } else
49210+ result = 0;
49211+ unlock_page(page);
49212+ }
49213+
49214+ BUG_ON(get_current_context()->trans->atom != NULL);
49215+ fault_in_pages_readable(buf, to_page);
49216+ BUG_ON(get_current_context()->trans->atom != NULL);
49217+
49218+ lock_page(page);
49219+ if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
49220+ void *kaddr;
49221+
49222+ kaddr = kmap_atomic(page, KM_USER0);
49223+ memset(kaddr, 0, page_off);
49224+ memset(kaddr + page_off + to_page, 0,
49225+ PAGE_CACHE_SIZE - (page_off + to_page));
49226+ flush_dcache_page(page);
49227+ kunmap_atomic(kaddr, KM_USER0);
49228+ }
49229+
49230+ written = filemap_copy_from_user(page, page_off, buf, to_page);
49231+ flush_dcache_page(page);
49232+ reiser4_set_page_dirty_internal(page);
49233+ unlock_page(page);
49234+ mark_page_accessed(page);
49235+ SetPageUptodate(page);
49236+ page_cache_release(page);
49237+
49238+ if (jnodes[i]->blocknr == 0)
49239+ have_to_update_extent ++;
49240+
49241+ page_off = 0;
49242+ buf += to_page;
49243+ left -= to_page;
49244+ BUG_ON(get_current_context()->trans->atom != NULL);
49245+ }
49246+
49247+ if (have_to_update_extent) {
49248+ update_extents(file, jnodes, nr_pages, *pos);
49249+ } else {
49250+ for (i = 0; i < nr_pages; i ++) {
49251+ spin_lock_jnode(jnodes[i]);
49252+ result = reiser4_try_capture(jnodes[i],
49253+ ZNODE_WRITE_LOCK, 0);
49254+ BUG_ON(result != 0);
49255+ jnode_make_dirty_locked(jnodes[i]);
49256+ spin_unlock_jnode(jnodes[i]);
49257+ }
49258+ }
49259+
49260+ for (i = 0; i < nr_pages; i ++) {
49261+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
49262+ jput(jnodes[i]);
49263+ }
49264+
49265+ /* the only error handled so far is EFAULT on copy_from_user */
49266+ return (count - left) ? (count - left) : -EFAULT;
49267+}
49268+
49269+static inline void zero_page(struct page *page)
49270+{
49271+ char *kaddr = kmap_atomic(page, KM_USER0);
49272+
49273+ memset(kaddr, 0, PAGE_CACHE_SIZE);
49274+ flush_dcache_page(page);
49275+ kunmap_atomic(kaddr, KM_USER0);
49276+ SetPageUptodate(page);
49277+ unlock_page(page);
49278+}
49279+
49280+int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
49281+ struct page *page)
49282+{
49283+ jnode *j;
49284+ struct address_space *mapping;
49285+ unsigned long index;
49286+ oid_t oid;
49287+ reiser4_block_nr block;
49288+
49289+ mapping = page->mapping;
49290+ oid = get_inode_oid(mapping->host);
49291+ index = page->index;
49292+
49293+ switch (state_of_extent(ext)) {
49294+ case HOLE_EXTENT:
49295+ /*
49296+ * it is possible to have hole page with jnode, if page was
49297+ * eflushed previously.
49298+ */
49299+ j = jfind(mapping, index);
49300+ if (j == NULL) {
49301+ zero_page(page);
49302+ return 0;
49303+ }
49304+ spin_lock_jnode(j);
49305+ if (!jnode_page(j)) {
49306+ jnode_attach_page(j, page);
49307+ } else {
49308+ BUG_ON(jnode_page(j) != page);
49309+ assert("vs-1504", jnode_page(j) == page);
49310+ }
49311+ block = *jnode_get_io_block(j);
49312+ spin_unlock_jnode(j);
49313+ if (block == 0) {
49314+ zero_page(page);
49315+ jput(j);
49316+ return 0;
49317+ }
49318+ break;
49319+
49320+ case ALLOCATED_EXTENT:
49321+ j = jnode_of_page(page);
49322+ if (IS_ERR(j))
49323+ return PTR_ERR(j);
49324+ if (*jnode_get_block(j) == 0) {
49325+ reiser4_block_nr blocknr;
49326+
49327+ blocknr = extent_get_start(ext) + pos;
49328+ jnode_set_block(j, &blocknr);
49329+ } else
49330+ assert("vs-1403",
49331+ j->blocknr == extent_get_start(ext) + pos);
49332+ break;
49333+
49334+ case UNALLOCATED_EXTENT:
49335+ j = jfind(mapping, index);
49336+ assert("nikita-2688", j);
49337+ assert("vs-1426", jnode_page(j) == NULL);
49338+
49339+ spin_lock_jnode(j);
49340+ jnode_attach_page(j, page);
49341+ spin_unlock_jnode(j);
49342+ break;
49343+
49344+ default:
49345+ warning("vs-957", "wrong extent\n");
49346+ return RETERR(-EIO);
49347+ }
49348+
49349+ BUG_ON(j == 0);
49350+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
49351+ jput(j);
49352+ return 0;
49353+}
49354+
49355+/* Implements plugin->u.item.s.file.read operation for extent items. */
49356+int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
49357+{
49358+ int result;
49359+ struct page *page;
49360+ unsigned long cur_page, next_page;
49361+ unsigned long page_off, count;
49362+ struct address_space *mapping;
49363+ loff_t file_off;
49364+ uf_coord_t *uf_coord;
49365+ coord_t *coord;
49366+ extent_coord_extension_t *ext_coord;
49367+ unsigned long nr_pages;
49368+ char *kaddr;
49369+
49370+ assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
49371+ assert("vs-572", flow->user == 1);
49372+ assert("vs-1351", flow->length > 0);
49373+
49374+ uf_coord = &hint->ext_coord;
49375+
49376+ check_uf_coord(uf_coord, NULL);
49377+ assert("vs-33", uf_coord->lh == &hint->lh);
49378+
49379+ coord = &uf_coord->coord;
49380+ assert("vs-1119", znode_is_rlocked(coord->node));
49381+ assert("vs-1120", znode_is_loaded(coord->node));
49382+ assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
49383+
49384+ mapping = file->f_dentry->d_inode->i_mapping;
49385+ ext_coord = &uf_coord->extension.extent;
49386+
49387+ /* offset in a file to start read from */
49388+ file_off = get_key_offset(&flow->key);
49389+ /* offset within the page to start read from */
49390+ page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
49391+ /* bytes which can be read from the page which contains file_off */
49392+ count = PAGE_CACHE_SIZE - page_off;
49393+
49394+ /* index of page containing offset read is to start from */
49395+ cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
49396+ next_page = cur_page;
49397+ /* number of pages flow spans over */
49398+ nr_pages =
49399+ ((file_off + flow->length + PAGE_CACHE_SIZE -
49400+ 1) >> PAGE_CACHE_SHIFT) - cur_page;
49401+
49402+ /* we start having twig node read locked. However, we do not want to
49403+ keep that lock all the time readahead works. So, set a sel and
49404+ release twig node. */
49405+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
49406+ /* &hint->lh is done-ed */
49407+
49408+ do {
49409+ reiser4_txn_restart_current();
49410+ page = read_mapping_page(mapping, cur_page, file);
49411+ if (IS_ERR(page))
49412+ return PTR_ERR(page);
49413+ lock_page(page);
49414+ if (!PageUptodate(page)) {
49415+ unlock_page(page);
49416+ page_cache_release(page);
49417+ warning("jmacd-97178", "extent_read: page is not up to date");
49418+ return RETERR(-EIO);
49419+ }
49420+ mark_page_accessed(page);
49421+ unlock_page(page);
49422+
49423+ /* If users can be writing to this page using arbitrary virtual
49424+ addresses, take care about potential aliasing before reading
49425+ the page on the kernel side.
49426+ */
49427+ if (mapping_writably_mapped(mapping))
49428+ flush_dcache_page(page);
49429+
49430+ assert("nikita-3034", reiser4_schedulable());
49431+
49432+ /* number of bytes which are to be read from the page */
49433+ if (count > flow->length)
49434+ count = flow->length;
49435+
49436+ result = fault_in_pages_writeable(flow->data, count);
49437+ if (result) {
49438+ page_cache_release(page);
49439+ return RETERR(-EFAULT);
49440+ }
49441+
49442+ kaddr = kmap_atomic(page, KM_USER0);
49443+ result = __copy_to_user_inatomic(flow->data,
49444+ kaddr + page_off, count);
49445+ kunmap_atomic(kaddr, KM_USER0);
49446+ if (result != 0) {
49447+ kaddr = kmap(page);
49448+ result = __copy_to_user(flow->data, kaddr + page_off, count);
49449+ kunmap(page);
49450+ if (unlikely(result))
49451+ return RETERR(-EFAULT);
49452+ }
49453+
49454+ page_cache_release(page);
49455+
49456+ /* increase key (flow->key), update user area pointer (flow->data) */
49457+ move_flow_forward(flow, count);
49458+
49459+ page_off = 0;
49460+ cur_page ++;
49461+ count = PAGE_CACHE_SIZE;
49462+ nr_pages--;
49463+ } while (flow->length);
49464+
49465+ return 0;
49466+}
49467+
49468+/*
49469+ plugin->s.file.readpage
49470+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
49471+ or
49472+ filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_extent
49473+
49474+ At the beginning: coord->node is read locked, zloaded, page is
49475+ locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index)
49476+*/
49477+int reiser4_readpage_extent(void *vp, struct page *page)
49478+{
49479+ uf_coord_t *uf_coord = vp;
49480+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
49481+ ON_DEBUG(reiser4_key key);
49482+
49483+ assert("vs-1040", PageLocked(page));
49484+ assert("vs-1050", !PageUptodate(page));
49485+ assert("vs-1039", page->mapping && page->mapping->host);
49486+
49487+ assert("vs-1044", znode_is_loaded(coord->node));
49488+ assert("vs-758", item_is_extent(coord));
49489+ assert("vs-1046", coord_is_existing_unit(coord));
49490+ assert("vs-1045", znode_is_rlocked(coord->node));
49491+ assert("vs-1047",
49492+ page->mapping->host->i_ino ==
49493+ get_key_objectid(item_key_by_coord(coord, &key)));
49494+ check_uf_coord(uf_coord, NULL);
49495+
49496+ return reiser4_do_readpage_extent(
49497+ ext_by_ext_coord(uf_coord),
49498+ uf_coord->extension.extent.pos_in_unit, page);
49499+}
49500+
49501+/**
49502+ * get_block_address_extent
49503+ * @coord:
49504+ * @block:
49505+ * @result:
49506+ *
49507+ *
49508+ */
49509+int get_block_address_extent(const coord_t *coord, sector_t block,
49510+ sector_t *result)
49511+{
49512+ reiser4_extent *ext;
49513+
49514+ if (!coord_is_existing_unit(coord))
49515+ return RETERR(-EINVAL);
49516+
49517+ ext = extent_by_coord(coord);
49518+
49519+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
49520+ /* FIXME: bad things may happen if it is unallocated extent */
49521+ *result = 0;
49522+ else {
49523+ reiser4_key key;
49524+
49525+ unit_key_by_coord(coord, &key);
49526+ assert("vs-1645",
49527+ block >= get_key_offset(&key) >> current_blocksize_bits);
49528+ assert("vs-1646",
49529+ block <
49530+ (get_key_offset(&key) >> current_blocksize_bits) +
49531+ extent_get_width(ext));
49532+ *result =
49533+ extent_get_start(ext) + (block -
49534+ (get_key_offset(&key) >>
49535+ current_blocksize_bits));
49536+ }
49537+ return 0;
49538+}
49539+
49540+/*
49541+ plugin->u.item.s.file.append_key
49542+ key of first byte which is the next to last byte by addressed by this extent
49543+*/
49544+reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
49545+{
49546+ item_key_by_coord(coord, key);
49547+ set_key_offset(key,
49548+ get_key_offset(key) + reiser4_extent_size(coord,
49549+ nr_units_extent
49550+ (coord)));
49551+
49552+ assert("vs-610", get_key_offset(key)
49553+ && (get_key_offset(key) & (current_blocksize - 1)) == 0);
49554+ return key;
49555+}
49556+
49557+/* plugin->u.item.s.file.init_coord_extension */
49558+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
49559+{
49560+ coord_t *coord;
49561+ extent_coord_extension_t *ext_coord;
49562+ reiser4_key key;
49563+ loff_t offset;
49564+
49565+ assert("vs-1295", uf_coord->valid == 0);
49566+
49567+ coord = &uf_coord->coord;
49568+ assert("vs-1288", coord_is_iplug_set(coord));
49569+ assert("vs-1327", znode_is_loaded(coord->node));
49570+
49571+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
49572+ return;
49573+
49574+ ext_coord = &uf_coord->extension.extent;
49575+ ext_coord->nr_units = nr_units_extent(coord);
49576+ ext_coord->ext_offset =
49577+ (char *)extent_by_coord(coord) - zdata(coord->node);
49578+ ext_coord->width = extent_get_width(extent_by_coord(coord));
49579+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
49580+ uf_coord->valid = 1;
49581+
49582+ /* pos_in_unit is the only uninitialized field in extended coord */
49583+ if (coord->between == AFTER_UNIT) {
49584+ assert("vs-1330",
49585+ coord->unit_pos == nr_units_extent(coord) - 1);
49586+
49587+ ext_coord->pos_in_unit = ext_coord->width - 1;
49588+ } else {
49589+ /* AT_UNIT */
49590+ unit_key_by_coord(coord, &key);
49591+ offset = get_key_offset(&key);
49592+
49593+ assert("vs-1328", offset <= lookuped);
49594+ assert("vs-1329",
49595+ lookuped <
49596+ offset + ext_coord->width * current_blocksize);
49597+ ext_coord->pos_in_unit =
49598+ ((lookuped - offset) >> current_blocksize_bits);
49599+ }
49600+}
49601+
49602+/*
49603+ * Local variables:
49604+ * c-indentation-style: "K&R"
49605+ * mode-name: "LC"
49606+ * c-basic-offset: 8
49607+ * tab-width: 8
49608+ * fill-column: 79
49609+ * scroll-step: 1
49610+ * End:
49611+ */
49612diff --git a/fs/reiser4/plugin/item/extent_flush_ops.c b/fs/reiser4/plugin/item/extent_flush_ops.c
49613new file mode 100644
49614index 0000000..02dda3e
49615--- /dev/null
49616+++ b/fs/reiser4/plugin/item/extent_flush_ops.c
49617@@ -0,0 +1,1028 @@
49618+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49619+
49620+#include "item.h"
49621+#include "../../tree.h"
49622+#include "../../jnode.h"
49623+#include "../../super.h"
49624+#include "../../flush.h"
49625+#include "../../carry.h"
49626+#include "../object.h"
49627+
49628+#include <linux/pagemap.h>
49629+
49630+static reiser4_block_nr extent_unit_start(const coord_t * item);
49631+
49632+/* Return either first or last extent (depending on @side) of the item
49633+ @coord is set to. Set @pos_in_unit either to first or to last block
49634+ of extent. */
49635+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
49636+ reiser4_block_nr * pos_in_unit)
49637+{
49638+ reiser4_extent *ext;
49639+
49640+ if (side == LEFT_SIDE) {
49641+ /* get first extent of item */
49642+ ext = extent_item(coord);
49643+ *pos_in_unit = 0;
49644+ } else {
49645+ /* get last extent of item and last position within it */
49646+ assert("vs-363", side == RIGHT_SIDE);
49647+ ext = extent_item(coord) + coord_last_unit_pos(coord);
49648+ *pos_in_unit = extent_get_width(ext) - 1;
49649+ }
49650+
49651+ return ext;
49652+}
49653+
49654+/* item_plugin->f.utmost_child */
49655+/* Return the child. Coord is set to extent item. Find jnode corresponding
49656+ either to first or to last unformatted node pointed by the item */
49657+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
49658+{
49659+ reiser4_extent *ext;
49660+ reiser4_block_nr pos_in_unit;
49661+
49662+ ext = extent_utmost_ext(coord, side, &pos_in_unit);
49663+
49664+ switch (state_of_extent(ext)) {
49665+ case HOLE_EXTENT:
49666+ *childp = NULL;
49667+ return 0;
49668+ case ALLOCATED_EXTENT:
49669+ case UNALLOCATED_EXTENT:
49670+ break;
49671+ default:
49672+ /* this should never happen */
49673+ assert("vs-1417", 0);
49674+ }
49675+
49676+ {
49677+ reiser4_key key;
49678+ reiser4_tree *tree;
49679+ unsigned long index;
49680+
49681+ if (side == LEFT_SIDE) {
49682+ /* get key of first byte addressed by the extent */
49683+ item_key_by_coord(coord, &key);
49684+ } else {
49685+ /* get key of byte which next after last byte addressed by the extent */
49686+ append_key_extent(coord, &key);
49687+ }
49688+
49689+ assert("vs-544",
49690+ (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
49691+ /* index of first or last (depending on @side) page addressed
49692+ by the extent */
49693+ index =
49694+ (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
49695+ if (side == RIGHT_SIDE)
49696+ index--;
49697+
49698+ tree = coord->node->zjnode.tree;
49699+ *childp = jlookup(tree, get_key_objectid(&key), index);
49700+ }
49701+
49702+ return 0;
49703+}
49704+
49705+/* item_plugin->f.utmost_child_real_block */
49706+/* Return the child's block, if allocated. */
49707+int
49708+utmost_child_real_block_extent(const coord_t * coord, sideof side,
49709+ reiser4_block_nr * block)
49710+{
49711+ reiser4_extent *ext;
49712+
49713+ ext = extent_by_coord(coord);
49714+
49715+ switch (state_of_extent(ext)) {
49716+ case ALLOCATED_EXTENT:
49717+ *block = extent_get_start(ext);
49718+ if (side == RIGHT_SIDE)
49719+ *block += extent_get_width(ext) - 1;
49720+ break;
49721+ case HOLE_EXTENT:
49722+ case UNALLOCATED_EXTENT:
49723+ *block = 0;
49724+ break;
49725+ default:
49726+ /* this should never happen */
49727+ assert("vs-1418", 0);
49728+ }
49729+
49730+ return 0;
49731+}
49732+
49733+/* item_plugin->f.scan */
49734+/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
49735+ This scan continues, advancing the parent coordinate, until either it encounters a
49736+ formatted child or it finishes scanning this node.
49737+
49738+ If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
49739+ not sure this is last property (same atom) is enforced, but it should be the case since
49740+ one atom must write the parent and the others must read the parent, thus fusing?). In
49741+ any case, the code below asserts this case for unallocated extents. Unallocated
49742+ extents are thus optimized because we can skip to the endpoint when scanning.
49743+
49744+ It returns control to reiser4_scan_extent, handles these terminating conditions,
49745+ e.g., by loading the next twig.
49746+*/
49747+int reiser4_scan_extent(flush_scan * scan)
49748+{
49749+ coord_t coord;
49750+ jnode *neighbor;
49751+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
49752+ reiser4_block_nr unit_start;
49753+ __u64 oid;
49754+ reiser4_key key;
49755+ int ret = 0, allocated, incr;
49756+ reiser4_tree *tree;
49757+
49758+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
49759+ scan->stop = 1;
49760+ return 0; /* Race with truncate, this node is already
49761+ * truncated. */
49762+ }
49763+
49764+ coord_dup(&coord, &scan->parent_coord);
49765+
49766+ assert("jmacd-1404", !reiser4_scan_finished(scan));
49767+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
49768+ assert("jmacd-1406", jnode_is_unformatted(scan->node));
49769+
49770+ /* The scan_index variable corresponds to the current page index of the
49771+ unformatted block scan position. */
49772+ scan_index = index_jnode(scan->node);
49773+
49774+ assert("jmacd-7889", item_is_extent(&coord));
49775+
49776+ repeat:
49777+ /* objectid of file */
49778+ oid = get_key_objectid(item_key_by_coord(&coord, &key));
49779+
49780+ allocated = !extent_is_unallocated(&coord);
49781+ /* Get the values of this extent unit: */
49782+ unit_index = extent_unit_index(&coord);
49783+ unit_width = extent_unit_width(&coord);
49784+ unit_start = extent_unit_start(&coord);
49785+
49786+ assert("jmacd-7187", unit_width > 0);
49787+ assert("jmacd-7188", scan_index >= unit_index);
49788+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
49789+
49790+ /* Depending on the scan direction, we set different maximum values for scan_index
49791+ (scan_max) and the number of nodes that would be passed if the scan goes the
49792+ entire way (scan_dist). Incr is an integer reflecting the incremental
49793+ direction of scan_index. */
49794+ if (reiser4_scanning_left(scan)) {
49795+ scan_max = unit_index;
49796+ scan_dist = scan_index - unit_index;
49797+ incr = -1;
49798+ } else {
49799+ scan_max = unit_index + unit_width - 1;
49800+ scan_dist = scan_max - unit_index;
49801+ incr = +1;
49802+ }
49803+
49804+ tree = coord.node->zjnode.tree;
49805+
49806+ /* If the extent is allocated we have to check each of its blocks. If the extent
49807+ is unallocated we can skip to the scan_max. */
49808+ if (allocated) {
49809+ do {
49810+ neighbor = jlookup(tree, oid, scan_index);
49811+ if (neighbor == NULL)
49812+ goto stop_same_parent;
49813+
49814+ if (scan->node != neighbor
49815+ && !reiser4_scan_goto(scan, neighbor)) {
49816+ /* @neighbor was jput() by reiser4_scan_goto */
49817+ goto stop_same_parent;
49818+ }
49819+
49820+ ret = scan_set_current(scan, neighbor, 1, &coord);
49821+ if (ret != 0) {
49822+ goto exit;
49823+ }
49824+
49825+ /* reference to @neighbor is stored in @scan, no need
49826+ to jput(). */
49827+ scan_index += incr;
49828+
49829+ } while (incr + scan_max != scan_index);
49830+
49831+ } else {
49832+ /* Optimized case for unallocated extents, skip to the end. */
49833+ neighbor = jlookup(tree, oid, scan_max /*index */ );
49834+ if (neighbor == NULL) {
49835+ /* Race with truncate */
49836+ scan->stop = 1;
49837+ ret = 0;
49838+ goto exit;
49839+ }
49840+
49841+ assert("zam-1043",
49842+ reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
49843+
49844+ ret = scan_set_current(scan, neighbor, scan_dist, &coord);
49845+ if (ret != 0) {
49846+ goto exit;
49847+ }
49848+ }
49849+
49850+ if (coord_sideof_unit(&coord, scan->direction) == 0
49851+ && item_is_extent(&coord)) {
49852+ /* Continue as long as there are more extent units. */
49853+
49854+ scan_index =
49855+ extent_unit_index(&coord) +
49856+ (reiser4_scanning_left(scan) ?
49857+ extent_unit_width(&coord) - 1 : 0);
49858+ goto repeat;
49859+ }
49860+
49861+ if (0) {
49862+ stop_same_parent:
49863+
49864+ /* If we are scanning left and we stop in the middle of an allocated
49865+ extent, we know the preceder immediately.. */
49866+ /* middle of extent is (scan_index - unit_index) != 0. */
49867+ if (reiser4_scanning_left(scan) &&
49868+ (scan_index - unit_index) != 0) {
49869+ /* FIXME(B): Someone should step-through and verify that this preceder
49870+ calculation is indeed correct. */
49871+ /* @unit_start is starting block (number) of extent
49872+ unit. Flush stopped at the @scan_index block from
49873+ the beginning of the file, which is (scan_index -
49874+ unit_index) block within extent.
49875+ */
49876+ if (unit_start) {
49877+ /* skip preceder update when we are at hole */
49878+ scan->preceder_blk =
49879+ unit_start + scan_index - unit_index;
49880+ check_preceder(scan->preceder_blk);
49881+ }
49882+ }
49883+
49884+ /* In this case, we leave coord set to the parent of scan->node. */
49885+ scan->stop = 1;
49886+
49887+ } else {
49888+ /* In this case, we are still scanning, coord is set to the next item which is
49889+ either off-the-end of the node or not an extent. */
49890+ assert("jmacd-8912", scan->stop == 0);
49891+ assert("jmacd-7812",
49892+ (coord_is_after_sideof_unit(&coord, scan->direction)
49893+ || !item_is_extent(&coord)));
49894+ }
49895+
49896+ ret = 0;
49897+ exit:
49898+ return ret;
49899+}
49900+
49901+/* ask block allocator for some blocks */
49902+static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
49903+ reiser4_block_nr wanted_count,
49904+ reiser4_block_nr *first_allocated,
49905+ reiser4_block_nr *allocated,
49906+ block_stage_t block_stage)
49907+{
49908+ *allocated = wanted_count;
49909+ preceder->max_dist = 0; /* scan whole disk, if needed */
49910+
49911+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
49912+ preceder->block_stage = block_stage;
49913+
49914+ /* FIXME: we do not handle errors here now */
49915+ check_me("vs-420",
49916+ reiser4_alloc_blocks(preceder, first_allocated, allocated,
49917+ BA_PERMANENT) == 0);
49918+ /* update flush_pos's preceder to last allocated block number */
49919+ preceder->blk = *first_allocated + *allocated - 1;
49920+}
49921+
49922+/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent
49923+ will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have
49924+ to add new nodes into tree. Space for that is taken from inviolable reserve (5%). */
49925+static reiser4_block_nr reserve_replace(void)
49926+{
49927+ reiser4_block_nr grabbed, needed;
49928+
49929+ grabbed = get_current_context()->grabbed_blocks;
49930+ needed = estimate_one_insert_into_item(current_tree);
49931+ check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
49932+ return grabbed;
49933+}
49934+
49935+static void free_replace_reserved(reiser4_block_nr grabbed)
49936+{
49937+ reiser4_context *ctx;
49938+
49939+ ctx = get_current_context();
49940+ grabbed2free(ctx, get_super_private(ctx->super),
49941+ ctx->grabbed_blocks - grabbed);
49942+}
49943+
49944+/* Block offset of first block addressed by unit */
49945+__u64 extent_unit_index(const coord_t * item)
49946+{
49947+ reiser4_key key;
49948+
49949+ assert("vs-648", coord_is_existing_unit(item));
49950+ unit_key_by_coord(item, &key);
49951+ return get_key_offset(&key) >> current_blocksize_bits;
49952+}
49953+
49954+/* AUDIT shouldn't return value be of reiser4_block_nr type?
49955+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
49956+__u64 extent_unit_width(const coord_t * item)
49957+{
49958+ assert("vs-649", coord_is_existing_unit(item));
49959+ return width_by_coord(item);
49960+}
49961+
49962+/* Starting block location of this unit */
49963+static reiser4_block_nr extent_unit_start(const coord_t * item)
49964+{
49965+ return extent_get_start(extent_by_coord(item));
49966+}
49967+
49968+/**
49969+ * split_allocated_extent -
49970+ * @coord:
49971+ * @pos_in_unit:
49972+ *
49973+ * replace allocated extent with two allocated extents
49974+ */
49975+static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
49976+{
49977+ int result;
49978+ struct replace_handle *h;
49979+ reiser4_extent *ext;
49980+ reiser4_block_nr grabbed;
49981+
49982+ ext = extent_by_coord(coord);
49983+ assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
49984+ assert("vs-1411", extent_get_width(ext) > pos_in_unit);
49985+
49986+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49987+ if (h == NULL)
49988+ return RETERR(-ENOMEM);
49989+ h->coord = coord;
49990+ h->lh = znode_lh(coord->node);
49991+ h->pkey = &h->key;
49992+ unit_key_by_coord(coord, h->pkey);
49993+ set_key_offset(h->pkey,
49994+ (get_key_offset(h->pkey) +
49995+ pos_in_unit * current_blocksize));
49996+ reiser4_set_extent(&h->overwrite, extent_get_start(ext),
49997+ pos_in_unit);
49998+ reiser4_set_extent(&h->new_extents[0],
49999+ extent_get_start(ext) + pos_in_unit,
50000+ extent_get_width(ext) - pos_in_unit);
50001+ h->nr_new_extents = 1;
50002+ h->flags = COPI_DONT_SHIFT_LEFT;
50003+ h->paste_key = h->key;
50004+
50005+ /* reserve space for extent unit paste, @grabbed is reserved before */
50006+ grabbed = reserve_replace();
50007+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
50008+ extent */);
50009+ /* restore reserved */
50010+ free_replace_reserved(grabbed);
50011+ kfree(h);
50012+ return result;
50013+}
50014+
50015+/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
50016+ one). Return 1 if it succeeded, 0 - otherwise */
50017+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
50018+ reiser4_extent *replace)
50019+{
50020+ assert("vs-1415", extent_by_coord(coord) == ext);
50021+
50022+ if (coord->unit_pos == 0
50023+ || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
50024+ /* @ext either does not exist or is not allocated extent */
50025+ return 0;
50026+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
50027+ extent_get_start(replace))
50028+ return 0;
50029+
50030+ /* we can glue, widen previous unit */
50031+ extent_set_width(ext - 1,
50032+ extent_get_width(ext - 1) + extent_get_width(replace));
50033+
50034+ if (extent_get_width(ext) != extent_get_width(replace)) {
50035+ /* make current extent narrower */
50036+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
50037+ extent_set_start(ext,
50038+ extent_get_start(ext) +
50039+ extent_get_width(replace));
50040+ extent_set_width(ext,
50041+ extent_get_width(ext) -
50042+ extent_get_width(replace));
50043+ } else {
50044+ /* current extent completely glued with its left neighbor, remove it */
50045+ coord_t from, to;
50046+
50047+ coord_dup(&from, coord);
50048+ from.unit_pos = nr_units_extent(coord) - 1;
50049+ coord_dup(&to, &from);
50050+
50051+ /* currently cut from extent can cut either from the beginning or from the end. Move place which got
50052+ freed after unit removal to end of item */
50053+ memmove(ext, ext + 1,
50054+ (from.unit_pos -
50055+ coord->unit_pos) * sizeof(reiser4_extent));
50056+ /* wipe part of item which is going to be cut, so that node_check will not be confused */
50057+ cut_node_content(&from, &to, NULL, NULL, NULL);
50058+ }
50059+ znode_make_dirty(coord->node);
50060+ /* move coord back */
50061+ coord->unit_pos--;
50062+ return 1;
50063+}
50064+
50065+/**
50066+ * conv_extent - replace extent with 2 ones
50067+ * @coord: coordinate of extent to be replaced
50068+ * @replace: extent to overwrite the one @coord is set to
50069+ *
50070+ * Overwrites extent @coord is set to and paste one extent unit after
50071+ * overwritten one if @replace is shorter than initial extent
50072+ */
50073+static int conv_extent(coord_t *coord, reiser4_extent *replace)
50074+{
50075+ int result;
50076+ struct replace_handle *h;
50077+ reiser4_extent *ext;
50078+ reiser4_block_nr start, width, new_width;
50079+ reiser4_block_nr grabbed;
50080+ extent_state state;
50081+
50082+ ext = extent_by_coord(coord);
50083+ state = state_of_extent(ext);
50084+ start = extent_get_start(ext);
50085+ width = extent_get_width(ext);
50086+ new_width = extent_get_width(replace);
50087+
50088+ assert("vs-1458", (state == UNALLOCATED_EXTENT ||
50089+ state == ALLOCATED_EXTENT));
50090+ assert("vs-1459", width >= new_width);
50091+
50092+ if (try_to_merge_with_left(coord, ext, replace)) {
50093+ /* merged @replace with left neighbor. Current unit is either
50094+ removed or narrowed */
50095+ return 0;
50096+ }
50097+
50098+ if (width == new_width) {
50099+ /* replace current extent with @replace */
50100+ *ext = *replace;
50101+ znode_make_dirty(coord->node);
50102+ return 0;
50103+ }
50104+
50105+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
50106+ if (h == NULL)
50107+ return RETERR(-ENOMEM);
50108+ h->coord = coord;
50109+ h->lh = znode_lh(coord->node);
50110+ h->pkey = &h->key;
50111+ unit_key_by_coord(coord, h->pkey);
50112+ set_key_offset(h->pkey,
50113+ (get_key_offset(h->pkey) + new_width * current_blocksize));
50114+ h->overwrite = *replace;
50115+
50116+ /* replace @ext with @replace and padding extent */
50117+ reiser4_set_extent(&h->new_extents[0],
50118+ (state == ALLOCATED_EXTENT) ?
50119+ (start + new_width) :
50120+ UNALLOCATED_EXTENT_START,
50121+ width - new_width);
50122+ h->nr_new_extents = 1;
50123+ h->flags = COPI_DONT_SHIFT_LEFT;
50124+ h->paste_key = h->key;
50125+
50126+ /* reserve space for extent unit paste, @grabbed is reserved before */
50127+ grabbed = reserve_replace();
50128+ result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
50129+ extent */);
50130+
50131+ /* restore reserved */
50132+ free_replace_reserved(grabbed);
50133+ kfree(h);
50134+ return result;
50135+}
50136+
50137+/**
50138+ * assign_real_blocknrs
50139+ * @flush_pos:
50140+ * @oid: objectid of file jnodes to assign block number to belongs to
50141+ * @index: first jnode on the range
50142+ * @count: number of jnodes to assign block numbers to
50143+ * @first: start of allocated block range
50144+ *
50145+ * Assigns block numbers to each of @count jnodes. Index of first jnode is
50146+ * @index. Jnodes get lookuped with jlookup.
50147+ */
50148+static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
50149+ unsigned long index, reiser4_block_nr count,
50150+ reiser4_block_nr first)
50151+{
50152+ unsigned long i;
50153+ reiser4_tree *tree;
50154+ txn_atom *atom;
50155+ int nr;
50156+
50157+ atom = atom_locked_by_fq(flush_pos->fq);
50158+ assert("vs-1468", atom);
50159+ BUG_ON(atom == NULL);
50160+
50161+ nr = 0;
50162+ tree = current_tree;
50163+ for (i = 0; i < count; ++i, ++index) {
50164+ jnode *node;
50165+
50166+ node = jlookup(tree, oid, index);
50167+ assert("", node != NULL);
50168+ BUG_ON(node == NULL);
50169+
50170+ spin_lock_jnode(node);
50171+ assert("", !jnode_is_flushprepped(node));
50172+ assert("vs-1475", node->atom == atom);
50173+ assert("vs-1476", atomic_read(&node->x_count) > 0);
50174+
50175+ JF_CLR(node, JNODE_FLUSH_RESERVED);
50176+ jnode_set_block(node, &first);
50177+ unformatted_make_reloc(node, flush_pos->fq);
50178+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
50179+ FQ_LIST, 0));
50180+ spin_unlock_jnode(node);
50181+ first++;
50182+
50183+ atomic_dec(&node->x_count);
50184+ nr ++;
50185+ }
50186+
50187+ spin_unlock_atom(atom);
50188+ return;
50189+}
50190+
50191+/**
50192+ * make_node_ovrwr - assign node to overwrite set
50193+ * @jnodes: overwrite set list head
50194+ * @node: jnode to belong to overwrite set
50195+ *
50196+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
50197+ * which is an accumulator for nodes before they get to overwrite set list of
50198+ * atom.
50199+ */
50200+static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
50201+{
50202+ spin_lock_jnode(node);
50203+
50204+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
50205+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
50206+
50207+ JF_SET(node, JNODE_OVRWR);
50208+ list_move_tail(&node->capture_link, jnodes);
50209+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
50210+
50211+ spin_unlock_jnode(node);
50212+}
50213+
50214+/**
50215+ * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
50216+ * @flush_pos: flush position
50217+ * @oid: objectid of file jnodes belong to
50218+ * @index: starting index
50219+ * @width: extent width
50220+ *
50221+ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
50222+ * overwrite set. Starting from the one with index @index. If end of slum is
50223+ * detected (node is not found or flushprepped) - stop iterating and set flush
50224+ * position's state to POS_INVALID.
50225+ */
50226+static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
50227+ unsigned long index, reiser4_block_nr width)
50228+{
50229+ unsigned long i;
50230+ reiser4_tree *tree;
50231+ jnode *node;
50232+ txn_atom *atom;
50233+ LIST_HEAD(jnodes);
50234+
50235+ tree = current_tree;
50236+
50237+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
50238+ assert("vs-1478", atom);
50239+
50240+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
50241+ node = jlookup(tree, oid, index);
50242+ if (!node) {
50243+ flush_pos->state = POS_INVALID;
50244+ break;
50245+ }
50246+ if (jnode_check_flushprepped(node)) {
50247+ flush_pos->state = POS_INVALID;
50248+ atomic_dec(&node->x_count);
50249+ break;
50250+ }
50251+ if (node->atom != atom) {
50252+ flush_pos->state = POS_INVALID;
50253+ atomic_dec(&node->x_count);
50254+ break;
50255+ }
50256+ make_node_ovrwr(&jnodes, node);
50257+ atomic_dec(&node->x_count);
50258+ }
50259+
50260+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
50261+ spin_unlock_atom(atom);
50262+}
50263+
50264+/**
50265+ * allocated_extent_slum_size
50266+ * @flush_pos:
50267+ * @oid:
50268+ * @index:
50269+ * @count:
50270+ *
50271+ *
50272+ */
50273+static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
50274+ unsigned long index, unsigned long count)
50275+{
50276+ unsigned long i;
50277+ reiser4_tree *tree;
50278+ txn_atom *atom;
50279+ int nr;
50280+
50281+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
50282+ assert("vs-1468", atom);
50283+
50284+ nr = 0;
50285+ tree = current_tree;
50286+ for (i = 0; i < count; ++i, ++index) {
50287+ jnode *node;
50288+
50289+ node = jlookup(tree, oid, index);
50290+ if (!node)
50291+ break;
50292+
50293+ if (jnode_check_flushprepped(node)) {
50294+ atomic_dec(&node->x_count);
50295+ break;
50296+ }
50297+
50298+ if (node->atom != atom) {
50299+ /*
50300+ * this is possible on overwrite: extent_write may
50301+ * capture several unformatted nodes without capturing
50302+ * any formatted nodes.
50303+ */
50304+ atomic_dec(&node->x_count);
50305+ break;
50306+ }
50307+
50308+ assert("vs-1476", atomic_read(&node->x_count) > 1);
50309+ atomic_dec(&node->x_count);
50310+ nr ++;
50311+ }
50312+
50313+ spin_unlock_atom(atom);
50314+ return nr;
50315+}
50316+
50317+/**
50318+ * alloc_extent
50319+ * @flush_pos:
50320+ *
50321+ *
50322+ * this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord
50323+ * is set to. It is to prepare for flushing sequence of not flushprepped nodes
50324+ * (slum). It supposes that slum starts at flush_pos->pos_in_unit position
50325+ * within the extent. Slum gets to relocate set if flush_pos->leaf_relocate is
50326+ * set to 1 and to overwrite set otherwise
50327+ */
50328+int reiser4_alloc_extent(flush_pos_t *flush_pos)
50329+{
50330+ coord_t *coord;
50331+ reiser4_extent *ext;
50332+ reiser4_extent replace_ext;
50333+ oid_t oid;
50334+ reiser4_block_nr protected;
50335+ reiser4_block_nr start;
50336+ __u64 index;
50337+ __u64 width;
50338+ extent_state state;
50339+ int result;
50340+ reiser4_block_nr first_allocated;
50341+ __u64 allocated;
50342+ reiser4_key key;
50343+ block_stage_t block_stage;
50344+
50345+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
50346+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
50347+ && item_is_extent(&flush_pos->coord));
50348+
50349+ coord = &flush_pos->coord;
50350+
50351+ ext = extent_by_coord(coord);
50352+ state = state_of_extent(ext);
50353+ if (state == HOLE_EXTENT) {
50354+ flush_pos->state = POS_INVALID;
50355+ return 0;
50356+ }
50357+
50358+ item_key_by_coord(coord, &key);
50359+ oid = get_key_objectid(&key);
50360+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
50361+ start = extent_get_start(ext);
50362+ width = extent_get_width(ext);
50363+
50364+ assert("vs-1457", width > flush_pos->pos_in_unit);
50365+
50366+ if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
50367+ /* relocate */
50368+ if (flush_pos->pos_in_unit) {
50369+ /* split extent unit into two */
50370+ result =
50371+ split_allocated_extent(coord,
50372+ flush_pos->pos_in_unit);
50373+ flush_pos->pos_in_unit = 0;
50374+ return result;
50375+ }
50376+
50377+ /* limit number of nodes to allocate */
50378+ if (flush_pos->nr_to_write < width)
50379+ width = flush_pos->nr_to_write;
50380+
50381+ if (state == ALLOCATED_EXTENT) {
50382+ /*
50383+ * all protected nodes are not flushprepped, therefore
50384+ * they are counted as flush_reserved
50385+ */
50386+ block_stage = BLOCK_FLUSH_RESERVED;
50387+ protected = allocated_extent_slum_size(flush_pos, oid,
50388+ index, width);
50389+ if (protected == 0) {
50390+ flush_pos->state = POS_INVALID;
50391+ flush_pos->pos_in_unit = 0;
50392+ return 0;
50393+ }
50394+ } else {
50395+ block_stage = BLOCK_UNALLOCATED;
50396+ protected = width;
50397+ }
50398+
50399+ /*
50400+ * look at previous unit if possible. If it is allocated, make
50401+ * preceder more precise
50402+ */
50403+ if (coord->unit_pos &&
50404+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50405+ reiser4_pos_hint(flush_pos)->blk =
50406+ extent_get_start(ext - 1) +
50407+ extent_get_width(ext - 1);
50408+
50409+ /* allocate new block numbers for protected nodes */
50410+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
50411+ protected,
50412+ &first_allocated, &allocated,
50413+ block_stage);
50414+
50415+ if (state == ALLOCATED_EXTENT)
50416+ /*
50417+ * on relocating - free nodes which are going to be
50418+ * relocated
50419+ */
50420+ reiser4_dealloc_blocks(&start, &allocated,
50421+ BLOCK_ALLOCATED, BA_DEFER);
50422+
50423+ /* assign new block numbers to protected nodes */
50424+ assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
50425+
50426+ /* prepare extent which will replace current one */
50427+ reiser4_set_extent(&replace_ext, first_allocated, allocated);
50428+
50429+ /* adjust extent item */
50430+ result = conv_extent(coord, &replace_ext);
50431+ if (result != 0 && result != -ENOMEM) {
50432+ warning("vs-1461",
50433+ "Failed to allocate extent. Should not happen\n");
50434+ return result;
50435+ }
50436+
50437+ /*
50438+ * break flush: we prepared for flushing as many blocks as we
50439+ * were asked for
50440+ */
50441+ if (flush_pos->nr_to_write == allocated)
50442+ flush_pos->state = POS_INVALID;
50443+ } else {
50444+ /* overwrite */
50445+ mark_jnodes_overwrite(flush_pos, oid, index, width);
50446+ }
50447+ flush_pos->pos_in_unit = 0;
50448+ return 0;
50449+}
50450+
50451+/* if @key is glueable to the item @coord is set to */
50452+static int must_insert(const coord_t *coord, const reiser4_key *key)
50453+{
50454+ reiser4_key last;
50455+
50456+ if (item_id_by_coord(coord) == EXTENT_POINTER_ID
50457+ && keyeq(append_key_extent(coord, &last), key))
50458+ return 0;
50459+ return 1;
50460+}
50461+
50462+/* copy extent @copy to the end of @node. It may have to either insert new item after the last one, or append last item,
50463+ or modify last unit of last item to have greater width */
50464+static int put_unit_to_end(znode *node, const reiser4_key *key,
50465+ reiser4_extent *copy_ext)
50466+{
50467+ int result;
50468+ coord_t coord;
50469+ cop_insert_flag flags;
50470+ reiser4_extent *last_ext;
50471+ reiser4_item_data data;
50472+
50473+ /* set coord after last unit in an item */
50474+ coord_init_last_unit(&coord, node);
50475+ coord.between = AFTER_UNIT;
50476+
50477+ flags =
50478+ COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
50479+ if (must_insert(&coord, key)) {
50480+ result =
50481+ insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
50482+ key, NULL /*lh */ , flags);
50483+
50484+ } else {
50485+ /* try to glue with last unit */
50486+ last_ext = extent_by_coord(&coord);
50487+ if (state_of_extent(last_ext) &&
50488+ extent_get_start(last_ext) + extent_get_width(last_ext) ==
50489+ extent_get_start(copy_ext)) {
50490+ /* widen last unit of node */
50491+ extent_set_width(last_ext,
50492+ extent_get_width(last_ext) +
50493+ extent_get_width(copy_ext));
50494+ znode_make_dirty(node);
50495+ return 0;
50496+ }
50497+
50498+ /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
50499+ result =
50500+ insert_into_item(&coord, NULL /*lh */ , key,
50501+ init_new_extent(&data, copy_ext, 1),
50502+ flags);
50503+ }
50504+
50505+ assert("vs-438", result == 0 || result == -E_NODE_FULL);
50506+ return result;
50507+}
50508+
50509+/* @coord is set to extent unit */
50510+squeeze_result squalloc_extent(znode *left, const coord_t *coord,
50511+ flush_pos_t *flush_pos,
50512+ reiser4_key *stop_key)
50513+{
50514+ reiser4_extent *ext;
50515+ __u64 index;
50516+ __u64 width;
50517+ reiser4_block_nr start;
50518+ extent_state state;
50519+ oid_t oid;
50520+ reiser4_block_nr first_allocated;
50521+ __u64 allocated;
50522+ __u64 protected;
50523+ reiser4_extent copy_extent;
50524+ reiser4_key key;
50525+ int result;
50526+ block_stage_t block_stage;
50527+
50528+ assert("vs-1457", flush_pos->pos_in_unit == 0);
50529+ assert("vs-1467", coord_is_leftmost_unit(coord));
50530+ assert("vs-1467", item_is_extent(coord));
50531+
50532+ ext = extent_by_coord(coord);
50533+ index = extent_unit_index(coord);
50534+ start = extent_get_start(ext);
50535+ width = extent_get_width(ext);
50536+ state = state_of_extent(ext);
50537+ unit_key_by_coord(coord, &key);
50538+ oid = get_key_objectid(&key);
50539+
50540+ if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
50541+ (state == UNALLOCATED_EXTENT)) {
50542+ /* relocate */
50543+ if (state == ALLOCATED_EXTENT) {
50544+ /* all protected nodes are not flushprepped, therefore
50545+ * they are counted as flush_reserved */
50546+ block_stage = BLOCK_FLUSH_RESERVED;
50547+ protected = allocated_extent_slum_size(flush_pos, oid,
50548+ index, width);
50549+ if (protected == 0) {
50550+ flush_pos->state = POS_INVALID;
50551+ flush_pos->pos_in_unit = 0;
50552+ return 0;
50553+ }
50554+ } else {
50555+ block_stage = BLOCK_UNALLOCATED;
50556+ protected = width;
50557+ }
50558+
50559+ /*
50560+ * look at previous unit if possible. If it is allocated, make
50561+ * preceder more precise
50562+ */
50563+ if (coord->unit_pos &&
50564+ (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50565+ reiser4_pos_hint(flush_pos)->blk =
50566+ extent_get_start(ext - 1) +
50567+ extent_get_width(ext - 1);
50568+
50569+ /* allocate new block numbers for protected nodes */
50570+ extent_allocate_blocks(reiser4_pos_hint(flush_pos),
50571+ protected,
50572+ &first_allocated, &allocated,
50573+ block_stage);
50574+
50575+ /* prepare extent which will be copied to left */
50576+ reiser4_set_extent(&copy_extent, first_allocated, allocated);
50577+
50578+ result = put_unit_to_end(left, &key, &copy_extent);
50579+ if (result == -E_NODE_FULL) {
50580+ int target_block_stage;
50581+
50582+ /* free blocks which were just allocated */
50583+ target_block_stage =
50584+ (state ==
50585+ ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
50586+ BLOCK_UNALLOCATED;
50587+ reiser4_dealloc_blocks(&first_allocated, &allocated,
50588+ target_block_stage,
50589+ BA_PERMANENT);
50590+
50591+ /* rewind the preceder. */
50592+ flush_pos->preceder.blk = first_allocated;
50593+ check_preceder(flush_pos->preceder.blk);
50594+
50595+ return SQUEEZE_TARGET_FULL;
50596+ }
50597+
50598+ if (state == ALLOCATED_EXTENT) {
50599+ /* free nodes which were relocated */
50600+ reiser4_dealloc_blocks(&start, &allocated,
50601+ BLOCK_ALLOCATED, BA_DEFER);
50602+ }
50603+
50604+ /* assign new block numbers to protected nodes */
50605+ assign_real_blocknrs(flush_pos, oid, index, allocated,
50606+ first_allocated);
50607+
50608+ set_key_offset(&key,
50609+ get_key_offset(&key) +
50610+ (allocated << current_blocksize_bits));
50611+ } else {
50612+ /*
50613+ * overwrite: try to copy unit as it is to left neighbor and
50614+ * make all first not flushprepped nodes overwrite nodes
50615+ */
50616+ reiser4_set_extent(&copy_extent, start, width);
50617+ result = put_unit_to_end(left, &key, &copy_extent);
50618+ if (result == -E_NODE_FULL)
50619+ return SQUEEZE_TARGET_FULL;
50620+
50621+ if (state != HOLE_EXTENT)
50622+ mark_jnodes_overwrite(flush_pos, oid, index, width);
50623+ set_key_offset(&key,
50624+ get_key_offset(&key) +
50625+ (width << current_blocksize_bits));
50626+ }
50627+ *stop_key = key;
50628+ return SQUEEZE_CONTINUE;
50629+}
50630+
50631+int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
50632+{
50633+ return key_by_inode_and_offset_common(inode, off, key);
50634+}
50635+
50636+/*
50637+ * Local variables:
50638+ * c-indentation-style: "K&R"
50639+ * mode-name: "LC"
50640+ * c-basic-offset: 8
50641+ * tab-width: 8
50642+ * fill-column: 79
50643+ * scroll-step: 1
50644+ * End:
50645+ */
50646diff --git a/fs/reiser4/plugin/item/extent_item_ops.c b/fs/reiser4/plugin/item/extent_item_ops.c
50647new file mode 100644
50648index 0000000..53ba8e7
50649--- /dev/null
50650+++ b/fs/reiser4/plugin/item/extent_item_ops.c
50651@@ -0,0 +1,889 @@
50652+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50653+
50654+#include "item.h"
50655+#include "../../inode.h"
50656+#include "../../tree_walk.h" /* check_sibling_list() */
50657+#include "../../page_cache.h"
50658+#include "../../carry.h"
50659+
50660+#include <linux/quotaops.h>
50661+
50662+/* item_plugin->b.max_key_inside */
50663+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
50664+{
50665+ item_key_by_coord(coord, key);
50666+ set_key_offset(key, get_key_offset(reiser4_max_key()));
50667+ return key;
50668+}
50669+
50670+/* item_plugin->b.can_contain_key
50671+ this checks whether @key of @data is matching to position set by @coord */
50672+int
50673+can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
50674+ const reiser4_item_data * data)
50675+{
50676+ reiser4_key item_key;
50677+
50678+ if (item_plugin_by_coord(coord) != data->iplug)
50679+ return 0;
50680+
50681+ item_key_by_coord(coord, &item_key);
50682+ if (get_key_locality(key) != get_key_locality(&item_key) ||
50683+ get_key_objectid(key) != get_key_objectid(&item_key) ||
50684+ get_key_ordering(key) != get_key_ordering(&item_key))
50685+ return 0;
50686+
50687+ return 1;
50688+}
50689+
50690+/* item_plugin->b.mergeable
50691+ first item is of extent type */
50692+/* Audited by: green(2002.06.13) */
50693+int mergeable_extent(const coord_t * p1, const coord_t * p2)
50694+{
50695+ reiser4_key key1, key2;
50696+
50697+ assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
50698+ /* FIXME-VS: Which is it? Assert or return 0 */
50699+ if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
50700+ return 0;
50701+ }
50702+
50703+ item_key_by_coord(p1, &key1);
50704+ item_key_by_coord(p2, &key2);
50705+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
50706+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
50707+ get_key_ordering(&key1) != get_key_ordering(&key2) ||
50708+ get_key_type(&key1) != get_key_type(&key2))
50709+ return 0;
50710+ if (get_key_offset(&key1) +
50711+ reiser4_extent_size(p1, nr_units_extent(p1)) !=
50712+ get_key_offset(&key2))
50713+ return 0;
50714+ return 1;
50715+}
50716+
50717+/* item_plugin->b.nr_units */
50718+pos_in_node_t nr_units_extent(const coord_t * coord)
50719+{
50720+ /* length of extent item has to be multiple of extent size */
50721+ assert("vs-1424",
50722+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
50723+ return item_length_by_coord(coord) / sizeof(reiser4_extent);
50724+}
50725+
50726+/* item_plugin->b.lookup */
50727+lookup_result
50728+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
50729+ coord_t * coord)
50730+{ /* znode and item_pos are
50731+ set to an extent item to
50732+ look through */
50733+ reiser4_key item_key;
50734+ reiser4_block_nr lookuped, offset;
50735+ unsigned i, nr_units;
50736+ reiser4_extent *ext;
50737+ unsigned blocksize;
50738+ unsigned char blocksize_bits;
50739+
50740+ item_key_by_coord(coord, &item_key);
50741+ offset = get_key_offset(&item_key);
50742+
50743+ /* key we are looking for must be greater than key of item @coord */
50744+ assert("vs-414", keygt(key, &item_key));
50745+
50746+ assert("umka-99945",
50747+ !keygt(key, max_key_inside_extent(coord, &item_key)));
50748+
50749+ ext = extent_item(coord);
50750+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
50751+
50752+ blocksize = current_blocksize;
50753+ blocksize_bits = current_blocksize_bits;
50754+
50755+ /* offset we are looking for */
50756+ lookuped = get_key_offset(key);
50757+
50758+ nr_units = nr_units_extent(coord);
50759+ /* go through all extents until the one which address given offset */
50760+ for (i = 0; i < nr_units; i++, ext++) {
50761+ offset += (extent_get_width(ext) << blocksize_bits);
50762+ if (offset > lookuped) {
50763+ /* desired byte is somewhere in this extent */
50764+ coord->unit_pos = i;
50765+ coord->between = AT_UNIT;
50766+ return CBK_COORD_FOUND;
50767+ }
50768+ }
50769+
50770+ /* set coord after last unit */
50771+ coord->unit_pos = nr_units - 1;
50772+ coord->between = AFTER_UNIT;
50773+ return CBK_COORD_FOUND;
50774+}
50775+
50776+/* item_plugin->b.paste
50777+ item @coord is set to has been appended with @data->length of free
50778+ space. data->data contains data to be pasted into the item in position
50779+ @coord->in_item.unit_pos. It must fit into that free space.
50780+ @coord must be set between units.
50781+*/
50782+int
50783+paste_extent(coord_t * coord, reiser4_item_data * data,
50784+ carry_plugin_info * info UNUSED_ARG)
50785+{
50786+ unsigned old_nr_units;
50787+ reiser4_extent *ext;
50788+ int item_length;
50789+
50790+ ext = extent_item(coord);
50791+ item_length = item_length_by_coord(coord);
50792+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50793+
50794+ /* this is also used to copy extent into newly created item, so
50795+ old_nr_units could be 0 */
50796+ assert("vs-260", item_length >= data->length);
50797+
50798+ /* make sure that coord is set properly */
50799+ assert("vs-35",
50800+ ((!coord_is_existing_unit(coord))
50801+ || (!old_nr_units && !coord->unit_pos)));
50802+
50803+ /* first unit to be moved */
50804+ switch (coord->between) {
50805+ case AFTER_UNIT:
50806+ coord->unit_pos++;
50807+ case BEFORE_UNIT:
50808+ coord->between = AT_UNIT;
50809+ break;
50810+ case AT_UNIT:
50811+ assert("vs-331", !old_nr_units && !coord->unit_pos);
50812+ break;
50813+ default:
50814+ impossible("vs-330", "coord is set improperly");
50815+ }
50816+
50817+ /* prepare space for new units */
50818+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50819+ ext + coord->unit_pos,
50820+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50821+
50822+ /* copy new data from kernel space */
50823+ assert("vs-556", data->user == 0);
50824+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50825+
50826+ /* after paste @coord is set to first of pasted units */
50827+ assert("vs-332", coord_is_existing_unit(coord));
50828+ assert("vs-333",
50829+ !memcmp(data->data, extent_by_coord(coord),
50830+ (unsigned)data->length));
50831+ return 0;
50832+}
50833+
50834+/* item_plugin->b.can_shift */
50835+int
50836+can_shift_extent(unsigned free_space, coord_t * source,
50837+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50838+ unsigned *size, unsigned want)
50839+{
50840+ *size = item_length_by_coord(source);
50841+ if (*size > free_space)
50842+ /* never split a unit of extent item */
50843+ *size = free_space - free_space % sizeof(reiser4_extent);
50844+
50845+ /* we can shift *size bytes, calculate how many do we want to shift */
50846+ if (*size > want * sizeof(reiser4_extent))
50847+ *size = want * sizeof(reiser4_extent);
50848+
50849+ if (*size % sizeof(reiser4_extent) != 0)
50850+ impossible("vs-119", "Wrong extent size: %i %zd", *size,
50851+ sizeof(reiser4_extent));
50852+ return *size / sizeof(reiser4_extent);
50853+
50854+}
50855+
50856+/* item_plugin->b.copy_units */
50857+void
50858+copy_units_extent(coord_t * target, coord_t * source,
50859+ unsigned from, unsigned count,
50860+ shift_direction where_is_free_space, unsigned free_space)
50861+{
50862+ char *from_ext, *to_ext;
50863+
50864+ assert("vs-217", free_space == count * sizeof(reiser4_extent));
50865+
50866+ from_ext = item_body_by_coord(source);
50867+ to_ext = item_body_by_coord(target);
50868+
50869+ if (where_is_free_space == SHIFT_LEFT) {
50870+ assert("vs-215", from == 0);
50871+
50872+ /* At this moment, item length was already updated in the item
50873+ header by shifting code, hence nr_units_extent() will
50874+ return "new" number of units---one we obtain after copying
50875+ units.
50876+ */
50877+ to_ext +=
50878+ (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50879+ } else {
50880+ reiser4_key key;
50881+ coord_t coord;
50882+
50883+ assert("vs-216",
50884+ from + count == coord_last_unit_pos(source) + 1);
50885+
50886+ from_ext += item_length_by_coord(source) - free_space;
50887+
50888+ /* new units are inserted before first unit in an item,
50889+ therefore, we have to update item key */
50890+ coord = *source;
50891+ coord.unit_pos = from;
50892+ unit_key_extent(&coord, &key);
50893+
50894+ node_plugin_by_node(target->node)->update_item_key(target, &key,
50895+ NULL /*info */);
50896+ }
50897+
50898+ memcpy(to_ext, from_ext, free_space);
50899+}
50900+
50901+/* item_plugin->b.create_hook
50902+ @arg is znode of leaf node for which we need to update right delimiting key */
50903+int create_hook_extent(const coord_t * coord, void *arg)
50904+{
50905+ coord_t *child_coord;
50906+ znode *node;
50907+ reiser4_key key;
50908+ reiser4_tree *tree;
50909+
50910+ if (!arg)
50911+ return 0;
50912+
50913+ child_coord = arg;
50914+ tree = znode_get_tree(coord->node);
50915+
50916+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50917+
50918+ write_lock_tree(tree);
50919+ write_lock_dk(tree);
50920+ /* find a node on the left level for which right delimiting key has to
50921+ be updated */
50922+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50923+ assert("vs-411", znode_is_left_connected(child_coord->node));
50924+ node = child_coord->node->left;
50925+ } else {
50926+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50927+ node = child_coord->node;
50928+ assert("nikita-3314", node != NULL);
50929+ }
50930+
50931+ if (node != NULL) {
50932+ znode_set_rd_key(node, item_key_by_coord(coord, &key));
50933+
50934+ assert("nikita-3282", check_sibling_list(node));
50935+ /* break sibling links */
50936+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50937+ ON_DEBUG(node->right->left_version =
50938+ atomic_inc_return(&delim_key_version);
50939+ node->right_version =
50940+ atomic_inc_return(&delim_key_version););
50941+
50942+ node->right->left = NULL;
50943+ node->right = NULL;
50944+ }
50945+ }
50946+ write_unlock_dk(tree);
50947+ write_unlock_tree(tree);
50948+ return 0;
50949+}
50950+
50951+#define ITEM_TAIL_KILLED 0
50952+#define ITEM_HEAD_KILLED 1
50953+#define ITEM_KILLED 2
50954+
50955+/* item_plugin->b.kill_hook
50956+ this is called when @count units starting from @from-th one are going to be removed
50957+ */
50958+int
50959+kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50960+ struct carry_kill_data *kdata)
50961+{
50962+ reiser4_extent *ext;
50963+ reiser4_block_nr start, length;
50964+ const reiser4_key *pfrom_key, *pto_key;
50965+ struct inode *inode;
50966+ reiser4_tree *tree;
50967+ pgoff_t from_off, to_off, offset, skip;
50968+ int retval;
50969+
50970+ /* these are located in memory kmalloc-ed by kill_node_content */
50971+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50972+ coord_t *dup, *next;
50973+
50974+ assert("zam-811", znode_is_write_locked(coord->node));
50975+ assert("nikita-3315", kdata != NULL);
50976+ assert("vs-34", kdata->buf != NULL);
50977+
50978+ /* map structures to kdata->buf */
50979+ min_item_key = (reiser4_key *) (kdata->buf);
50980+ max_item_key = min_item_key + 1;
50981+ from_key = max_item_key + 1;
50982+ to_key = from_key + 1;
50983+ key = to_key + 1;
50984+ dup = (coord_t *) (key + 1);
50985+ next = dup + 1;
50986+
50987+ item_key_by_coord(coord, min_item_key);
50988+ max_item_key_by_coord(coord, max_item_key);
50989+
50990+ if (kdata->params.from_key) {
50991+ pfrom_key = kdata->params.from_key;
50992+ pto_key = kdata->params.to_key;
50993+ } else {
50994+ assert("vs-1549", from == coord->unit_pos);
50995+ unit_key_by_coord(coord, from_key);
50996+ pfrom_key = from_key;
50997+
50998+ coord_dup(dup, coord);
50999+ dup->unit_pos = from + count - 1;
51000+ max_unit_key_by_coord(dup, to_key);
51001+ pto_key = to_key;
51002+ }
51003+
51004+ if (!keylt(pto_key, max_item_key)) {
51005+ if (!keygt(pfrom_key, min_item_key)) {
51006+ znode *left, *right;
51007+
51008+ /* item is to be removed completely */
51009+ assert("nikita-3316", kdata->left != NULL
51010+ && kdata->right != NULL);
51011+
51012+ left = kdata->left->node;
51013+ right = kdata->right->node;
51014+
51015+ tree = current_tree;
51016+ /* we have to do two things:
51017+ *
51018+ * 1. link left and right formatted neighbors of
51019+ * extent being removed, and
51020+ *
51021+ * 2. update their delimiting keys.
51022+ *
51023+ * atomicity of these operations is protected by
51024+ * taking dk-lock and tree-lock.
51025+ */
51026+ /* if neighbors of item being removed are znodes -
51027+ * link them */
51028+ write_lock_tree(tree);
51029+ write_lock_dk(tree);
51030+ link_left_and_right(left, right);
51031+ if (left) {
51032+ /* update right delimiting key of left
51033+ * neighbor of extent item */
51034+ /*coord_t next;
51035+ reiser4_key key; */
51036+
51037+ coord_dup(next, coord);
51038+
51039+ if (coord_next_item(next))
51040+ *key = *znode_get_rd_key(coord->node);
51041+ else
51042+ item_key_by_coord(next, key);
51043+ znode_set_rd_key(left, key);
51044+ }
51045+ write_unlock_dk(tree);
51046+ write_unlock_tree(tree);
51047+
51048+ from_off =
51049+ get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
51050+ to_off =
51051+ (get_key_offset(max_item_key) +
51052+ 1) >> PAGE_CACHE_SHIFT;
51053+ retval = ITEM_KILLED;
51054+ } else {
51055+ /* tail of item is to be removed */
51056+ from_off =
51057+ (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
51058+ 1) >> PAGE_CACHE_SHIFT;
51059+ to_off =
51060+ (get_key_offset(max_item_key) +
51061+ 1) >> PAGE_CACHE_SHIFT;
51062+ retval = ITEM_TAIL_KILLED;
51063+ }
51064+ } else {
51065+ /* head of item is to be removed */
51066+ assert("vs-1571", keyeq(pfrom_key, min_item_key));
51067+ assert("vs-1572",
51068+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
51069+ 0);
51070+ assert("vs-1573",
51071+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
51072+ 1)) == 0);
51073+
51074+ if (kdata->left->node) {
51075+ /* update right delimiting key of left neighbor of extent item */
51076+ /*reiser4_key key; */
51077+
51078+ *key = *pto_key;
51079+ set_key_offset(key, get_key_offset(pto_key) + 1);
51080+
51081+ write_lock_dk(current_tree);
51082+ znode_set_rd_key(kdata->left->node, key);
51083+ write_unlock_dk(current_tree);
51084+ }
51085+
51086+ from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
51087+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
51088+ retval = ITEM_HEAD_KILLED;
51089+ }
51090+
51091+ inode = kdata->inode;
51092+ assert("vs-1545", inode != NULL);
51093+ if (inode != NULL)
51094+ /* take care of pages and jnodes corresponding to part of item being killed */
51095+ reiser4_invalidate_pages(inode->i_mapping, from_off,
51096+ to_off - from_off,
51097+ kdata->params.truncate);
51098+
51099+ ext = extent_item(coord) + from;
51100+ offset =
51101+ (get_key_offset(min_item_key) +
51102+ reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
51103+
51104+ assert("vs-1551", from_off >= offset);
51105+ assert("vs-1552", from_off - offset <= extent_get_width(ext));
51106+ skip = from_off - offset;
51107+ offset = from_off;
51108+
51109+ while (offset < to_off) {
51110+ length = extent_get_width(ext) - skip;
51111+ if (state_of_extent(ext) == HOLE_EXTENT) {
51112+ skip = 0;
51113+ offset += length;
51114+ ext++;
51115+ continue;
51116+ }
51117+
51118+ if (offset + length > to_off) {
51119+ length = to_off - offset;
51120+ }
51121+
51122+ DQUOT_FREE_BLOCK_NODIRTY(inode, length);
51123+
51124+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51125+ /* some jnodes corresponding to this unallocated extent */
51126+ fake_allocated2free(length, 0 /* unformatted */ );
51127+
51128+ skip = 0;
51129+ offset += length;
51130+ ext++;
51131+ continue;
51132+ }
51133+
51134+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
51135+
51136+ if (length != 0) {
51137+ start = extent_get_start(ext) + skip;
51138+
51139+ /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
51140+ immediately */
51141+ reiser4_dealloc_blocks(&start, &length,
51142+ 0 /* not used */ ,
51143+ BA_DEFER
51144+ /* unformatted with defer */ );
51145+ }
51146+ skip = 0;
51147+ offset += length;
51148+ ext++;
51149+ }
51150+ return retval;
51151+}
51152+
51153+/* item_plugin->b.kill_units */
51154+int
51155+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51156+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
51157+ reiser4_key * new_first)
51158+{
51159+ reiser4_extent *ext;
51160+ reiser4_key item_key;
51161+ pos_in_node_t count;
51162+ reiser4_key from_key, to_key;
51163+ const reiser4_key *pfrom_key, *pto_key;
51164+ loff_t off;
51165+ int result;
51166+
51167+ assert("vs-1541",
51168+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
51169+ || (kdata->params.from_key != NULL
51170+ && kdata->params.to_key != NULL)));
51171+
51172+ if (kdata->params.from_key) {
51173+ pfrom_key = kdata->params.from_key;
51174+ pto_key = kdata->params.to_key;
51175+ } else {
51176+ coord_t dup;
51177+
51178+ /* calculate key range of kill */
51179+ assert("vs-1549", from == coord->unit_pos);
51180+ unit_key_by_coord(coord, &from_key);
51181+ pfrom_key = &from_key;
51182+
51183+ coord_dup(&dup, coord);
51184+ dup.unit_pos = to;
51185+ max_unit_key_by_coord(&dup, &to_key);
51186+ pto_key = &to_key;
51187+ }
51188+
51189+ item_key_by_coord(coord, &item_key);
51190+
51191+#if REISER4_DEBUG
51192+ {
51193+ reiser4_key max_item_key;
51194+
51195+ max_item_key_by_coord(coord, &max_item_key);
51196+
51197+ if (new_first) {
51198+ /* head of item is to be cut */
51199+ assert("vs-1542", keyeq(pfrom_key, &item_key));
51200+ assert("vs-1538", keylt(pto_key, &max_item_key));
51201+ } else {
51202+ /* tail of item is to be cut */
51203+ assert("vs-1540", keygt(pfrom_key, &item_key));
51204+ assert("vs-1543", !keylt(pto_key, &max_item_key));
51205+ }
51206+ }
51207+#endif
51208+
51209+ if (smallest_removed)
51210+ *smallest_removed = *pfrom_key;
51211+
51212+ if (new_first) {
51213+ /* item head is cut. Item key will change. This new key is calculated here */
51214+ assert("vs-1556",
51215+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
51216+ (PAGE_CACHE_SIZE - 1));
51217+ *new_first = *pto_key;
51218+ set_key_offset(new_first, get_key_offset(new_first) + 1);
51219+ }
51220+
51221+ count = to - from + 1;
51222+ result = kill_hook_extent(coord, from, count, kdata);
51223+ if (result == ITEM_TAIL_KILLED) {
51224+ assert("vs-1553",
51225+ get_key_offset(pfrom_key) >=
51226+ get_key_offset(&item_key) +
51227+ reiser4_extent_size(coord, from));
51228+ off =
51229+ get_key_offset(pfrom_key) -
51230+ (get_key_offset(&item_key) +
51231+ reiser4_extent_size(coord, from));
51232+ if (off) {
51233+ /* unit @from is to be cut partially. Its width decreases */
51234+ ext = extent_item(coord) + from;
51235+ extent_set_width(ext,
51236+ (off + PAGE_CACHE_SIZE -
51237+ 1) >> PAGE_CACHE_SHIFT);
51238+ count--;
51239+ }
51240+ } else {
51241+ __u64 max_to_offset;
51242+ __u64 rest;
51243+
51244+ assert("vs-1575", result == ITEM_HEAD_KILLED);
51245+ assert("", from == 0);
51246+ assert("",
51247+ ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
51248+ 1)) == 0);
51249+ assert("",
51250+ get_key_offset(pto_key) + 1 >
51251+ get_key_offset(&item_key) +
51252+ reiser4_extent_size(coord, to));
51253+ max_to_offset =
51254+ get_key_offset(&item_key) +
51255+ reiser4_extent_size(coord, to + 1) - 1;
51256+ assert("", get_key_offset(pto_key) <= max_to_offset);
51257+
51258+ rest =
51259+ (max_to_offset -
51260+ get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
51261+ if (rest) {
51262+ /* unit @to is to be cut partially */
51263+ ext = extent_item(coord) + to;
51264+
51265+ assert("", extent_get_width(ext) > rest);
51266+
51267+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
51268+ extent_set_start(ext,
51269+ extent_get_start(ext) +
51270+ (extent_get_width(ext) -
51271+ rest));
51272+
51273+ extent_set_width(ext, rest);
51274+ count--;
51275+ }
51276+ }
51277+ return count * sizeof(reiser4_extent);
51278+}
51279+
51280+/* item_plugin->b.cut_units
51281+ this is too similar to kill_units_extent */
51282+int
51283+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51284+ struct carry_cut_data *cdata, reiser4_key * smallest_removed,
51285+ reiser4_key * new_first)
51286+{
51287+ reiser4_extent *ext;
51288+ reiser4_key item_key;
51289+ pos_in_node_t count;
51290+ reiser4_key from_key, to_key;
51291+ const reiser4_key *pfrom_key, *pto_key;
51292+ loff_t off;
51293+
51294+ assert("vs-1541",
51295+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
51296+ || (cdata->params.from_key != NULL
51297+ && cdata->params.to_key != NULL)));
51298+
51299+ if (cdata->params.from_key) {
51300+ pfrom_key = cdata->params.from_key;
51301+ pto_key = cdata->params.to_key;
51302+ } else {
51303+ coord_t dup;
51304+
51305+ /* calculate key range of kill */
51306+ coord_dup(&dup, coord);
51307+ dup.unit_pos = from;
51308+ unit_key_by_coord(&dup, &from_key);
51309+
51310+ dup.unit_pos = to;
51311+ max_unit_key_by_coord(&dup, &to_key);
51312+
51313+ pfrom_key = &from_key;
51314+ pto_key = &to_key;
51315+ }
51316+
51317+ assert("vs-1555",
51318+ (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
51319+ assert("vs-1556",
51320+ (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
51321+ (PAGE_CACHE_SIZE - 1));
51322+
51323+ item_key_by_coord(coord, &item_key);
51324+
51325+#if REISER4_DEBUG
51326+ {
51327+ reiser4_key max_item_key;
51328+
51329+ assert("vs-1584",
51330+ get_key_locality(pfrom_key) ==
51331+ get_key_locality(&item_key));
51332+ assert("vs-1585",
51333+ get_key_type(pfrom_key) == get_key_type(&item_key));
51334+ assert("vs-1586",
51335+ get_key_objectid(pfrom_key) ==
51336+ get_key_objectid(&item_key));
51337+ assert("vs-1587",
51338+ get_key_ordering(pfrom_key) ==
51339+ get_key_ordering(&item_key));
51340+
51341+ max_item_key_by_coord(coord, &max_item_key);
51342+
51343+ if (new_first != NULL) {
51344+ /* head of item is to be cut */
51345+ assert("vs-1542", keyeq(pfrom_key, &item_key));
51346+ assert("vs-1538", keylt(pto_key, &max_item_key));
51347+ } else {
51348+ /* tail of item is to be cut */
51349+ assert("vs-1540", keygt(pfrom_key, &item_key));
51350+ assert("vs-1543", keyeq(pto_key, &max_item_key));
51351+ }
51352+ }
51353+#endif
51354+
51355+ if (smallest_removed)
51356+ *smallest_removed = *pfrom_key;
51357+
51358+ if (new_first) {
51359+ /* item head is cut. Item key will change. This new key is calculated here */
51360+ *new_first = *pto_key;
51361+ set_key_offset(new_first, get_key_offset(new_first) + 1);
51362+ }
51363+
51364+ count = to - from + 1;
51365+
51366+ assert("vs-1553",
51367+ get_key_offset(pfrom_key) >=
51368+ get_key_offset(&item_key) + reiser4_extent_size(coord, from));
51369+ off =
51370+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
51371+ reiser4_extent_size(coord, from));
51372+ if (off) {
51373+ /* tail of unit @from is to be cut partially. Its width decreases */
51374+ assert("vs-1582", new_first == NULL);
51375+ ext = extent_item(coord) + from;
51376+ extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
51377+ count--;
51378+ }
51379+
51380+ assert("vs-1554",
51381+ get_key_offset(pto_key) <=
51382+ get_key_offset(&item_key) +
51383+ reiser4_extent_size(coord, to + 1) - 1);
51384+ off =
51385+ (get_key_offset(&item_key) +
51386+ reiser4_extent_size(coord, to + 1) - 1) -
51387+ get_key_offset(pto_key);
51388+ if (off) {
51389+ /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
51390+ and width decreased. */
51391+ assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
51392+ ext = extent_item(coord) + to;
51393+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
51394+ extent_set_start(ext,
51395+ extent_get_start(ext) +
51396+ (extent_get_width(ext) -
51397+ (off >> PAGE_CACHE_SHIFT)));
51398+
51399+ extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
51400+ count--;
51401+ }
51402+ return count * sizeof(reiser4_extent);
51403+}
51404+
51405+/* item_plugin->b.unit_key */
51406+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
51407+{
51408+ assert("vs-300", coord_is_existing_unit(coord));
51409+
51410+ item_key_by_coord(coord, key);
51411+ set_key_offset(key,
51412+ (get_key_offset(key) +
51413+ reiser4_extent_size(coord, coord->unit_pos)));
51414+
51415+ return key;
51416+}
51417+
51418+/* item_plugin->b.max_unit_key */
51419+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
51420+{
51421+ assert("vs-300", coord_is_existing_unit(coord));
51422+
51423+ item_key_by_coord(coord, key);
51424+ set_key_offset(key,
51425+ (get_key_offset(key) +
51426+ reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
51427+ return key;
51428+}
51429+
51430+/* item_plugin->b.estimate
51431+ item_plugin->b.item_data_by_flow */
51432+
51433+#if REISER4_DEBUG
51434+
51435+/* item_plugin->b.check
51436+ used for debugging, every item should have here the most complete
51437+ possible check of the consistency of the item that the inventor can
51438+ construct
51439+*/
51440+int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
51441+ const char **error /* where to store error message */)
51442+{
51443+ reiser4_extent *ext, *first;
51444+ unsigned i, j;
51445+ reiser4_block_nr start, width, blk_cnt;
51446+ unsigned num_units;
51447+ reiser4_tree *tree;
51448+ oid_t oid;
51449+ reiser4_key key;
51450+ coord_t scan;
51451+
51452+ assert("vs-933", REISER4_DEBUG);
51453+
51454+ if (znode_get_level(coord->node) != TWIG_LEVEL) {
51455+ *error = "Extent on the wrong level";
51456+ return -1;
51457+ }
51458+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
51459+ *error = "Wrong item size";
51460+ return -1;
51461+ }
51462+ ext = first = extent_item(coord);
51463+ blk_cnt = reiser4_block_count(reiser4_get_current_sb());
51464+ num_units = coord_num_units(coord);
51465+ tree = znode_get_tree(coord->node);
51466+ item_key_by_coord(coord, &key);
51467+ oid = get_key_objectid(&key);
51468+ coord_dup(&scan, coord);
51469+
51470+ for (i = 0; i < num_units; ++i, ++ext) {
51471+ __u64 index;
51472+
51473+ scan.unit_pos = i;
51474+ index = extent_unit_index(&scan);
51475+
51476+#if 0
51477+ /* check that all jnodes are present for the unallocated
51478+ * extent */
51479+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51480+ for (j = 0; j < extent_get_width(ext); j++) {
51481+ jnode *node;
51482+
51483+ node = jlookup(tree, oid, index + j);
51484+ if (node == NULL) {
51485+ print_coord("scan", &scan, 0);
51486+ *error = "Jnode missing";
51487+ return -1;
51488+ }
51489+ jput(node);
51490+ }
51491+ }
51492+#endif
51493+
51494+ start = extent_get_start(ext);
51495+ if (start < 2)
51496+ continue;
51497+ /* extent is allocated one */
51498+ width = extent_get_width(ext);
51499+ if (start >= blk_cnt) {
51500+ *error = "Start too large";
51501+ return -1;
51502+ }
51503+ if (start + width > blk_cnt) {
51504+ *error = "End too large";
51505+ return -1;
51506+ }
51507+ /* make sure that this extent does not overlap with other
51508+ allocated extents extents */
51509+ for (j = 0; j < i; j++) {
51510+ if (state_of_extent(first + j) != ALLOCATED_EXTENT)
51511+ continue;
51512+ if (!
51513+ ((extent_get_start(ext) >=
51514+ extent_get_start(first + j) +
51515+ extent_get_width(first + j))
51516+ || (extent_get_start(ext) +
51517+ extent_get_width(ext) <=
51518+ extent_get_start(first + j)))) {
51519+ *error = "Extent overlaps with others";
51520+ return -1;
51521+ }
51522+ }
51523+
51524+ }
51525+
51526+ return 0;
51527+}
51528+
51529+#endif /* REISER4_DEBUG */
51530+
51531+/*
51532+ Local variables:
51533+ c-indentation-style: "K&R"
51534+ mode-name: "LC"
51535+ c-basic-offset: 8
51536+ tab-width: 8
51537+ fill-column: 120
51538+ scroll-step: 1
51539+ End:
51540+*/
51541diff --git a/fs/reiser4/plugin/item/internal.c b/fs/reiser4/plugin/item/internal.c
51542new file mode 100644
51543index 0000000..eb79388
51544--- /dev/null
51545+++ b/fs/reiser4/plugin/item/internal.c
51546@@ -0,0 +1,396 @@
51547+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51548+
51549+/* Implementation of internal-item plugin methods. */
51550+
51551+#include "../../forward.h"
51552+#include "../../debug.h"
51553+#include "../../dformat.h"
51554+#include "../../key.h"
51555+#include "../../coord.h"
51556+#include "internal.h"
51557+#include "item.h"
51558+#include "../node/node.h"
51559+#include "../plugin.h"
51560+#include "../../jnode.h"
51561+#include "../../znode.h"
51562+#include "../../tree_walk.h"
51563+#include "../../tree_mod.h"
51564+#include "../../tree.h"
51565+#include "../../super.h"
51566+#include "../../block_alloc.h"
51567+
51568+/* see internal.h for explanation */
51569+
51570+/* plugin->u.item.b.mergeable */
51571+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
51572+ const coord_t * p2 UNUSED_ARG /* second item */ )
51573+{
51574+ /* internal items are not mergeable */
51575+ return 0;
51576+}
51577+
51578+/* ->lookup() method for internal items */
51579+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
51580+ lookup_bias bias UNUSED_ARG /* lookup bias */ ,
51581+ coord_t * coord /* coord of item */ )
51582+{
51583+ reiser4_key ukey;
51584+
51585+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
51586+ default:
51587+ impossible("", "keycmp()?!");
51588+ case LESS_THAN:
51589+ /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
51590+ item plugin can not be taken using coord set this way */
51591+ assert("vs-681", coord->unit_pos == 0);
51592+ coord->between = AFTER_UNIT;
51593+ case EQUAL_TO:
51594+ return CBK_COORD_FOUND;
51595+ case GREATER_THAN:
51596+ return CBK_COORD_NOTFOUND;
51597+ }
51598+}
51599+
51600+/* return body of internal item at @coord */
51601+static internal_item_layout *internal_at(const coord_t * coord /* coord of
51602+ * item */ )
51603+{
51604+ assert("nikita-607", coord != NULL);
51605+ assert("nikita-1650",
51606+ item_plugin_by_coord(coord) ==
51607+ item_plugin_by_id(NODE_POINTER_ID));
51608+ return (internal_item_layout *) item_body_by_coord(coord);
51609+}
51610+
51611+void reiser4_update_internal(const coord_t * coord,
51612+ const reiser4_block_nr * blocknr)
51613+{
51614+ internal_item_layout *item = internal_at(coord);
51615+ assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
51616+
51617+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
51618+}
51619+
51620+/* return child block number stored in the internal item at @coord */
51621+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
51622+{
51623+ assert("nikita-608", coord != NULL);
51624+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
51625+}
51626+
51627+/* get znode pointed to by internal @item */
51628+static znode *znode_at(const coord_t * item /* coord of item */ ,
51629+ znode * parent /* parent node */ )
51630+{
51631+ return child_znode(item, parent, 1, 0);
51632+}
51633+
51634+/* store pointer from internal item into "block". Implementation of
51635+ ->down_link() method */
51636+void down_link_internal(const coord_t * coord /* coord of item */ ,
51637+ const reiser4_key * key UNUSED_ARG /* key to get
51638+ * pointer for */ ,
51639+ reiser4_block_nr * block /* resulting block number */ )
51640+{
51641+ ON_DEBUG(reiser4_key item_key);
51642+
51643+ assert("nikita-609", coord != NULL);
51644+ assert("nikita-611", block != NULL);
51645+ assert("nikita-612", (key == NULL) ||
51646+ /* twig horrors */
51647+ (znode_get_level(coord->node) == TWIG_LEVEL)
51648+ || keyle(item_key_by_coord(coord, &item_key), key));
51649+
51650+ *block = pointer_at(coord);
51651+ assert("nikita-2960", reiser4_blocknr_is_sane(block));
51652+}
51653+
51654+/* Get the child's block number, or 0 if the block is unallocated. */
51655+int
51656+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
51657+ reiser4_block_nr * block)
51658+{
51659+ assert("jmacd-2059", coord != NULL);
51660+
51661+ *block = pointer_at(coord);
51662+ assert("nikita-2961", reiser4_blocknr_is_sane(block));
51663+
51664+ if (reiser4_blocknr_is_fake(block)) {
51665+ *block = 0;
51666+ }
51667+
51668+ return 0;
51669+}
51670+
51671+/* Return the child. */
51672+int
51673+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
51674+ jnode ** childp)
51675+{
51676+ reiser4_block_nr block = pointer_at(coord);
51677+ znode *child;
51678+
51679+ assert("jmacd-2059", childp != NULL);
51680+ assert("nikita-2962", reiser4_blocknr_is_sane(&block));
51681+
51682+ child = zlook(znode_get_tree(coord->node), &block);
51683+
51684+ if (IS_ERR(child)) {
51685+ return PTR_ERR(child);
51686+ }
51687+
51688+ *childp = ZJNODE(child);
51689+
51690+ return 0;
51691+}
51692+
51693+#if REISER4_DEBUG
51694+
51695+static void check_link(znode * left, znode * right)
51696+{
51697+ znode *scan;
51698+
51699+ for (scan = left; scan != right; scan = scan->right) {
51700+ if (ZF_ISSET(scan, JNODE_RIP))
51701+ break;
51702+ if (znode_is_right_connected(scan) && scan->right != NULL) {
51703+ if (ZF_ISSET(scan->right, JNODE_RIP))
51704+ break;
51705+ assert("nikita-3285",
51706+ znode_is_left_connected(scan->right));
51707+ assert("nikita-3265",
51708+ ergo(scan != left,
51709+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
51710+ assert("nikita-3284", scan->right->left == scan);
51711+ } else
51712+ break;
51713+ }
51714+}
51715+
51716+int check__internal(const coord_t * coord, const char **error)
51717+{
51718+ reiser4_block_nr blk;
51719+ znode *child;
51720+ coord_t cpy;
51721+
51722+ blk = pointer_at(coord);
51723+ if (!reiser4_blocknr_is_sane(&blk)) {
51724+ *error = "Invalid pointer";
51725+ return -1;
51726+ }
51727+ coord_dup(&cpy, coord);
51728+ child = znode_at(&cpy, cpy.node);
51729+ if (child != NULL) {
51730+ znode *left_child;
51731+ znode *right_child;
51732+
51733+ left_child = right_child = NULL;
51734+
51735+ assert("nikita-3256", znode_invariant(child));
51736+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
51737+ left_child = znode_at(&cpy, cpy.node);
51738+ if (left_child != NULL) {
51739+ read_lock_tree(znode_get_tree(child));
51740+ check_link(left_child, child);
51741+ read_unlock_tree(znode_get_tree(child));
51742+ zput(left_child);
51743+ }
51744+ }
51745+ coord_dup(&cpy, coord);
51746+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
51747+ right_child = znode_at(&cpy, cpy.node);
51748+ if (right_child != NULL) {
51749+ read_lock_tree(znode_get_tree(child));
51750+ check_link(child, right_child);
51751+ read_unlock_tree(znode_get_tree(child));
51752+ zput(right_child);
51753+ }
51754+ }
51755+ zput(child);
51756+ }
51757+ return 0;
51758+}
51759+
51760+#endif /* REISER4_DEBUG */
51761+
51762+/* return true only if this item really points to "block" */
51763+/* Audited by: green(2002.06.14) */
51764+int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
51765+ const reiser4_block_nr * block /* block number to
51766+ * check */ )
51767+{
51768+ assert("nikita-613", coord != NULL);
51769+ assert("nikita-614", block != NULL);
51770+
51771+ return pointer_at(coord) == *block;
51772+}
51773+
51774+/* hook called by ->create_item() method of node plugin after new internal
51775+ item was just created.
51776+
51777+ This is point where pointer to new node is inserted into tree. Initialize
51778+ parent pointer in child znode, insert child into sibling list and slum.
51779+
51780+*/
51781+int create_hook_internal(const coord_t * item /* coord of item */ ,
51782+ void *arg /* child's left neighbor, if any */ )
51783+{
51784+ znode *child;
51785+ __u64 child_ptr;
51786+
51787+ assert("nikita-1252", item != NULL);
51788+ assert("nikita-1253", item->node != NULL);
51789+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51790+ assert("nikita-1450", item->unit_pos == 0);
51791+
51792+ /*
51793+ * preparing to item insertion build_child_ptr_data sets pointer to
51794+ * data to be inserted to jnode's blocknr which is in cpu byte
51795+ * order. Node's create_item simply copied those data. As result we
51796+ * have child pointer in cpu's byte order. Convert content of internal
51797+ * item to little endian byte order.
51798+ */
51799+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51800+ reiser4_update_internal(item, &child_ptr);
51801+
51802+ child = znode_at(item, item->node);
51803+ if (child != NULL && !IS_ERR(child)) {
51804+ znode *left;
51805+ int result = 0;
51806+ reiser4_tree *tree;
51807+
51808+ left = arg;
51809+ tree = znode_get_tree(item->node);
51810+ write_lock_tree(tree);
51811+ write_lock_dk(tree);
51812+ assert("nikita-1400", (child->in_parent.node == NULL)
51813+ || (znode_above_root(child->in_parent.node)));
51814+ ++item->node->c_count;
51815+ coord_to_parent_coord(item, &child->in_parent);
51816+ sibling_list_insert_nolock(child, left);
51817+
51818+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51819+ ZF_CLR(child, JNODE_ORPHAN);
51820+
51821+ if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51822+ znode_get_rd_key(child))) {
51823+ znode_set_rd_key(child, znode_get_rd_key(left));
51824+ }
51825+ write_unlock_dk(tree);
51826+ write_unlock_tree(tree);
51827+ zput(child);
51828+ return result;
51829+ } else {
51830+ if (child == NULL)
51831+ child = ERR_PTR(-EIO);
51832+ return PTR_ERR(child);
51833+ }
51834+}
51835+
51836+/* hook called by ->cut_and_kill() method of node plugin just before internal
51837+ item is removed.
51838+
51839+ This is point where empty node is removed from the tree. Clear parent
51840+ pointer in child, and mark node for pending deletion.
51841+
51842+ Node will be actually deleted later and in several installations:
51843+
51844+ . when last lock on this node will be released, node will be removed from
51845+ the sibling list and its lock will be invalidated
51846+
51847+ . when last reference to this node will be dropped, bitmap will be updated
51848+ and node will be actually removed from the memory.
51849+
51850+*/
51851+int kill_hook_internal(const coord_t * item /* coord of item */ ,
51852+ pos_in_node_t from UNUSED_ARG /* start unit */ ,
51853+ pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51854+ struct carry_kill_data *p UNUSED_ARG)
51855+{
51856+ znode *child;
51857+
51858+ assert("nikita-1222", item != NULL);
51859+ assert("nikita-1224", from == 0);
51860+ assert("nikita-1225", count == 1);
51861+
51862+ child = znode_at(item, item->node);
51863+ if (IS_ERR(child))
51864+ return PTR_ERR(child);
51865+ else if (node_is_empty(child)) {
51866+ reiser4_tree *tree;
51867+
51868+ assert("nikita-1397", znode_is_write_locked(child));
51869+ assert("nikita-1398", child->c_count == 0);
51870+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51871+
51872+ tree = znode_get_tree(item->node);
51873+ write_lock_tree(tree);
51874+ init_parent_coord(&child->in_parent, NULL);
51875+ --item->node->c_count;
51876+ write_unlock_tree(tree);
51877+ zput(child);
51878+ return 0;
51879+ } else {
51880+ warning("nikita-1223",
51881+ "Cowardly refuse to remove link to non-empty node");
51882+ zput(child);
51883+ return RETERR(-EIO);
51884+ }
51885+}
51886+
51887+/* hook called by ->shift() node plugin method when iternal item was just
51888+ moved from one node to another.
51889+
51890+ Update parent pointer in child and c_counts in old and new parent
51891+
51892+*/
51893+int shift_hook_internal(const coord_t * item /* coord of item */ ,
51894+ unsigned from UNUSED_ARG /* start unit */ ,
51895+ unsigned count UNUSED_ARG /* stop unit */ ,
51896+ znode * old_node /* old parent */ )
51897+{
51898+ znode *child;
51899+ znode *new_node;
51900+ reiser4_tree *tree;
51901+
51902+ assert("nikita-1276", item != NULL);
51903+ assert("nikita-1277", from == 0);
51904+ assert("nikita-1278", count == 1);
51905+ assert("nikita-1451", item->unit_pos == 0);
51906+
51907+ new_node = item->node;
51908+ assert("nikita-2132", new_node != old_node);
51909+ tree = znode_get_tree(item->node);
51910+ child = child_znode(item, old_node, 1, 0);
51911+ if (child == NULL)
51912+ return 0;
51913+ if (!IS_ERR(child)) {
51914+ write_lock_tree(tree);
51915+ ++new_node->c_count;
51916+ assert("nikita-1395", znode_parent(child) == old_node);
51917+ assert("nikita-1396", old_node->c_count > 0);
51918+ coord_to_parent_coord(item, &child->in_parent);
51919+ assert("nikita-1781", znode_parent(child) == new_node);
51920+ assert("nikita-1782",
51921+ check_tree_pointer(item, child) == NS_FOUND);
51922+ --old_node->c_count;
51923+ write_unlock_tree(tree);
51924+ zput(child);
51925+ return 0;
51926+ } else
51927+ return PTR_ERR(child);
51928+}
51929+
51930+/* plugin->u.item.b.max_key_inside - not defined */
51931+
51932+/* plugin->u.item.b.nr_units - item.c:single_unit */
51933+
51934+/* Make Linus happy.
51935+ Local variables:
51936+ c-indentation-style: "K&R"
51937+ mode-name: "LC"
51938+ c-basic-offset: 8
51939+ tab-width: 8
51940+ fill-column: 120
51941+ End:
51942+*/
51943diff --git a/fs/reiser4/plugin/item/internal.h b/fs/reiser4/plugin/item/internal.h
51944new file mode 100644
51945index 0000000..27aa27d
51946--- /dev/null
51947+++ b/fs/reiser4/plugin/item/internal.h
51948@@ -0,0 +1,57 @@
51949+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51950+/* Internal item contains down-link to the child of the internal/twig
51951+ node in a tree. It is internal items that are actually used during
51952+ tree traversal. */
51953+
51954+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51955+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51956+
51957+#include "../../forward.h"
51958+#include "../../dformat.h"
51959+
51960+/* on-disk layout of internal item */
51961+typedef struct internal_item_layout {
51962+ /* 0 */ reiser4_dblock_nr pointer;
51963+ /* 4 */
51964+} internal_item_layout;
51965+
51966+struct cut_list;
51967+
51968+int mergeable_internal(const coord_t * p1, const coord_t * p2);
51969+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51970+ coord_t * coord);
51971+/* store pointer from internal item into "block". Implementation of
51972+ ->down_link() method */
51973+extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51974+ reiser4_block_nr * block);
51975+extern int has_pointer_to_internal(const coord_t * coord,
51976+ const reiser4_block_nr * block);
51977+extern int create_hook_internal(const coord_t * item, void *arg);
51978+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51979+ pos_in_node_t count, struct carry_kill_data *);
51980+extern int shift_hook_internal(const coord_t * item, unsigned from,
51981+ unsigned count, znode * old_node);
51982+extern void reiser4_print_internal(const char *prefix, coord_t * coord);
51983+
51984+extern int utmost_child_internal(const coord_t * coord, sideof side,
51985+ jnode ** child);
51986+int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51987+ reiser4_block_nr * block);
51988+
51989+extern void reiser4_update_internal(const coord_t * coord,
51990+ const reiser4_block_nr * blocknr);
51991+/* FIXME: reiserfs has check_internal */
51992+extern int check__internal(const coord_t * coord, const char **error);
51993+
51994+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51995+#endif
51996+
51997+/* Make Linus happy.
51998+ Local variables:
51999+ c-indentation-style: "K&R"
52000+ mode-name: "LC"
52001+ c-basic-offset: 8
52002+ tab-width: 8
52003+ fill-column: 120
52004+ End:
52005+*/
52006diff --git a/fs/reiser4/plugin/item/item.c b/fs/reiser4/plugin/item/item.c
52007new file mode 100644
52008index 0000000..e226f04
52009--- /dev/null
52010+++ b/fs/reiser4/plugin/item/item.c
52011@@ -0,0 +1,719 @@
52012+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52013+
52014+/* definition of item plugins. */
52015+
52016+#include "../../forward.h"
52017+#include "../../debug.h"
52018+#include "../../key.h"
52019+#include "../../coord.h"
52020+#include "../plugin_header.h"
52021+#include "sde.h"
52022+#include "internal.h"
52023+#include "item.h"
52024+#include "static_stat.h"
52025+#include "../plugin.h"
52026+#include "../../znode.h"
52027+#include "../../tree.h"
52028+#include "../../context.h"
52029+#include "ctail.h"
52030+
52031+/* return pointer to item body */
52032+void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
52033+{
52034+ assert("nikita-324", coord != NULL);
52035+ assert("nikita-325", coord->node != NULL);
52036+ assert("nikita-326", znode_is_loaded(coord->node));
52037+ assert("nikita-3200", coord->offset == INVALID_OFFSET);
52038+
52039+ coord->offset =
52040+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
52041+ zdata(coord->node);
52042+ ON_DEBUG(coord->body_v = coord->node->times_locked);
52043+}
52044+
52045+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
52046+{
52047+ return zdata(coord->node) + coord->offset;
52048+}
52049+
52050+#if REISER4_DEBUG
52051+
52052+int item_body_is_valid(const coord_t * coord)
52053+{
52054+ return
52055+ coord->offset ==
52056+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
52057+ zdata(coord->node);
52058+}
52059+
52060+#endif
52061+
52062+/* return length of item at @coord */
52063+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
52064+{
52065+ int len;
52066+
52067+ assert("nikita-327", coord != NULL);
52068+ assert("nikita-328", coord->node != NULL);
52069+ assert("nikita-329", znode_is_loaded(coord->node));
52070+
52071+ len = node_plugin_by_node(coord->node)->length_by_coord(coord);
52072+ return len;
52073+}
52074+
52075+void obtain_item_plugin(const coord_t * coord)
52076+{
52077+ assert("nikita-330", coord != NULL);
52078+ assert("nikita-331", coord->node != NULL);
52079+ assert("nikita-332", znode_is_loaded(coord->node));
52080+
52081+ coord_set_iplug((coord_t *) coord,
52082+ node_plugin_by_node(coord->node)->
52083+ plugin_by_coord(coord));
52084+ assert("nikita-2479",
52085+ coord_iplug(coord) ==
52086+ node_plugin_by_node(coord->node)->plugin_by_coord(coord));
52087+}
52088+
52089+/* return id of item */
52090+/* Audited by: green(2002.06.15) */
52091+item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
52092+{
52093+ assert("vs-539", coord != NULL);
52094+ assert("vs-538", coord->node != NULL);
52095+ assert("vs-537", znode_is_loaded(coord->node));
52096+ assert("vs-536", item_plugin_by_coord(coord) != NULL);
52097+ assert("vs-540",
52098+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
52099+
52100+ return item_id_by_plugin(item_plugin_by_coord(coord));
52101+}
52102+
52103+/* return key of item at @coord */
52104+/* Audited by: green(2002.06.15) */
52105+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
52106+ reiser4_key * key /* result */ )
52107+{
52108+ assert("nikita-338", coord != NULL);
52109+ assert("nikita-339", coord->node != NULL);
52110+ assert("nikita-340", znode_is_loaded(coord->node));
52111+
52112+ return node_plugin_by_node(coord->node)->key_at(coord, key);
52113+}
52114+
52115+/* this returns max key in the item */
52116+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
52117+ reiser4_key * key /* result */ )
52118+{
52119+ coord_t last;
52120+
52121+ assert("nikita-338", coord != NULL);
52122+ assert("nikita-339", coord->node != NULL);
52123+ assert("nikita-340", znode_is_loaded(coord->node));
52124+
52125+ /* make coord pointing to last item's unit */
52126+ coord_dup(&last, coord);
52127+ last.unit_pos = coord_num_units(&last) - 1;
52128+ assert("vs-1560", coord_is_existing_unit(&last));
52129+
52130+ max_unit_key_by_coord(&last, key);
52131+ return key;
52132+}
52133+
52134+/* return key of unit at @coord */
52135+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
52136+ reiser4_key * key /* result */ )
52137+{
52138+ assert("nikita-772", coord != NULL);
52139+ assert("nikita-774", coord->node != NULL);
52140+ assert("nikita-775", znode_is_loaded(coord->node));
52141+
52142+ if (item_plugin_by_coord(coord)->b.unit_key != NULL)
52143+ return item_plugin_by_coord(coord)->b.unit_key(coord, key);
52144+ else
52145+ return item_key_by_coord(coord, key);
52146+}
52147+
52148+/* return the biggest key contained the unit @coord */
52149+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
52150+ reiser4_key * key /* result */ )
52151+{
52152+ assert("nikita-772", coord != NULL);
52153+ assert("nikita-774", coord->node != NULL);
52154+ assert("nikita-775", znode_is_loaded(coord->node));
52155+
52156+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
52157+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
52158+ else
52159+ return unit_key_by_coord(coord, key);
52160+}
52161+
52162+/* ->max_key_inside() method for items consisting of exactly one key (like
52163+ stat-data) */
52164+static reiser4_key *max_key_inside_single_key(const coord_t *
52165+ coord /* coord of item */ ,
52166+ reiser4_key *
52167+ result /* resulting key */ )
52168+{
52169+ assert("nikita-604", coord != NULL);
52170+
52171+ /* coord -> key is starting key of this item and it has to be already
52172+ filled in */
52173+ return unit_key_by_coord(coord, result);
52174+}
52175+
52176+/* ->nr_units() method for items consisting of exactly one unit always */
52177+pos_in_node_t
52178+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
52179+{
52180+ return 1;
52181+}
52182+
52183+static int
52184+paste_no_paste(coord_t * coord UNUSED_ARG,
52185+ reiser4_item_data * data UNUSED_ARG,
52186+ carry_plugin_info * info UNUSED_ARG)
52187+{
52188+ return 0;
52189+}
52190+
52191+/* default ->fast_paste() method */
52192+static int
52193+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
52194+{
52195+ return 1;
52196+}
52197+
52198+int item_can_contain_key(const coord_t * item /* coord of item */ ,
52199+ const reiser4_key * key /* key to check */ ,
52200+ const reiser4_item_data * data /* parameters of item
52201+ * being created */ )
52202+{
52203+ item_plugin *iplug;
52204+ reiser4_key min_key_in_item;
52205+ reiser4_key max_key_in_item;
52206+
52207+ assert("nikita-1658", item != NULL);
52208+ assert("nikita-1659", key != NULL);
52209+
52210+ iplug = item_plugin_by_coord(item);
52211+ if (iplug->b.can_contain_key != NULL)
52212+ return iplug->b.can_contain_key(item, key, data);
52213+ else {
52214+ assert("nikita-1681", iplug->b.max_key_inside != NULL);
52215+ item_key_by_coord(item, &min_key_in_item);
52216+ iplug->b.max_key_inside(item, &max_key_in_item);
52217+
52218+ /* can contain key if
52219+ min_key_in_item <= key &&
52220+ key <= max_key_in_item
52221+ */
52222+ return keyle(&min_key_in_item, key)
52223+ && keyle(key, &max_key_in_item);
52224+ }
52225+}
52226+
52227+/* mergeable method for non mergeable items */
52228+static int
52229+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
52230+{
52231+ return 0;
52232+}
52233+
52234+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
52235+int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
52236+ const coord_t * i2 /* coord of second item */ )
52237+{
52238+ item_plugin *iplug;
52239+ reiser4_key k1;
52240+ reiser4_key k2;
52241+
52242+ assert("nikita-1336", i1 != NULL);
52243+ assert("nikita-1337", i2 != NULL);
52244+
52245+ iplug = item_plugin_by_coord(i1);
52246+ assert("nikita-1338", iplug != NULL);
52247+
52248+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
52249+ shifting code when nodes are in "suspended" state. */
52250+ assert("nikita-1663",
52251+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
52252+
52253+ if (iplug->b.mergeable != NULL) {
52254+ return iplug->b.mergeable(i1, i2);
52255+ } else if (iplug->b.max_key_inside != NULL) {
52256+ iplug->b.max_key_inside(i1, &k1);
52257+ item_key_by_coord(i2, &k2);
52258+
52259+ /* mergeable if ->max_key_inside() >= key of i2; */
52260+ return keyge(iplug->b.max_key_inside(i1, &k1),
52261+ item_key_by_coord(i2, &k2));
52262+ } else {
52263+ item_key_by_coord(i1, &k1);
52264+ item_key_by_coord(i2, &k2);
52265+
52266+ return
52267+ (get_key_locality(&k1) == get_key_locality(&k2)) &&
52268+ (get_key_objectid(&k1) == get_key_objectid(&k2))
52269+ && (iplug == item_plugin_by_coord(i2));
52270+ }
52271+}
52272+
52273+int item_is_extent(const coord_t * item)
52274+{
52275+ assert("vs-482", coord_is_existing_item(item));
52276+ return item_id_by_coord(item) == EXTENT_POINTER_ID;
52277+}
52278+
52279+int item_is_tail(const coord_t * item)
52280+{
52281+ assert("vs-482", coord_is_existing_item(item));
52282+ return item_id_by_coord(item) == FORMATTING_ID;
52283+}
52284+
52285+#if REISER4_DEBUG
52286+
52287+int item_is_statdata(const coord_t * item)
52288+{
52289+ assert("vs-516", coord_is_existing_item(item));
52290+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
52291+}
52292+
52293+int item_is_ctail(const coord_t * item)
52294+{
52295+ assert("edward-xx", coord_is_existing_item(item));
52296+ return item_id_by_coord(item) == CTAIL_ID;
52297+}
52298+
52299+#endif /* REISER4_DEBUG */
52300+
52301+static int change_item(struct inode *inode,
52302+ reiser4_plugin * plugin,
52303+ pset_member memb)
52304+{
52305+ /* cannot change constituent item (sd, or dir_item) */
52306+ return RETERR(-EINVAL);
52307+}
52308+
52309+static reiser4_plugin_ops item_plugin_ops = {
52310+ .init = NULL,
52311+ .load = NULL,
52312+ .save_len = NULL,
52313+ .save = NULL,
52314+ .change = change_item
52315+};
52316+
52317+item_plugin item_plugins[LAST_ITEM_ID] = {
52318+ [STATIC_STAT_DATA_ID] = {
52319+ .h = {
52320+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52321+ .id = STATIC_STAT_DATA_ID,
52322+ .groups = (1 << STAT_DATA_ITEM_TYPE),
52323+ .pops = &item_plugin_ops,
52324+ .label = "sd",
52325+ .desc = "stat-data",
52326+ .linkage = {NULL, NULL}
52327+ },
52328+ .b = {
52329+ .max_key_inside = max_key_inside_single_key,
52330+ .can_contain_key = NULL,
52331+ .mergeable = not_mergeable,
52332+ .nr_units = nr_units_single_unit,
52333+ .lookup = NULL,
52334+ .init = NULL,
52335+ .paste = paste_no_paste,
52336+ .fast_paste = NULL,
52337+ .can_shift = NULL,
52338+ .copy_units = NULL,
52339+ .create_hook = NULL,
52340+ .kill_hook = NULL,
52341+ .shift_hook = NULL,
52342+ .cut_units = NULL,
52343+ .kill_units = NULL,
52344+ .unit_key = NULL,
52345+ .max_unit_key = NULL,
52346+ .estimate = NULL,
52347+ .item_data_by_flow = NULL,
52348+#if REISER4_DEBUG
52349+ .check = NULL
52350+#endif
52351+ },
52352+ .f = {
52353+ .utmost_child = NULL,
52354+ .utmost_child_real_block = NULL,
52355+ .update = NULL,
52356+ .scan = NULL,
52357+ .convert = NULL
52358+ },
52359+ .s = {
52360+ .sd = {
52361+ .init_inode = init_inode_static_sd,
52362+ .save_len = save_len_static_sd,
52363+ .save = save_static_sd
52364+ }
52365+ }
52366+ },
52367+ [SIMPLE_DIR_ENTRY_ID] = {
52368+ .h = {
52369+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52370+ .id = SIMPLE_DIR_ENTRY_ID,
52371+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
52372+ .pops = &item_plugin_ops,
52373+ .label = "de",
52374+ .desc = "directory entry",
52375+ .linkage = {NULL, NULL}
52376+ },
52377+ .b = {
52378+ .max_key_inside = max_key_inside_single_key,
52379+ .can_contain_key = NULL,
52380+ .mergeable = NULL,
52381+ .nr_units = nr_units_single_unit,
52382+ .lookup = NULL,
52383+ .init = NULL,
52384+ .paste = NULL,
52385+ .fast_paste = NULL,
52386+ .can_shift = NULL,
52387+ .copy_units = NULL,
52388+ .create_hook = NULL,
52389+ .kill_hook = NULL,
52390+ .shift_hook = NULL,
52391+ .cut_units = NULL,
52392+ .kill_units = NULL,
52393+ .unit_key = NULL,
52394+ .max_unit_key = NULL,
52395+ .estimate = NULL,
52396+ .item_data_by_flow = NULL,
52397+#if REISER4_DEBUG
52398+ .check = NULL
52399+#endif
52400+ },
52401+ .f = {
52402+ .utmost_child = NULL,
52403+ .utmost_child_real_block = NULL,
52404+ .update = NULL,
52405+ .scan = NULL,
52406+ .convert = NULL
52407+ },
52408+ .s = {
52409+ .dir = {
52410+ .extract_key = extract_key_de,
52411+ .update_key = update_key_de,
52412+ .extract_name = extract_name_de,
52413+ .extract_file_type = extract_file_type_de,
52414+ .add_entry = add_entry_de,
52415+ .rem_entry = rem_entry_de,
52416+ .max_name_len = max_name_len_de
52417+ }
52418+ }
52419+ },
52420+ [COMPOUND_DIR_ID] = {
52421+ .h = {
52422+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52423+ .id = COMPOUND_DIR_ID,
52424+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
52425+ .pops = &item_plugin_ops,
52426+ .label = "cde",
52427+ .desc = "compressed directory entry",
52428+ .linkage = {NULL, NULL}
52429+ },
52430+ .b = {
52431+ .max_key_inside = max_key_inside_cde,
52432+ .can_contain_key = can_contain_key_cde,
52433+ .mergeable = mergeable_cde,
52434+ .nr_units = nr_units_cde,
52435+ .lookup = lookup_cde,
52436+ .init = init_cde,
52437+ .paste = paste_cde,
52438+ .fast_paste = agree_to_fast_op,
52439+ .can_shift = can_shift_cde,
52440+ .copy_units = copy_units_cde,
52441+ .create_hook = NULL,
52442+ .kill_hook = NULL,
52443+ .shift_hook = NULL,
52444+ .cut_units = cut_units_cde,
52445+ .kill_units = kill_units_cde,
52446+ .unit_key = unit_key_cde,
52447+ .max_unit_key = unit_key_cde,
52448+ .estimate = estimate_cde,
52449+ .item_data_by_flow = NULL,
52450+#if REISER4_DEBUG
52451+ .check = reiser4_check_cde
52452+#endif
52453+ },
52454+ .f = {
52455+ .utmost_child = NULL,
52456+ .utmost_child_real_block = NULL,
52457+ .update = NULL,
52458+ .scan = NULL,
52459+ .convert = NULL
52460+ },
52461+ .s = {
52462+ .dir = {
52463+ .extract_key = extract_key_cde,
52464+ .update_key = update_key_cde,
52465+ .extract_name = extract_name_cde,
52466+ .extract_file_type = extract_file_type_de,
52467+ .add_entry = add_entry_cde,
52468+ .rem_entry = rem_entry_cde,
52469+ .max_name_len = max_name_len_cde
52470+ }
52471+ }
52472+ },
52473+ [NODE_POINTER_ID] = {
52474+ .h = {
52475+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52476+ .id = NODE_POINTER_ID,
52477+ .groups = (1 << INTERNAL_ITEM_TYPE),
52478+ .pops = NULL,
52479+ .label = "internal",
52480+ .desc = "internal item",
52481+ .linkage = {NULL, NULL}
52482+ },
52483+ .b = {
52484+ .max_key_inside = NULL,
52485+ .can_contain_key = NULL,
52486+ .mergeable = mergeable_internal,
52487+ .nr_units = nr_units_single_unit,
52488+ .lookup = lookup_internal,
52489+ .init = NULL,
52490+ .paste = NULL,
52491+ .fast_paste = NULL,
52492+ .can_shift = NULL,
52493+ .copy_units = NULL,
52494+ .create_hook = create_hook_internal,
52495+ .kill_hook = kill_hook_internal,
52496+ .shift_hook = shift_hook_internal,
52497+ .cut_units = NULL,
52498+ .kill_units = NULL,
52499+ .unit_key = NULL,
52500+ .max_unit_key = NULL,
52501+ .estimate = NULL,
52502+ .item_data_by_flow = NULL,
52503+#if REISER4_DEBUG
52504+ .check = check__internal
52505+#endif
52506+ },
52507+ .f = {
52508+ .utmost_child = utmost_child_internal,
52509+ .utmost_child_real_block =
52510+ utmost_child_real_block_internal,
52511+ .update = reiser4_update_internal,
52512+ .scan = NULL,
52513+ .convert = NULL
52514+ },
52515+ .s = {
52516+ .internal = {
52517+ .down_link = down_link_internal,
52518+ .has_pointer_to = has_pointer_to_internal
52519+ }
52520+ }
52521+ },
52522+ [EXTENT_POINTER_ID] = {
52523+ .h = {
52524+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52525+ .id = EXTENT_POINTER_ID,
52526+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52527+ .pops = NULL,
52528+ .label = "extent",
52529+ .desc = "extent item",
52530+ .linkage = {NULL, NULL}
52531+ },
52532+ .b = {
52533+ .max_key_inside = max_key_inside_extent,
52534+ .can_contain_key = can_contain_key_extent,
52535+ .mergeable = mergeable_extent,
52536+ .nr_units = nr_units_extent,
52537+ .lookup = lookup_extent,
52538+ .init = NULL,
52539+ .paste = paste_extent,
52540+ .fast_paste = agree_to_fast_op,
52541+ .can_shift = can_shift_extent,
52542+ .create_hook = create_hook_extent,
52543+ .copy_units = copy_units_extent,
52544+ .kill_hook = kill_hook_extent,
52545+ .shift_hook = NULL,
52546+ .cut_units = cut_units_extent,
52547+ .kill_units = kill_units_extent,
52548+ .unit_key = unit_key_extent,
52549+ .max_unit_key = max_unit_key_extent,
52550+ .estimate = NULL,
52551+ .item_data_by_flow = NULL,
52552+#if REISER4_DEBUG
52553+ .check = reiser4_check_extent
52554+#endif
52555+ },
52556+ .f = {
52557+ .utmost_child = utmost_child_extent,
52558+ .utmost_child_real_block =
52559+ utmost_child_real_block_extent,
52560+ .update = NULL,
52561+ .scan = reiser4_scan_extent,
52562+ .convert = NULL,
52563+ .key_by_offset = key_by_offset_extent
52564+ },
52565+ .s = {
52566+ .file = {
52567+ .write = reiser4_write_extent,
52568+ .read = reiser4_read_extent,
52569+ .readpage = reiser4_readpage_extent,
52570+ .get_block = get_block_address_extent,
52571+ .append_key = append_key_extent,
52572+ .init_coord_extension =
52573+ init_coord_extension_extent
52574+ }
52575+ }
52576+ },
52577+ [FORMATTING_ID] = {
52578+ .h = {
52579+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52580+ .id = FORMATTING_ID,
52581+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52582+ .pops = NULL,
52583+ .label = "body",
52584+ .desc = "body (or tail?) item",
52585+ .linkage = {NULL, NULL}
52586+ },
52587+ .b = {
52588+ .max_key_inside = max_key_inside_tail,
52589+ .can_contain_key = can_contain_key_tail,
52590+ .mergeable = mergeable_tail,
52591+ .nr_units = nr_units_tail,
52592+ .lookup = lookup_tail,
52593+ .init = NULL,
52594+ .paste = paste_tail,
52595+ .fast_paste = agree_to_fast_op,
52596+ .can_shift = can_shift_tail,
52597+ .create_hook = NULL,
52598+ .copy_units = copy_units_tail,
52599+ .kill_hook = kill_hook_tail,
52600+ .shift_hook = NULL,
52601+ .cut_units = cut_units_tail,
52602+ .kill_units = kill_units_tail,
52603+ .unit_key = unit_key_tail,
52604+ .max_unit_key = unit_key_tail,
52605+ .estimate = NULL,
52606+ .item_data_by_flow = NULL,
52607+#if REISER4_DEBUG
52608+ .check = NULL
52609+#endif
52610+ },
52611+ .f = {
52612+ .utmost_child = NULL,
52613+ .utmost_child_real_block = NULL,
52614+ .update = NULL,
52615+ .scan = NULL,
52616+ .convert = NULL
52617+ },
52618+ .s = {
52619+ .file = {
52620+ .write = reiser4_write_tail,
52621+ .read = reiser4_read_tail,
52622+ .readpage = readpage_tail,
52623+ .get_block = get_block_address_tail,
52624+ .append_key = append_key_tail,
52625+ .init_coord_extension =
52626+ init_coord_extension_tail
52627+ }
52628+ }
52629+ },
52630+ [CTAIL_ID] = {
52631+ .h = {
52632+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52633+ .id = CTAIL_ID,
52634+ .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52635+ .pops = NULL,
52636+ .label = "ctail",
52637+ .desc = "cryptcompress tail item",
52638+ .linkage = {NULL, NULL}
52639+ },
52640+ .b = {
52641+ .max_key_inside = max_key_inside_tail,
52642+ .can_contain_key = can_contain_key_ctail,
52643+ .mergeable = mergeable_ctail,
52644+ .nr_units = nr_units_ctail,
52645+ .lookup = NULL,
52646+ .init = init_ctail,
52647+ .paste = paste_ctail,
52648+ .fast_paste = agree_to_fast_op,
52649+ .can_shift = can_shift_ctail,
52650+ .create_hook = create_hook_ctail,
52651+ .copy_units = copy_units_ctail,
52652+ .kill_hook = kill_hook_ctail,
52653+ .shift_hook = shift_hook_ctail,
52654+ .cut_units = cut_units_ctail,
52655+ .kill_units = kill_units_ctail,
52656+ .unit_key = unit_key_tail,
52657+ .max_unit_key = unit_key_tail,
52658+ .estimate = estimate_ctail,
52659+ .item_data_by_flow = NULL,
52660+#if REISER4_DEBUG
52661+ .check = check_ctail
52662+#endif
52663+ },
52664+ .f = {
52665+ .utmost_child = utmost_child_ctail,
52666+ /* FIXME-EDWARD: write this */
52667+ .utmost_child_real_block = NULL,
52668+ .update = NULL,
52669+ .scan = scan_ctail,
52670+ .convert = convert_ctail
52671+ },
52672+ .s = {
52673+ .file = {
52674+ .write = NULL,
52675+ .read = read_ctail,
52676+ .readpage = readpage_ctail,
52677+ .get_block = get_block_address_tail,
52678+ .append_key = append_key_ctail,
52679+ .init_coord_extension =
52680+ init_coord_extension_tail
52681+ }
52682+ }
52683+ },
52684+ [BLACK_BOX_ID] = {
52685+ .h = {
52686+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
52687+ .id = BLACK_BOX_ID,
52688+ .groups = (1 << OTHER_ITEM_TYPE),
52689+ .pops = NULL,
52690+ .label = "blackbox",
52691+ .desc = "black box item",
52692+ .linkage = {NULL, NULL}
52693+ },
52694+ .b = {
52695+ .max_key_inside = NULL,
52696+ .can_contain_key = NULL,
52697+ .mergeable = not_mergeable,
52698+ .nr_units = nr_units_single_unit,
52699+ /* to need for ->lookup method */
52700+ .lookup = NULL,
52701+ .init = NULL,
52702+ .paste = NULL,
52703+ .fast_paste = NULL,
52704+ .can_shift = NULL,
52705+ .copy_units = NULL,
52706+ .create_hook = NULL,
52707+ .kill_hook = NULL,
52708+ .shift_hook = NULL,
52709+ .cut_units = NULL,
52710+ .kill_units = NULL,
52711+ .unit_key = NULL,
52712+ .max_unit_key = NULL,
52713+ .estimate = NULL,
52714+ .item_data_by_flow = NULL,
52715+#if REISER4_DEBUG
52716+ .check = NULL
52717+#endif
52718+ }
52719+ }
52720+};
52721+
52722+/* Make Linus happy.
52723+ Local variables:
52724+ c-indentation-style: "K&R"
52725+ mode-name: "LC"
52726+ c-basic-offset: 8
52727+ tab-width: 8
52728+ fill-column: 120
52729+ End:
52730+*/
52731diff --git a/fs/reiser4/plugin/item/item.h b/fs/reiser4/plugin/item/item.h
52732new file mode 100644
52733index 0000000..0822296
52734--- /dev/null
52735+++ b/fs/reiser4/plugin/item/item.h
52736@@ -0,0 +1,400 @@
52737+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52738+
52739+/* first read balance.c comments before reading this */
52740+
52741+/* An item_plugin implements all of the operations required for
52742+ balancing that are item specific. */
52743+
52744+/* an item plugin also implements other operations that are specific to that
52745+ item. These go into the item specific operations portion of the item
52746+ handler, and all of the item specific portions of the item handler are put
52747+ into a union. */
52748+
52749+#if !defined( __REISER4_ITEM_H__ )
52750+#define __REISER4_ITEM_H__
52751+
52752+#include "../../forward.h"
52753+#include "../plugin_header.h"
52754+#include "../../dformat.h"
52755+#include "../../seal.h"
52756+#include "../../plugin/file/file.h"
52757+
52758+#include <linux/fs.h> /* for struct file, struct inode */
52759+#include <linux/mm.h> /* for struct page */
52760+#include <linux/dcache.h> /* for struct dentry */
52761+
52762+typedef enum {
52763+ STAT_DATA_ITEM_TYPE,
52764+ DIR_ENTRY_ITEM_TYPE,
52765+ INTERNAL_ITEM_TYPE,
52766+ UNIX_FILE_METADATA_ITEM_TYPE,
52767+ OTHER_ITEM_TYPE
52768+} item_type_id;
52769+
52770+/* this is the part of each item plugin that all items are expected to
52771+ support or at least explicitly fail to support by setting the
52772+ pointer to null. */
52773+typedef struct {
52774+ /* operations called by balancing
52775+
52776+ It is interesting to consider that some of these item
52777+ operations could be given sources or targets that are not
52778+ really items in nodes. This could be ok/useful.
52779+
52780+ */
52781+ /* maximal key that can _possibly_ be occupied by this item
52782+
52783+ When inserting, and node ->lookup() method (called by
52784+ coord_by_key()) reaches an item after binary search,
52785+ the ->max_key_inside() item plugin method is used to determine
52786+ whether new item should pasted into existing item
52787+ (new_key<=max_key_inside()) or new item has to be created
52788+ (new_key>max_key_inside()).
52789+
52790+ For items that occupy exactly one key (like stat-data)
52791+ this method should return this key. For items that can
52792+ grow indefinitely (extent, directory item) this should
52793+ return reiser4_max_key().
52794+
52795+ For example extent with the key
52796+
52797+ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52798+
52799+ ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and
52800+ */
52801+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52802+
52803+ /* true if item @coord can merge data at @key. */
52804+ int (*can_contain_key) (const coord_t *, const reiser4_key *,
52805+ const reiser4_item_data *);
52806+ /* mergeable() - check items for mergeability
52807+
52808+ Optional method. Returns true if two items can be merged.
52809+
52810+ */
52811+ int (*mergeable) (const coord_t *, const coord_t *);
52812+
52813+ /* number of atomic things in an item.
52814+ NOTE FOR CONTRIBUTORS: use a generic method
52815+ nr_units_single_unit() for solid (atomic) items, as
52816+ tree operations use it as a criterion of solidness
52817+ (see is_solid_item macro) */
52818+ pos_in_node_t(*nr_units) (const coord_t *);
52819+
52820+ /* search within item for a unit within the item, and return a
52821+ pointer to it. This can be used to calculate how many
52822+ bytes to shrink an item if you use pointer arithmetic and
52823+ compare to the start of the item body if the item's data
52824+ are continuous in the node, if the item's data are not
52825+ continuous in the node, all sorts of other things are maybe
52826+ going to break as well. */
52827+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52828+ /* method called by ode_plugin->create_item() to initialise new
52829+ item */
52830+ int (*init) (coord_t * target, coord_t * from,
52831+ reiser4_item_data * data);
52832+ /* method called (e.g., by reiser4_resize_item()) to place new data
52833+ into item when it grows */
52834+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52835+ /* return true if paste into @coord is allowed to skip
52836+ carry. That is, if such paste would require any changes
52837+ at the parent level
52838+ */
52839+ int (*fast_paste) (const coord_t *);
52840+ /* how many but not more than @want units of @source can be
52841+ shifted into @target node. If pend == append - we try to
52842+ append last item of @target by first units of @source. If
52843+ pend == prepend - we try to "prepend" first item in @target
52844+ by last units of @source. @target node has @free_space
52845+ bytes of free space. Total size of those units are returned
52846+ via @size.
52847+
52848+ @target is not NULL if shifting to the mergeable item and
52849+ NULL is new item will be created during shifting.
52850+ */
52851+ int (*can_shift) (unsigned free_space, coord_t *,
52852+ znode *, shift_direction, unsigned *size,
52853+ unsigned want);
52854+
52855+ /* starting off @from-th unit of item @source append or
52856+ prepend @count units to @target. @target has been already
52857+ expanded by @free_space bytes. That must be exactly what is
52858+ needed for those items in @target. If @where_is_free_space
52859+ == SHIFT_LEFT - free space is at the end of @target item,
52860+ othersize - it is in the beginning of it. */
52861+ void (*copy_units) (coord_t *, coord_t *,
52862+ unsigned from, unsigned count,
52863+ shift_direction where_is_free_space,
52864+ unsigned free_space);
52865+
52866+ int (*create_hook) (const coord_t *, void *);
52867+ /* do whatever is necessary to do when @count units starting
52868+ from @from-th one are removed from the tree */
52869+ /* FIXME-VS: this is used to be here for, in particular,
52870+ extents and items of internal type to free blocks they point
52871+ to at the same time with removing items from a
52872+ tree. Problems start, however, when dealloc_block fails due
52873+ to some reason. Item gets removed, but blocks it pointed to
52874+ are not freed. It is not clear how to fix this for items of
52875+ internal type because a need to remove internal item may
52876+ appear in the middle of balancing, and there is no way to
52877+ undo changes made. OTOH, if space allocator involves
52878+ balancing to perform dealloc_block - this will probably
52879+ break balancing due to deadlock issues
52880+ */
52881+ int (*kill_hook) (const coord_t *, pos_in_node_t from,
52882+ pos_in_node_t count, struct carry_kill_data *);
52883+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52884+ znode * _node);
52885+
52886+ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
52887+ including boundaries. When units are cut from item beginning - move space which gets freed to head of
52888+ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
52889+ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
52890+ @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
52891+ */
52892+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52893+ struct carry_cut_data *,
52894+ reiser4_key * smallest_removed,
52895+ reiser4_key * new_first_key);
52896+
52897+ /* like cut_units, except that these units are removed from the
52898+ tree, not only from a node */
52899+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52900+ struct carry_kill_data *,
52901+ reiser4_key * smallest_removed,
52902+ reiser4_key * new_first);
52903+
52904+ /* if @key_of_coord == 1 - returned key of coord, otherwise -
52905+ key of unit is returned. If @coord is not set to certain
52906+ unit - ERR_PTR(-ENOENT) is returned */
52907+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52908+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52909+ /* estimate how much space is needed for paste @data into item at
52910+ @coord. if @coord==0 - estimate insertion, otherwise - estimate
52911+ pasting
52912+ */
52913+ int (*estimate) (const coord_t *, const reiser4_item_data *);
52914+
52915+ /* converts flow @f to item data. @coord == 0 on insert */
52916+ int (*item_data_by_flow) (const coord_t *, const flow_t *,
52917+ reiser4_item_data *);
52918+
52919+ /*void (*show) (struct seq_file *, coord_t *); */
52920+
52921+#if REISER4_DEBUG
52922+ /* used for debugging, every item should have here the most
52923+ complete possible check of the consistency of the item that
52924+ the inventor can construct */
52925+ int (*check) (const coord_t *, const char **error);
52926+#endif
52927+
52928+} balance_ops;
52929+
52930+typedef struct {
52931+ /* return the right or left child of @coord, only if it is in memory */
52932+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52933+
52934+ /* return whether the right or left child of @coord has a non-fake
52935+ block number. */
52936+ int (*utmost_child_real_block) (const coord_t *, sideof side,
52937+ reiser4_block_nr *);
52938+ /* relocate child at @coord to the @block */
52939+ void (*update) (const coord_t *, const reiser4_block_nr *);
52940+ /* count unformatted nodes per item for leave relocation policy, etc.. */
52941+ int (*scan) (flush_scan * scan);
52942+ /* convert item by flush */
52943+ int (*convert) (flush_pos_t * pos);
52944+ /* backward mapping from jnode offset to a key. */
52945+ int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52946+} flush_ops;
52947+
52948+/* operations specific to the directory item */
52949+typedef struct {
52950+ /* extract stat-data key from directory entry at @coord and place it
52951+ into @key. */
52952+ int (*extract_key) (const coord_t *, reiser4_key * key);
52953+ /* update object key in item. */
52954+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52955+ /* extract name from directory entry at @coord and return it */
52956+ char *(*extract_name) (const coord_t *, char *buf);
52957+ /* extract file type (DT_* stuff) from directory entry at @coord and
52958+ return it */
52959+ unsigned (*extract_file_type) (const coord_t *);
52960+ int (*add_entry) (struct inode * dir,
52961+ coord_t *, lock_handle *,
52962+ const struct dentry * name,
52963+ reiser4_dir_entry_desc * entry);
52964+ int (*rem_entry) (struct inode * dir, const struct qstr * name,
52965+ coord_t *, lock_handle *,
52966+ reiser4_dir_entry_desc * entry);
52967+ int (*max_name_len) (const struct inode * dir);
52968+} dir_entry_ops;
52969+
52970+/* operations specific to items regular (unix) file metadata are built of */
52971+typedef struct {
52972+ int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52973+ int (*read) (struct file *, flow_t *, hint_t *);
52974+ int (*readpage) (void *, struct page *);
52975+ int (*get_block) (const coord_t *, sector_t, sector_t *);
52976+ /*
52977+ * key of first byte which is not addressed by the item @coord is set
52978+ * to.
52979+ * For example, for extent item with the key
52980+ *
52981+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52982+ *
52983+ * ->append_key is
52984+ *
52985+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52986+ */
52987+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52988+
52989+ void (*init_coord_extension) (uf_coord_t *, loff_t);
52990+} file_ops;
52991+
52992+/* operations specific to items of stat data type */
52993+typedef struct {
52994+ int (*init_inode) (struct inode * inode, char *sd, int len);
52995+ int (*save_len) (struct inode * inode);
52996+ int (*save) (struct inode * inode, char **area);
52997+} sd_ops;
52998+
52999+/* operations specific to internal item */
53000+typedef struct {
53001+ /* all tree traversal want to know from internal item is where
53002+ to go next. */
53003+ void (*down_link) (const coord_t * coord,
53004+ const reiser4_key * key, reiser4_block_nr * block);
53005+ /* check that given internal item contains given pointer. */
53006+ int (*has_pointer_to) (const coord_t * coord,
53007+ const reiser4_block_nr * block);
53008+} internal_item_ops;
53009+
53010+struct item_plugin {
53011+ /* generic fields */
53012+ plugin_header h;
53013+
53014+ /* methods common for all item types */
53015+ balance_ops b;
53016+ /* methods used during flush */
53017+ flush_ops f;
53018+
53019+ /* methods specific to particular type of item */
53020+ union {
53021+ dir_entry_ops dir;
53022+ file_ops file;
53023+ sd_ops sd;
53024+ internal_item_ops internal;
53025+ } s;
53026+
53027+};
53028+
53029+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
53030+
53031+static inline item_id item_id_by_plugin(item_plugin * plugin)
53032+{
53033+ return plugin->h.id;
53034+}
53035+
53036+static inline char get_iplugid(item_plugin * iplug)
53037+{
53038+ assert("nikita-2838", iplug != NULL);
53039+ assert("nikita-2839", iplug->h.id < 0xff);
53040+ return (char)item_id_by_plugin(iplug);
53041+}
53042+
53043+extern unsigned long znode_times_locked(const znode * z);
53044+
53045+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
53046+{
53047+ assert("nikita-2837", coord != NULL);
53048+ assert("nikita-2838", iplug != NULL);
53049+ coord->iplugid = get_iplugid(iplug);
53050+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
53051+}
53052+
53053+static inline item_plugin *coord_iplug(const coord_t * coord)
53054+{
53055+ assert("nikita-2833", coord != NULL);
53056+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
53057+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
53058+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
53059+ coord->iplugid);
53060+}
53061+
53062+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
53063+ const reiser4_item_data *);
53064+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
53065+extern int item_is_extent(const coord_t *);
53066+extern int item_is_tail(const coord_t *);
53067+extern int item_is_statdata(const coord_t * item);
53068+extern int item_is_ctail(const coord_t *);
53069+
53070+extern pos_in_node_t item_length_by_coord(const coord_t * coord);
53071+extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
53072+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
53073+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
53074+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
53075+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
53076+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
53077+ reiser4_key * key);
53078+extern void obtain_item_plugin(const coord_t * coord);
53079+
53080+#if defined(REISER4_DEBUG)
53081+extern int znode_is_loaded(const znode * node);
53082+#endif
53083+
53084+/* return plugin of item at @coord */
53085+static inline item_plugin *item_plugin_by_coord(const coord_t *
53086+ coord /* coord to query */ )
53087+{
53088+ assert("nikita-330", coord != NULL);
53089+ assert("nikita-331", coord->node != NULL);
53090+ assert("nikita-332", znode_is_loaded(coord->node));
53091+
53092+ if (unlikely(!coord_is_iplug_set(coord)))
53093+ obtain_item_plugin(coord);
53094+ return coord_iplug(coord);
53095+}
53096+
53097+/* this returns true if item is of internal type */
53098+static inline int item_is_internal(const coord_t * item)
53099+{
53100+ assert("vs-483", coord_is_existing_item(item));
53101+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
53102+}
53103+
53104+extern void item_body_by_coord_hard(coord_t * coord);
53105+extern void *item_body_by_coord_easy(const coord_t * coord);
53106+#if REISER4_DEBUG
53107+extern int item_body_is_valid(const coord_t * coord);
53108+#endif
53109+
53110+/* return pointer to item body */
53111+static inline void *item_body_by_coord(const coord_t *
53112+ coord /* coord to query */ )
53113+{
53114+ assert("nikita-324", coord != NULL);
53115+ assert("nikita-325", coord->node != NULL);
53116+ assert("nikita-326", znode_is_loaded(coord->node));
53117+
53118+ if (coord->offset == INVALID_OFFSET)
53119+ item_body_by_coord_hard((coord_t *) coord);
53120+ assert("nikita-3201", item_body_is_valid(coord));
53121+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
53122+ return item_body_by_coord_easy(coord);
53123+}
53124+
53125+/* __REISER4_ITEM_H__ */
53126+#endif
53127+/* Make Linus happy.
53128+ Local variables:
53129+ c-indentation-style: "K&R"
53130+ mode-name: "LC"
53131+ c-basic-offset: 8
53132+ tab-width: 8
53133+ fill-column: 120
53134+ scroll-step: 1
53135+ End:
53136+*/
53137diff --git a/fs/reiser4/plugin/item/sde.c b/fs/reiser4/plugin/item/sde.c
53138new file mode 100644
53139index 0000000..27f2400
53140--- /dev/null
53141+++ b/fs/reiser4/plugin/item/sde.c
53142@@ -0,0 +1,190 @@
53143+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53144+
53145+/* Directory entry implementation */
53146+#include "../../forward.h"
53147+#include "../../debug.h"
53148+#include "../../dformat.h"
53149+#include "../../kassign.h"
53150+#include "../../coord.h"
53151+#include "sde.h"
53152+#include "item.h"
53153+#include "../plugin.h"
53154+#include "../../znode.h"
53155+#include "../../carry.h"
53156+#include "../../tree.h"
53157+#include "../../inode.h"
53158+
53159+#include <linux/fs.h> /* for struct inode */
53160+#include <linux/dcache.h> /* for struct dentry */
53161+#include <linux/quotaops.h>
53162+
53163+/* ->extract_key() method of simple directory item plugin. */
53164+int extract_key_de(const coord_t * coord /* coord of item */ ,
53165+ reiser4_key * key /* resulting key */ )
53166+{
53167+ directory_entry_format *dent;
53168+
53169+ assert("nikita-1458", coord != NULL);
53170+ assert("nikita-1459", key != NULL);
53171+
53172+ dent = (directory_entry_format *) item_body_by_coord(coord);
53173+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
53174+ return extract_key_from_id(&dent->id, key);
53175+}
53176+
53177+int
53178+update_key_de(const coord_t * coord, const reiser4_key * key,
53179+ lock_handle * lh UNUSED_ARG)
53180+{
53181+ directory_entry_format *dent;
53182+ obj_key_id obj_id;
53183+ int result;
53184+
53185+ assert("nikita-2342", coord != NULL);
53186+ assert("nikita-2343", key != NULL);
53187+
53188+ dent = (directory_entry_format *) item_body_by_coord(coord);
53189+ result = build_obj_key_id(key, &obj_id);
53190+ if (result == 0) {
53191+ dent->id = obj_id;
53192+ znode_make_dirty(coord->node);
53193+ }
53194+ return 0;
53195+}
53196+
53197+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
53198+ char *buf)
53199+{
53200+ reiser4_key key;
53201+
53202+ unit_key_by_coord(coord, &key);
53203+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
53204+ reiser4_print_address("oops", znode_get_block(coord->node));
53205+ if (!is_longname_key(&key)) {
53206+ if (is_dot_key(&key))
53207+ return (char *)".";
53208+ else
53209+ return extract_name_from_key(&key, buf);
53210+ } else
53211+ return (char *)dent->name;
53212+}
53213+
53214+/* ->extract_name() method of simple directory item plugin. */
53215+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
53216+{
53217+ directory_entry_format *dent;
53218+
53219+ assert("nikita-1460", coord != NULL);
53220+
53221+ dent = (directory_entry_format *) item_body_by_coord(coord);
53222+ return extract_dent_name(coord, dent, buf);
53223+}
53224+
53225+/* ->extract_file_type() method of simple directory item plugin. */
53226+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
53227+ * item */ )
53228+{
53229+ assert("nikita-1764", coord != NULL);
53230+ /* we don't store file type in the directory entry yet.
53231+
53232+ But see comments at kassign.h:obj_key_id
53233+ */
53234+ return DT_UNKNOWN;
53235+}
53236+
53237+int add_entry_de(struct inode *dir /* directory of item */ ,
53238+ coord_t * coord /* coord of item */ ,
53239+ lock_handle * lh /* insertion lock handle */ ,
53240+ const struct dentry *de /* name to add */ ,
53241+ reiser4_dir_entry_desc * entry /* parameters of new directory
53242+ * entry */ )
53243+{
53244+ reiser4_item_data data;
53245+ directory_entry_format *dent;
53246+ int result;
53247+ const char *name;
53248+ int len;
53249+ int longname;
53250+
53251+ name = de->d_name.name;
53252+ len = de->d_name.len;
53253+ assert("nikita-1163", strlen(name) == len);
53254+
53255+ longname = is_longname(name, len);
53256+
53257+ data.length = sizeof *dent;
53258+ if (longname)
53259+ data.length += len + 1;
53260+ data.data = NULL;
53261+ data.user = 0;
53262+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
53263+
53264+ /* NOTE-NIKITA quota plugin */
53265+ if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
53266+ return -EDQUOT;
53267+
53268+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
53269+ if (result != 0)
53270+ return result;
53271+
53272+ dent = (directory_entry_format *) item_body_by_coord(coord);
53273+ build_inode_key_id(entry->obj, &dent->id);
53274+ if (longname) {
53275+ memcpy(dent->name, name, len);
53276+ put_unaligned(0, &dent->name[len]);
53277+ }
53278+ return 0;
53279+}
53280+
53281+int rem_entry_de(struct inode *dir /* directory of item */ ,
53282+ const struct qstr *name UNUSED_ARG,
53283+ coord_t * coord /* coord of item */ ,
53284+ lock_handle * lh UNUSED_ARG /* lock handle for
53285+ * removal */ ,
53286+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
53287+ * directory entry
53288+ * being removed */ )
53289+{
53290+ coord_t shadow;
53291+ int result;
53292+ int length;
53293+
53294+ length = item_length_by_coord(coord);
53295+ if (inode_get_bytes(dir) < length) {
53296+ warning("nikita-2627", "Dir is broke: %llu: %llu",
53297+ (unsigned long long)get_inode_oid(dir),
53298+ inode_get_bytes(dir));
53299+
53300+ return RETERR(-EIO);
53301+ }
53302+
53303+ /* cut_node() is supposed to take pointers to _different_
53304+ coords, because it will modify them without respect to
53305+ possible aliasing. To work around this, create temporary copy
53306+ of @coord.
53307+ */
53308+ coord_dup(&shadow, coord);
53309+ result =
53310+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
53311+ if (result == 0) {
53312+ /* NOTE-NIKITA quota plugin */
53313+ DQUOT_FREE_SPACE_NODIRTY(dir, length);
53314+ }
53315+ return result;
53316+}
53317+
53318+int max_name_len_de(const struct inode *dir)
53319+{
53320+ return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
53321+ sizeof(directory_entry_format) - 2;
53322+}
53323+
53324+/* Make Linus happy.
53325+ Local variables:
53326+ c-indentation-style: "K&R"
53327+ mode-name: "LC"
53328+ c-basic-offset: 8
53329+ tab-width: 8
53330+ fill-column: 120
53331+ End:
53332+*/
53333diff --git a/fs/reiser4/plugin/item/sde.h b/fs/reiser4/plugin/item/sde.h
53334new file mode 100644
53335index 0000000..f26762a
53336--- /dev/null
53337+++ b/fs/reiser4/plugin/item/sde.h
53338@@ -0,0 +1,66 @@
53339+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53340+
53341+/* Directory entry. */
53342+
53343+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
53344+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
53345+
53346+#include "../../forward.h"
53347+#include "../../dformat.h"
53348+#include "../../kassign.h"
53349+#include "../../key.h"
53350+
53351+#include <linux/fs.h>
53352+#include <linux/dcache.h> /* for struct dentry */
53353+
53354+typedef struct directory_entry_format {
53355+ /* key of object stat-data. It's not necessary to store whole
53356+ key here, because it's always key of stat-data, so minor
53357+ packing locality and offset can be omitted here. But this
53358+ relies on particular key allocation scheme for stat-data, so,
53359+ for extensibility sake, whole key can be stored here.
53360+
53361+ We store key as array of bytes, because we don't want 8-byte
53362+ alignment of dir entries.
53363+ */
53364+ obj_key_id id;
53365+ /* file name. Null terminated string. */
53366+ d8 name[0];
53367+} directory_entry_format;
53368+
53369+void print_de(const char *prefix, coord_t * coord);
53370+int extract_key_de(const coord_t * coord, reiser4_key * key);
53371+int update_key_de(const coord_t * coord, const reiser4_key * key,
53372+ lock_handle * lh);
53373+char *extract_name_de(const coord_t * coord, char *buf);
53374+unsigned extract_file_type_de(const coord_t * coord);
53375+int add_entry_de(struct inode *dir, coord_t * coord,
53376+ lock_handle * lh, const struct dentry *name,
53377+ reiser4_dir_entry_desc * entry);
53378+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
53379+ lock_handle * lh, reiser4_dir_entry_desc * entry);
53380+int max_name_len_de(const struct inode *dir);
53381+
53382+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
53383+
53384+char *extract_dent_name(const coord_t * coord,
53385+ directory_entry_format * dent, char *buf);
53386+
53387+#if REISER4_LARGE_KEY
53388+#define DE_NAME_BUF_LEN (24)
53389+#else
53390+#define DE_NAME_BUF_LEN (16)
53391+#endif
53392+
53393+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
53394+#endif
53395+
53396+/* Make Linus happy.
53397+ Local variables:
53398+ c-indentation-style: "K&R"
53399+ mode-name: "LC"
53400+ c-basic-offset: 8
53401+ tab-width: 8
53402+ fill-column: 120
53403+ End:
53404+*/
53405diff --git a/fs/reiser4/plugin/item/static_stat.c b/fs/reiser4/plugin/item/static_stat.c
53406new file mode 100644
53407index 0000000..c38e44a
53408--- /dev/null
53409+++ b/fs/reiser4/plugin/item/static_stat.c
53410@@ -0,0 +1,1106 @@
53411+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53412+
53413+/* stat data manipulation. */
53414+
53415+#include "../../forward.h"
53416+#include "../../super.h"
53417+#include "../../vfs_ops.h"
53418+#include "../../inode.h"
53419+#include "../../debug.h"
53420+#include "../../dformat.h"
53421+#include "../object.h"
53422+#include "../plugin.h"
53423+#include "../plugin_header.h"
53424+#include "static_stat.h"
53425+#include "item.h"
53426+
53427+#include <linux/types.h>
53428+#include <linux/fs.h>
53429+
53430+/* see static_stat.h for explanation */
53431+
53432+/* helper function used while we are dumping/loading inode/plugin state
53433+ to/from the stat-data. */
53434+
53435+static void move_on(int *length /* space remaining in stat-data */ ,
53436+ char **area /* current coord in stat data */ ,
53437+ int size_of /* how many bytes to move forward */ )
53438+{
53439+ assert("nikita-615", length != NULL);
53440+ assert("nikita-616", area != NULL);
53441+
53442+ *length -= size_of;
53443+ *area += size_of;
53444+
53445+ assert("nikita-617", *length >= 0);
53446+}
53447+
53448+/* helper function used while loading inode/plugin state from stat-data.
53449+ Complain if there is less space in stat-data than was expected.
53450+ Can only happen on disk corruption. */
53451+static int not_enough_space(struct inode *inode /* object being processed */ ,
53452+ const char *where /* error message */ )
53453+{
53454+ assert("nikita-618", inode != NULL);
53455+
53456+ warning("nikita-619", "Not enough space in %llu while loading %s",
53457+ (unsigned long long)get_inode_oid(inode), where);
53458+
53459+ return RETERR(-EINVAL);
53460+}
53461+
53462+/* helper function used while loading inode/plugin state from
53463+ stat-data. Call it if invalid plugin id was found. */
53464+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
53465+ struct inode *inode /* object being processed */ )
53466+{
53467+ warning("nikita-620", "Unknown plugin %i in %llu",
53468+ id, (unsigned long long)get_inode_oid(inode));
53469+
53470+ return RETERR(-EINVAL);
53471+}
53472+
53473+/* this is installed as ->init_inode() method of
53474+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
53475+ Copies data from on-disk stat-data format into inode.
53476+ Handles stat-data extensions. */
53477+/* was sd_load */
53478+int init_inode_static_sd(struct inode *inode /* object being processed */ ,
53479+ char *sd /* stat-data body */ ,
53480+ int len /* length of stat-data */ )
53481+{
53482+ int result;
53483+ int bit;
53484+ int chunk;
53485+ __u16 mask;
53486+ __u64 bigmask;
53487+ reiser4_stat_data_base *sd_base;
53488+ reiser4_inode *state;
53489+
53490+ assert("nikita-625", inode != NULL);
53491+ assert("nikita-626", sd != NULL);
53492+
53493+ result = 0;
53494+ sd_base = (reiser4_stat_data_base *) sd;
53495+ state = reiser4_inode_data(inode);
53496+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
53497+ bigmask = mask;
53498+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
53499+
53500+ move_on(&len, &sd, sizeof *sd_base);
53501+ for (bit = 0, chunk = 0;
53502+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
53503+ ++bit, mask >>= 1) {
53504+ if (((bit + 1) % 16) != 0) {
53505+ /* handle extension */
53506+ sd_ext_plugin *sdplug;
53507+
53508+ if (bit >= LAST_SD_EXTENSION) {
53509+ warning("vpf-1904",
53510+ "No such extension %i in inode %llu",
53511+ bit,
53512+ (unsigned long long)
53513+ get_inode_oid(inode));
53514+
53515+ result = RETERR(-EINVAL);
53516+ break;
53517+ }
53518+
53519+ sdplug = sd_ext_plugin_by_id(bit);
53520+ if (sdplug == NULL) {
53521+ warning("nikita-627",
53522+ "No such extension %i in inode %llu",
53523+ bit,
53524+ (unsigned long long)
53525+ get_inode_oid(inode));
53526+
53527+ result = RETERR(-EINVAL);
53528+ break;
53529+ }
53530+ if (mask & 1) {
53531+ assert("nikita-628", sdplug->present);
53532+ /* alignment is not supported in node layout
53533+ plugin yet.
53534+ result = align( inode, &len, &sd,
53535+ sdplug -> alignment );
53536+ if( result != 0 )
53537+ return result; */
53538+ result = sdplug->present(inode, &sd, &len);
53539+ } else if (sdplug->absent != NULL)
53540+ result = sdplug->absent(inode);
53541+ if (result)
53542+ break;
53543+ /* else, we are looking at the last bit in 16-bit
53544+ portion of bitmask */
53545+ } else if (mask & 1) {
53546+ /* next portion of bitmask */
53547+ if (len < (int)sizeof(d16)) {
53548+ warning("nikita-629",
53549+ "No space for bitmap in inode %llu",
53550+ (unsigned long long)
53551+ get_inode_oid(inode));
53552+
53553+ result = RETERR(-EINVAL);
53554+ break;
53555+ }
53556+ mask = le16_to_cpu(get_unaligned((d16 *)sd));
53557+ bigmask <<= 16;
53558+ bigmask |= mask;
53559+ move_on(&len, &sd, sizeof(d16));
53560+ ++chunk;
53561+ if (chunk == 3) {
53562+ if (!(mask & 0x8000)) {
53563+ /* clear last bit */
53564+ mask &= ~0x8000;
53565+ continue;
53566+ }
53567+ /* too much */
53568+ warning("nikita-630",
53569+ "Too many extensions in %llu",
53570+ (unsigned long long)
53571+ get_inode_oid(inode));
53572+
53573+ result = RETERR(-EINVAL);
53574+ break;
53575+ }
53576+ } else
53577+ /* bitmask exhausted */
53578+ break;
53579+ }
53580+ state->extmask = bigmask;
53581+ if (len - (bit / 16 * sizeof(d16)) > 0) {
53582+ /* alignment in save_len_static_sd() is taken into account
53583+ -edward */
53584+ warning("nikita-631", "unused space in inode %llu",
53585+ (unsigned long long)get_inode_oid(inode));
53586+ }
53587+
53588+ return result;
53589+}
53590+
53591+/* estimates size of stat-data required to store inode.
53592+ Installed as ->save_len() method of
53593+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53594+/* was sd_len */
53595+int save_len_static_sd(struct inode *inode /* object being processed */ )
53596+{
53597+ unsigned int result;
53598+ __u64 mask;
53599+ int bit;
53600+
53601+ assert("nikita-632", inode != NULL);
53602+
53603+ result = sizeof(reiser4_stat_data_base);
53604+ mask = reiser4_inode_data(inode)->extmask;
53605+ for (bit = 0; mask != 0; ++bit, mask >>= 1) {
53606+ if (mask & 1) {
53607+ sd_ext_plugin *sdplug;
53608+
53609+ sdplug = sd_ext_plugin_by_id(bit);
53610+ assert("nikita-633", sdplug != NULL);
53611+ /* no aligment support
53612+ result +=
53613+ round_up( result, sdplug -> alignment ) - result; */
53614+ result += sdplug->save_len(inode);
53615+ }
53616+ }
53617+ result += bit / 16 * sizeof(d16);
53618+ return result;
53619+}
53620+
53621+/* saves inode into stat-data.
53622+ Installed as ->save() method of
53623+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53624+/* was sd_save */
53625+int save_static_sd(struct inode *inode /* object being processed */ ,
53626+ char **area /* where to save stat-data */ )
53627+{
53628+ int result;
53629+ __u64 emask;
53630+ int bit;
53631+ unsigned int len;
53632+ reiser4_stat_data_base *sd_base;
53633+
53634+ assert("nikita-634", inode != NULL);
53635+ assert("nikita-635", area != NULL);
53636+
53637+ result = 0;
53638+ emask = reiser4_inode_data(inode)->extmask;
53639+ sd_base = (reiser4_stat_data_base *) * area;
53640+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
53641+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
53642+
53643+ *area += sizeof *sd_base;
53644+ len = 0xffffffffu;
53645+ for (bit = 0; emask != 0; ++bit, emask >>= 1) {
53646+ if (emask & 1) {
53647+ if ((bit + 1) % 16 != 0) {
53648+ sd_ext_plugin *sdplug;
53649+ sdplug = sd_ext_plugin_by_id(bit);
53650+ assert("nikita-636", sdplug != NULL);
53651+ /* no alignment support yet
53652+ align( inode, &len, area,
53653+ sdplug -> alignment ); */
53654+ result = sdplug->save(inode, area);
53655+ if (result)
53656+ break;
53657+ } else {
53658+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
53659+ (d16 *)(*area));
53660+ /*cputod16((unsigned)(emask & 0xffff),
53661+ (d16 *) * area);*/
53662+ *area += sizeof(d16);
53663+ }
53664+ }
53665+ }
53666+ return result;
53667+}
53668+
53669+/* stat-data extension handling functions. */
53670+
53671+static int present_lw_sd(struct inode *inode /* object being processed */ ,
53672+ char **area /* position in stat-data */ ,
53673+ int *len /* remaining length */ )
53674+{
53675+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
53676+ reiser4_light_weight_stat *sd_lw;
53677+
53678+ sd_lw = (reiser4_light_weight_stat *) * area;
53679+
53680+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
53681+ inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
53682+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
53683+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
53684+ inode->i_mode &= ~S_IFIFO;
53685+ warning("", "partially converted file is encountered");
53686+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
53687+ }
53688+ move_on(len, area, sizeof *sd_lw);
53689+ return 0;
53690+ } else
53691+ return not_enough_space(inode, "lw sd");
53692+}
53693+
53694+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
53695+ * processed */ )
53696+{
53697+ return sizeof(reiser4_light_weight_stat);
53698+}
53699+
53700+static int save_lw_sd(struct inode *inode /* object being processed */ ,
53701+ char **area /* position in stat-data */ )
53702+{
53703+ reiser4_light_weight_stat *sd;
53704+ mode_t delta;
53705+
53706+ assert("nikita-2705", inode != NULL);
53707+ assert("nikita-2706", area != NULL);
53708+ assert("nikita-2707", *area != NULL);
53709+
53710+ sd = (reiser4_light_weight_stat *) * area;
53711+
53712+ delta = (reiser4_inode_get_flag(inode,
53713+ REISER4_PART_MIXED) ? S_IFIFO : 0);
53714+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
53715+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
53716+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
53717+ *area += sizeof *sd;
53718+ return 0;
53719+}
53720+
53721+static int present_unix_sd(struct inode *inode /* object being processed */ ,
53722+ char **area /* position in stat-data */ ,
53723+ int *len /* remaining length */ )
53724+{
53725+ assert("nikita-637", inode != NULL);
53726+ assert("nikita-638", area != NULL);
53727+ assert("nikita-639", *area != NULL);
53728+ assert("nikita-640", len != NULL);
53729+ assert("nikita-641", *len > 0);
53730+
53731+ if (*len >= (int)sizeof(reiser4_unix_stat)) {
53732+ reiser4_unix_stat *sd;
53733+
53734+ sd = (reiser4_unix_stat *) * area;
53735+
53736+ inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
53737+ inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
53738+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
53739+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
53740+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
53741+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53742+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
53743+ else
53744+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
53745+ move_on(len, area, sizeof *sd);
53746+ return 0;
53747+ } else
53748+ return not_enough_space(inode, "unix sd");
53749+}
53750+
53751+static int absent_unix_sd(struct inode *inode /* object being processed */ )
53752+{
53753+ inode->i_uid = get_super_private(inode->i_sb)->default_uid;
53754+ inode->i_gid = get_super_private(inode->i_sb)->default_gid;
53755+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
53756+ inode_set_bytes(inode, inode->i_size);
53757+ /* mark inode as lightweight, so that caller (lookup_common) will
53758+ complete initialisation by copying [ug]id from a parent. */
53759+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
53760+ return 0;
53761+}
53762+
53763+/* Audited by: green(2002.06.14) */
53764+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53765+ * processed */ )
53766+{
53767+ return sizeof(reiser4_unix_stat);
53768+}
53769+
53770+static int save_unix_sd(struct inode *inode /* object being processed */ ,
53771+ char **area /* position in stat-data */ )
53772+{
53773+ reiser4_unix_stat *sd;
53774+
53775+ assert("nikita-642", inode != NULL);
53776+ assert("nikita-643", area != NULL);
53777+ assert("nikita-644", *area != NULL);
53778+
53779+ sd = (reiser4_unix_stat *) * area;
53780+ put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53781+ put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53782+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53783+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53784+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53785+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53786+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53787+ else
53788+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53789+ *area += sizeof *sd;
53790+ return 0;
53791+}
53792+
53793+static int
53794+present_large_times_sd(struct inode *inode /* object being processed */ ,
53795+ char **area /* position in stat-data */ ,
53796+ int *len /* remaining length */ )
53797+{
53798+ if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53799+ reiser4_large_times_stat *sd_lt;
53800+
53801+ sd_lt = (reiser4_large_times_stat *) * area;
53802+
53803+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53804+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53805+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53806+
53807+ move_on(len, area, sizeof *sd_lt);
53808+ return 0;
53809+ } else
53810+ return not_enough_space(inode, "large times sd");
53811+}
53812+
53813+static int
53814+save_len_large_times_sd(struct inode *inode UNUSED_ARG
53815+ /* object being processed */ )
53816+{
53817+ return sizeof(reiser4_large_times_stat);
53818+}
53819+
53820+static int
53821+save_large_times_sd(struct inode *inode /* object being processed */ ,
53822+ char **area /* position in stat-data */ )
53823+{
53824+ reiser4_large_times_stat *sd;
53825+
53826+ assert("nikita-2817", inode != NULL);
53827+ assert("nikita-2818", area != NULL);
53828+ assert("nikita-2819", *area != NULL);
53829+
53830+ sd = (reiser4_large_times_stat *) * area;
53831+
53832+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53833+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53834+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53835+
53836+ *area += sizeof *sd;
53837+ return 0;
53838+}
53839+
53840+/* symlink stat data extension */
53841+
53842+/* allocate memory for symlink target and attach it to inode->i_private */
53843+static int
53844+symlink_target_to_inode(struct inode *inode, const char *target, int len)
53845+{
53846+ assert("vs-845", inode->i_private == NULL);
53847+ assert("vs-846", !reiser4_inode_get_flag(inode,
53848+ REISER4_GENERIC_PTR_USED));
53849+ /* FIXME-VS: this is prone to deadlock. Not more than other similar
53850+ places, though */
53851+ inode->i_private = kmalloc((size_t) len + 1,
53852+ reiser4_ctx_gfp_mask_get());
53853+ if (!inode->i_private)
53854+ return RETERR(-ENOMEM);
53855+
53856+ memcpy((char *)(inode->i_private), target, (size_t) len);
53857+ ((char *)(inode->i_private))[len] = 0;
53858+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53859+ return 0;
53860+}
53861+
53862+/* this is called on read_inode. There is nothing to do actually, but some
53863+ sanity checks */
53864+static int present_symlink_sd(struct inode *inode, char **area, int *len)
53865+{
53866+ int result;
53867+ int length;
53868+ reiser4_symlink_stat *sd;
53869+
53870+ length = (int)inode->i_size;
53871+ /*
53872+ * *len is number of bytes in stat data item from *area to the end of
53873+ * item. It must be not less than size of symlink + 1 for ending 0
53874+ */
53875+ if (length > *len)
53876+ return not_enough_space(inode, "symlink");
53877+
53878+ if (*(*area + length) != 0) {
53879+ warning("vs-840", "Symlink is not zero terminated");
53880+ return RETERR(-EIO);
53881+ }
53882+
53883+ sd = (reiser4_symlink_stat *) * area;
53884+ result = symlink_target_to_inode(inode, sd->body, length);
53885+
53886+ move_on(len, area, length + 1);
53887+ return result;
53888+}
53889+
53890+static int save_len_symlink_sd(struct inode *inode)
53891+{
53892+ return inode->i_size + 1;
53893+}
53894+
53895+/* this is called on create and update stat data. Do nothing on update but
53896+ update @area */
53897+static int save_symlink_sd(struct inode *inode, char **area)
53898+{
53899+ int result;
53900+ int length;
53901+ reiser4_symlink_stat *sd;
53902+
53903+ length = (int)inode->i_size;
53904+ /* inode->i_size must be set already */
53905+ assert("vs-841", length);
53906+
53907+ result = 0;
53908+ sd = (reiser4_symlink_stat *) * area;
53909+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53910+ const char *target;
53911+
53912+ target = (const char *)(inode->i_private);
53913+ inode->i_private = NULL;
53914+
53915+ result = symlink_target_to_inode(inode, target, length);
53916+
53917+ /* copy symlink to stat data */
53918+ memcpy(sd->body, target, (size_t) length);
53919+ (*area)[length] = 0;
53920+ } else {
53921+ /* there is nothing to do in update but move area */
53922+ assert("vs-844",
53923+ !memcmp(inode->i_private, sd->body,
53924+ (size_t) length + 1));
53925+ }
53926+
53927+ *area += (length + 1);
53928+ return result;
53929+}
53930+
53931+static int present_flags_sd(struct inode *inode /* object being processed */ ,
53932+ char **area /* position in stat-data */ ,
53933+ int *len /* remaining length */ )
53934+{
53935+ assert("nikita-645", inode != NULL);
53936+ assert("nikita-646", area != NULL);
53937+ assert("nikita-647", *area != NULL);
53938+ assert("nikita-648", len != NULL);
53939+ assert("nikita-649", *len > 0);
53940+
53941+ if (*len >= (int)sizeof(reiser4_flags_stat)) {
53942+ reiser4_flags_stat *sd;
53943+
53944+ sd = (reiser4_flags_stat *) * area;
53945+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53946+ move_on(len, area, sizeof *sd);
53947+ return 0;
53948+ } else
53949+ return not_enough_space(inode, "generation and attrs");
53950+}
53951+
53952+/* Audited by: green(2002.06.14) */
53953+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53954+ * processed */ )
53955+{
53956+ return sizeof(reiser4_flags_stat);
53957+}
53958+
53959+static int save_flags_sd(struct inode *inode /* object being processed */ ,
53960+ char **area /* position in stat-data */ )
53961+{
53962+ reiser4_flags_stat *sd;
53963+
53964+ assert("nikita-650", inode != NULL);
53965+ assert("nikita-651", area != NULL);
53966+ assert("nikita-652", *area != NULL);
53967+
53968+ sd = (reiser4_flags_stat *) * area;
53969+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53970+ *area += sizeof *sd;
53971+ return 0;
53972+}
53973+
53974+static int absent_plugin_sd(struct inode *inode);
53975+static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53976+ char **area /* position in stat-data */ ,
53977+ int *len /* remaining length */,
53978+ int is_pset /* 1 if plugin set, 0 if heir set. */)
53979+{
53980+ reiser4_plugin_stat *sd;
53981+ reiser4_plugin *plugin;
53982+ reiser4_inode *info;
53983+ int i;
53984+ __u16 mask;
53985+ int result;
53986+ int num_of_plugins;
53987+
53988+ assert("nikita-653", inode != NULL);
53989+ assert("nikita-654", area != NULL);
53990+ assert("nikita-655", *area != NULL);
53991+ assert("nikita-656", len != NULL);
53992+ assert("nikita-657", *len > 0);
53993+
53994+ if (*len < (int)sizeof(reiser4_plugin_stat))
53995+ return not_enough_space(inode, "plugin");
53996+
53997+ sd = (reiser4_plugin_stat *) * area;
53998+ info = reiser4_inode_data(inode);
53999+
54000+ mask = 0;
54001+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
54002+ move_on(len, area, sizeof *sd);
54003+ result = 0;
54004+ for (i = 0; i < num_of_plugins; ++i) {
54005+ reiser4_plugin_slot *slot;
54006+ reiser4_plugin_type type;
54007+ pset_member memb;
54008+
54009+ slot = (reiser4_plugin_slot *) * area;
54010+ if (*len < (int)sizeof *slot)
54011+ return not_enough_space(inode, "additional plugin");
54012+
54013+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
54014+ type = aset_member_to_type_unsafe(memb);
54015+
54016+ if (type == REISER4_PLUGIN_TYPES) {
54017+ warning("nikita-3502",
54018+ "wrong %s member (%i) for %llu", is_pset ?
54019+ "pset" : "hset", memb,
54020+ (unsigned long long)get_inode_oid(inode));
54021+ return RETERR(-EINVAL);
54022+ }
54023+ plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
54024+ type, &slot->id);
54025+ if (plugin == NULL)
54026+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
54027+
54028+ /* plugin is loaded into inode, mark this into inode's
54029+ bitmask of loaded non-standard plugins */
54030+ if (!(mask & (1 << memb))) {
54031+ mask |= (1 << memb);
54032+ } else {
54033+ warning("nikita-658", "duplicate plugin for %llu",
54034+ (unsigned long long)get_inode_oid(inode));
54035+ return RETERR(-EINVAL);
54036+ }
54037+ move_on(len, area, sizeof *slot);
54038+ /* load plugin data, if any */
54039+ if (plugin->h.pops != NULL && plugin->h.pops->load)
54040+ result = plugin->h.pops->load(inode, plugin, area, len);
54041+ else
54042+ result = aset_set_unsafe(is_pset ? &info->pset :
54043+ &info->hset, memb, plugin);
54044+ if (result)
54045+ return result;
54046+ }
54047+ if (is_pset) {
54048+ /* if object plugin wasn't loaded from stat-data, guess it by
54049+ mode bits */
54050+ plugin = file_plugin_to_plugin(inode_file_plugin(inode));
54051+ if (plugin == NULL)
54052+ result = absent_plugin_sd(inode);
54053+ info->plugin_mask = mask;
54054+ } else
54055+ info->heir_mask = mask;
54056+
54057+ return result;
54058+}
54059+
54060+static int present_pset_sd(struct inode *inode, char **area, int *len) {
54061+ return present_plugin_sd(inode, area, len, 1 /* pset */);
54062+}
54063+
54064+/* Determine object plugin for @inode based on i_mode.
54065+
54066+ Many objects in reiser4 file system are controlled by standard object
54067+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
54068+
54069+ For such files we don't explicitly store plugin id in object stat
54070+ data. Rather required plugin is guessed from mode bits, where file "type"
54071+ is encoded (see stat(2)).
54072+*/
54073+static int
54074+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
54075+{
54076+ int fplug_id;
54077+ int dplug_id;
54078+ reiser4_inode *info;
54079+
54080+ assert("nikita-736", inode != NULL);
54081+
54082+ dplug_id = fplug_id = -1;
54083+
54084+ switch (inode->i_mode & S_IFMT) {
54085+ case S_IFSOCK:
54086+ case S_IFBLK:
54087+ case S_IFCHR:
54088+ case S_IFIFO:
54089+ fplug_id = SPECIAL_FILE_PLUGIN_ID;
54090+ break;
54091+ case S_IFLNK:
54092+ fplug_id = SYMLINK_FILE_PLUGIN_ID;
54093+ break;
54094+ case S_IFDIR:
54095+ fplug_id = DIRECTORY_FILE_PLUGIN_ID;
54096+ dplug_id = HASHED_DIR_PLUGIN_ID;
54097+ break;
54098+ default:
54099+ warning("nikita-737", "wrong file mode: %o", inode->i_mode);
54100+ return RETERR(-EIO);
54101+ case S_IFREG:
54102+ fplug_id = UNIX_FILE_PLUGIN_ID;
54103+ break;
54104+ }
54105+ info = reiser4_inode_data(inode);
54106+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
54107+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
54108+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
54109+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
54110+ return 0;
54111+}
54112+
54113+/* Audited by: green(2002.06.14) */
54114+static int absent_plugin_sd(struct inode *inode /* object being processed */ )
54115+{
54116+ int result;
54117+
54118+ assert("nikita-659", inode != NULL);
54119+
54120+ result = guess_plugin_by_mode(inode);
54121+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
54122+ but setup_inode_ops() will call make_bad_inode().
54123+ Another, more logical but bit more complex solution is to add
54124+ "bad-file plugin". */
54125+ /* FIXME-VS: activate was called here */
54126+ return result;
54127+}
54128+
54129+/* helper function for plugin_sd_save_len(): calculate how much space
54130+ required to save state of given plugin */
54131+/* Audited by: green(2002.06.14) */
54132+static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
54133+ struct inode *inode /* object being processed */ ,
54134+ pset_member memb,
54135+ int len, int is_pset)
54136+{
54137+ reiser4_inode *info;
54138+ assert("nikita-661", inode != NULL);
54139+
54140+ if (plugin == NULL)
54141+ return len;
54142+
54143+ info = reiser4_inode_data(inode);
54144+ if (is_pset ?
54145+ info->plugin_mask & (1 << memb) :
54146+ info->heir_mask & (1 << memb)) {
54147+ len += sizeof(reiser4_plugin_slot);
54148+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
54149+ /* non-standard plugin, call method */
54150+ /* commented as it is incompatible with alignment
54151+ * policy in save_plug() -edward */
54152+ /* len = round_up(len, plugin->h.pops->alignment); */
54153+ len += plugin->h.pops->save_len(inode, plugin);
54154+ }
54155+ }
54156+ return len;
54157+}
54158+
54159+/* calculate how much space is required to save state of all plugins,
54160+ associated with inode */
54161+static int save_len_plugin_sd(struct inode *inode /* object being processed */,
54162+ int is_pset)
54163+{
54164+ int len;
54165+ int last;
54166+ reiser4_inode *state;
54167+ pset_member memb;
54168+
54169+ assert("nikita-663", inode != NULL);
54170+
54171+ state = reiser4_inode_data(inode);
54172+
54173+ /* common case: no non-standard plugins */
54174+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
54175+ return 0;
54176+ len = sizeof(reiser4_plugin_stat);
54177+ last = PSET_LAST;
54178+
54179+ for (memb = 0; memb < last; ++memb) {
54180+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
54181+ inode, memb, len, is_pset);
54182+ }
54183+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
54184+ return len;
54185+}
54186+
54187+static int save_len_pset_sd(struct inode *inode) {
54188+ return save_len_plugin_sd(inode, 1 /* pset */);
54189+}
54190+
54191+/* helper function for plugin_sd_save(): save plugin, associated with
54192+ inode. */
54193+static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
54194+ struct inode *inode /* object being processed */ ,
54195+ int memb /* what element of pset is saved */ ,
54196+ char **area /* position in stat-data */ ,
54197+ int *count /* incremented if plugin were actually saved. */,
54198+ int is_pset /* 1 for plugin set, 0 for heir set */)
54199+{
54200+ reiser4_plugin_slot *slot;
54201+ int fake_len;
54202+ int result;
54203+
54204+ assert("nikita-665", inode != NULL);
54205+ assert("nikita-666", area != NULL);
54206+ assert("nikita-667", *area != NULL);
54207+
54208+ if (plugin == NULL)
54209+ return 0;
54210+
54211+ if (is_pset ?
54212+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
54213+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
54214+ return 0;
54215+ slot = (reiser4_plugin_slot *) * area;
54216+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
54217+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
54218+ fake_len = (int)0xffff;
54219+ move_on(&fake_len, area, sizeof *slot);
54220+ ++*count;
54221+ result = 0;
54222+ if (plugin->h.pops != NULL) {
54223+ if (plugin->h.pops->save != NULL)
54224+ result = plugin->h.pops->save(inode, plugin, area);
54225+ }
54226+ return result;
54227+}
54228+
54229+/* save state of all non-standard plugins associated with inode */
54230+static int save_plugin_sd(struct inode *inode /* object being processed */ ,
54231+ char **area /* position in stat-data */,
54232+ int is_pset /* 1 for pset, 0 for hset */)
54233+{
54234+ int fake_len;
54235+ int result = 0;
54236+ int num_of_plugins;
54237+ reiser4_plugin_stat *sd;
54238+ reiser4_inode *state;
54239+ pset_member memb;
54240+
54241+ assert("nikita-669", inode != NULL);
54242+ assert("nikita-670", area != NULL);
54243+ assert("nikita-671", *area != NULL);
54244+
54245+ state = reiser4_inode_data(inode);
54246+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
54247+ return 0;
54248+ sd = (reiser4_plugin_stat *) * area;
54249+ fake_len = (int)0xffff;
54250+ move_on(&fake_len, area, sizeof *sd);
54251+
54252+ num_of_plugins = 0;
54253+ for (memb = 0; memb < PSET_LAST; ++memb) {
54254+ result = save_plug(aset_get(is_pset ? state->pset : state->hset,
54255+ memb),
54256+ inode, memb, area, &num_of_plugins, is_pset);
54257+ if (result != 0)
54258+ break;
54259+ }
54260+
54261+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
54262+ return result;
54263+}
54264+
54265+static int save_pset_sd(struct inode *inode, char **area) {
54266+ return save_plugin_sd(inode, area, 1 /* pset */);
54267+}
54268+
54269+static int present_hset_sd(struct inode *inode, char **area, int *len) {
54270+ return present_plugin_sd(inode, area, len, 0 /* hset */);
54271+}
54272+
54273+static int save_len_hset_sd(struct inode *inode) {
54274+ return save_len_plugin_sd(inode, 0 /* pset */);
54275+}
54276+
54277+static int save_hset_sd(struct inode *inode, char **area) {
54278+ return save_plugin_sd(inode, area, 0 /* hset */);
54279+}
54280+
54281+/* helper function for crypto_sd_present(), crypto_sd_save.
54282+ Allocates memory for crypto stat, keyid and attaches it to the inode */
54283+static int extract_crypto_stat (struct inode * inode,
54284+ reiser4_crypto_stat * sd)
54285+{
54286+ crypto_stat_t * info;
54287+ assert("edward-11", !inode_crypto_stat(inode));
54288+ assert("edward-1413",
54289+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
54290+ /* create and attach a crypto-stat without secret key loaded */
54291+ info = reiser4_alloc_crypto_stat(inode);
54292+ if (IS_ERR(info))
54293+ return PTR_ERR(info);
54294+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
54295+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
54296+ reiser4_attach_crypto_stat(inode, info);
54297+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
54298+ return 0;
54299+}
54300+
54301+/* crypto stat-data extension */
54302+
54303+static int present_crypto_sd(struct inode *inode, char **area, int *len)
54304+{
54305+ int result;
54306+ reiser4_crypto_stat *sd;
54307+ digest_plugin *dplug = inode_digest_plugin(inode);
54308+
54309+ assert("edward-06", dplug != NULL);
54310+ assert("edward-684", dplug->fipsize);
54311+ assert("edward-07", area != NULL);
54312+ assert("edward-08", *area != NULL);
54313+ assert("edward-09", len != NULL);
54314+ assert("edward-10", *len > 0);
54315+
54316+ if (*len < (int)sizeof(reiser4_crypto_stat)) {
54317+ return not_enough_space(inode, "crypto-sd");
54318+ }
54319+ /* *len is number of bytes in stat data item from *area to the end of
54320+ item. It must be not less than size of this extension */
54321+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
54322+
54323+ sd = (reiser4_crypto_stat *) * area;
54324+ result = extract_crypto_stat(inode, sd);
54325+ move_on(len, area, sizeof(*sd) + dplug->fipsize);
54326+
54327+ return result;
54328+}
54329+
54330+static int save_len_crypto_sd(struct inode *inode)
54331+{
54332+ return sizeof(reiser4_crypto_stat) +
54333+ inode_digest_plugin(inode)->fipsize;
54334+}
54335+
54336+static int save_crypto_sd(struct inode *inode, char **area)
54337+{
54338+ int result = 0;
54339+ reiser4_crypto_stat *sd;
54340+ crypto_stat_t * info = inode_crypto_stat(inode);
54341+ digest_plugin *dplug = inode_digest_plugin(inode);
54342+
54343+ assert("edward-12", dplug != NULL);
54344+ assert("edward-13", area != NULL);
54345+ assert("edward-14", *area != NULL);
54346+ assert("edward-15", info != NULL);
54347+ assert("edward-1414", info->keyid != NULL);
54348+ assert("edward-1415", info->keysize != 0);
54349+ assert("edward-76", reiser4_inode_data(inode) != NULL);
54350+
54351+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
54352+ /* file is just created */
54353+ sd = (reiser4_crypto_stat *) *area;
54354+ /* copy everything but private key to the disk stat-data */
54355+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
54356+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
54357+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
54358+ }
54359+ *area += (sizeof(*sd) + dplug->fipsize);
54360+ return result;
54361+}
54362+
54363+static int eio(struct inode *inode, char **area, int *len)
54364+{
54365+ return RETERR(-EIO);
54366+}
54367+
54368+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
54369+ [LIGHT_WEIGHT_STAT] = {
54370+ .h = {
54371+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54372+ .id = LIGHT_WEIGHT_STAT,
54373+ .pops = NULL,
54374+ .label = "light-weight sd",
54375+ .desc = "sd for light-weight files",
54376+ .linkage = {NULL,NULL}
54377+ },
54378+ .present = present_lw_sd,
54379+ .absent = NULL,
54380+ .save_len = save_len_lw_sd,
54381+ .save = save_lw_sd,
54382+ .alignment = 8
54383+ },
54384+ [UNIX_STAT] = {
54385+ .h = {
54386+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54387+ .id = UNIX_STAT,
54388+ .pops = NULL,
54389+ .label = "unix-sd",
54390+ .desc = "unix stat-data fields",
54391+ .linkage = {NULL,NULL}
54392+ },
54393+ .present = present_unix_sd,
54394+ .absent = absent_unix_sd,
54395+ .save_len = save_len_unix_sd,
54396+ .save = save_unix_sd,
54397+ .alignment = 8
54398+ },
54399+ [LARGE_TIMES_STAT] = {
54400+ .h = {
54401+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54402+ .id = LARGE_TIMES_STAT,
54403+ .pops = NULL,
54404+ .label = "64time-sd",
54405+ .desc = "nanosecond resolution for times",
54406+ .linkage = {NULL,NULL}
54407+ },
54408+ .present = present_large_times_sd,
54409+ .absent = NULL,
54410+ .save_len = save_len_large_times_sd,
54411+ .save = save_large_times_sd,
54412+ .alignment = 8
54413+ },
54414+ [SYMLINK_STAT] = {
54415+ /* stat data of symlink has this extension */
54416+ .h = {
54417+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54418+ .id = SYMLINK_STAT,
54419+ .pops = NULL,
54420+ .label = "symlink-sd",
54421+ .desc =
54422+ "stat data is appended with symlink name",
54423+ .linkage = {NULL,NULL}
54424+ },
54425+ .present = present_symlink_sd,
54426+ .absent = NULL,
54427+ .save_len = save_len_symlink_sd,
54428+ .save = save_symlink_sd,
54429+ .alignment = 8
54430+ },
54431+ [PLUGIN_STAT] = {
54432+ .h = {
54433+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54434+ .id = PLUGIN_STAT,
54435+ .pops = NULL,
54436+ .label = "plugin-sd",
54437+ .desc = "plugin stat-data fields",
54438+ .linkage = {NULL,NULL}
54439+ },
54440+ .present = present_pset_sd,
54441+ .absent = absent_plugin_sd,
54442+ .save_len = save_len_pset_sd,
54443+ .save = save_pset_sd,
54444+ .alignment = 8
54445+ },
54446+ [HEIR_STAT] = {
54447+ .h = {
54448+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54449+ .id = HEIR_STAT,
54450+ .pops = NULL,
54451+ .label = "heir-plugin-sd",
54452+ .desc = "heir plugin stat-data fields",
54453+ .linkage = {NULL,NULL}
54454+ },
54455+ .present = present_hset_sd,
54456+ .absent = NULL,
54457+ .save_len = save_len_hset_sd,
54458+ .save = save_hset_sd,
54459+ .alignment = 8
54460+ },
54461+ [FLAGS_STAT] = {
54462+ .h = {
54463+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54464+ .id = FLAGS_STAT,
54465+ .pops = NULL,
54466+ .label = "flags-sd",
54467+ .desc = "inode bit flags",
54468+ .linkage = {NULL, NULL}
54469+ },
54470+ .present = present_flags_sd,
54471+ .absent = NULL,
54472+ .save_len = save_len_flags_sd,
54473+ .save = save_flags_sd,
54474+ .alignment = 8
54475+ },
54476+ [CAPABILITIES_STAT] = {
54477+ .h = {
54478+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54479+ .id = CAPABILITIES_STAT,
54480+ .pops = NULL,
54481+ .label = "capabilities-sd",
54482+ .desc = "capabilities",
54483+ .linkage = {NULL, NULL}
54484+ },
54485+ .present = eio,
54486+ .absent = NULL,
54487+ .save_len = save_len_flags_sd,
54488+ .save = save_flags_sd,
54489+ .alignment = 8
54490+ },
54491+ [CRYPTO_STAT] = {
54492+ .h = {
54493+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54494+ .id = CRYPTO_STAT,
54495+ .pops = NULL,
54496+ .label = "crypto-sd",
54497+ .desc = "secret key size and id",
54498+ .linkage = {NULL, NULL}
54499+ },
54500+ .present = present_crypto_sd,
54501+ .absent = NULL,
54502+ .save_len = save_len_crypto_sd,
54503+ .save = save_crypto_sd,
54504+ .alignment = 8
54505+ }
54506+};
54507+
54508+/* Make Linus happy.
54509+ Local variables:
54510+ c-indentation-style: "K&R"
54511+ mode-name: "LC"
54512+ c-basic-offset: 8
54513+ tab-width: 8
54514+ fill-column: 120
54515+ End:
54516+*/
54517diff --git a/fs/reiser4/plugin/item/static_stat.h b/fs/reiser4/plugin/item/static_stat.h
54518new file mode 100644
54519index 0000000..dd20eb3
54520--- /dev/null
54521+++ b/fs/reiser4/plugin/item/static_stat.h
54522@@ -0,0 +1,224 @@
54523+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54524+
54525+/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
54526+
54527+In the case where each file has not less than the fields needed by the
54528+stat() syscall, it is more compact to store those fields in this
54529+struct.
54530+
54531+If this item does not exist, then all stats are dynamically resolved.
54532+At the moment, we either resolve all stats dynamically or all of them
54533+statically. If you think this is not fully optimal, and the rest of
54534+reiser4 is working, then fix it...:-)
54535+
54536+*/
54537+
54538+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
54539+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
54540+
54541+#include "../../forward.h"
54542+#include "../../dformat.h"
54543+
54544+#include <linux/fs.h> /* for struct inode */
54545+
54546+/* Stat data layout: goals and implementation.
54547+
54548+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
54549+ them, including not having semantic metadata attached to them.
54550+
54551+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
54552+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
54553+ sized structure because the statically sized structure knows without recording it what the names and lengths of the
54554+ attributes are.
54555+
54556+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file
54557+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
54558+ file in their use of file attributes.
54559+
54560+ Yet this compromise deserves to be compromised a little.
54561+
54562+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension
54563+ bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
54564+
54565+ If the first bit of the extension bitmask bit is 0, we have light-weight file whose attributes are either inherited
54566+ from parent directory (as uid, gid) or initialised to some sane values.
54567+
54568+ To capitalize on existing code infrastructure, extensions are
54569+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
54570+ Each stat-data extension plugin implements four methods:
54571+
54572+ ->present() called by sd_load() when this extension is found in stat-data
54573+ ->absent() called by sd_load() when this extension is not found in stat-data
54574+ ->save_len() called by sd_len() to calculate total length of stat-data
54575+ ->save() called by sd_save() to store extension data into stat-data
54576+
54577+ Implementation is in fs/reiser4/plugin/item/static_stat.c
54578+*/
54579+
54580+/* stat-data extension. Please order this by presumed frequency of use */
54581+typedef enum {
54582+ /* support for light-weight files */
54583+ LIGHT_WEIGHT_STAT,
54584+ /* data required to implement unix stat(2) call. Layout is in
54585+ reiser4_unix_stat. If this is not present, file is light-weight */
54586+ UNIX_STAT,
54587+ /* this contains additional set of 32bit [anc]time fields to implement
54588+ nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
54589+ if this extension is governed by 32bittimes mount option. */
54590+ LARGE_TIMES_STAT,
54591+ /* stat data has link name included */
54592+ SYMLINK_STAT,
54593+ /* on-disk slots of non-standard plugins for main plugin table
54594+ (@reiser4_inode->pset), that is, plugins that cannot be deduced
54595+ from file mode bits), for example, aggregation, interpolation etc. */
54596+ PLUGIN_STAT,
54597+ /* this extension contains persistent inode flags. These flags are
54598+ single bits: immutable, append, only, etc. Layout is in
54599+ reiser4_flags_stat. */
54600+ FLAGS_STAT,
54601+ /* this extension contains capabilities sets, associated with this
54602+ file. Layout is in reiser4_capabilities_stat */
54603+ CAPABILITIES_STAT,
54604+ /* this extension contains size and public id of the secret key.
54605+ Layout is in reiser4_crypto_stat */
54606+ CRYPTO_STAT,
54607+ /* on-disk slots of non-default plugins for inheritance, which
54608+ are extracted to special plugin table (@reiser4_inode->hset).
54609+ By default, children of the object will inherit plugins from
54610+ its main plugin table (pset). */
54611+ HEIR_STAT,
54612+ LAST_SD_EXTENSION,
54613+ /*
54614+ * init_inode_static_sd() iterates over extension mask until all
54615+ * non-zero bits are processed. This means, that neither ->present(),
54616+ * nor ->absent() methods will be called for stat-data extensions that
54617+ * go after last present extension. But some basic extensions, we want
54618+ * either ->absent() or ->present() method to be called, because these
54619+ * extensions set up something in inode even when they are not
54620+ * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
54621+ * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
54622+ * ->present(), or ->absent() method will be called, independently of
54623+ * what other extensions are present.
54624+ */
54625+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
54626+} sd_ext_bits;
54627+
54628+/* minimal stat-data. This allows to support light-weight files. */
54629+typedef struct reiser4_stat_data_base {
54630+ /* 0 */ __le16 extmask;
54631+ /* 2 */
54632+} PACKED reiser4_stat_data_base;
54633+
54634+typedef struct reiser4_light_weight_stat {
54635+ /* 0 */ __le16 mode;
54636+ /* 2 */ __le32 nlink;
54637+ /* 6 */ __le64 size;
54638+ /* size in bytes */
54639+ /* 14 */
54640+} PACKED reiser4_light_weight_stat;
54641+
54642+typedef struct reiser4_unix_stat {
54643+ /* owner id */
54644+ /* 0 */ __le32 uid;
54645+ /* group id */
54646+ /* 4 */ __le32 gid;
54647+ /* access time */
54648+ /* 8 */ __le32 atime;
54649+ /* modification time */
54650+ /* 12 */ __le32 mtime;
54651+ /* change time */
54652+ /* 16 */ __le32 ctime;
54653+ union {
54654+ /* minor:major for device files */
54655+ /* 20 */ __le64 rdev;
54656+ /* bytes used by file */
54657+ /* 20 */ __le64 bytes;
54658+ } u;
54659+ /* 28 */
54660+} PACKED reiser4_unix_stat;
54661+
54662+/* symlink stored as part of inode */
54663+typedef struct reiser4_symlink_stat {
54664+ char body[0];
54665+} PACKED reiser4_symlink_stat;
54666+
54667+typedef struct reiser4_plugin_slot {
54668+ /* 0 */ __le16 pset_memb;
54669+ /* 2 */ __le16 id;
54670+ /* 4 *//* here plugin stores its persistent state */
54671+} PACKED reiser4_plugin_slot;
54672+
54673+/* stat-data extension for files with non-standard plugin. */
54674+typedef struct reiser4_plugin_stat {
54675+ /* number of additional plugins, associated with this object */
54676+ /* 0 */ __le16 plugins_no;
54677+ /* 2 */ reiser4_plugin_slot slot[0];
54678+ /* 2 */
54679+} PACKED reiser4_plugin_stat;
54680+
54681+/* stat-data extension for inode flags. Currently it is just fixed-width 32
54682+ * bit mask. If need arise, this can be replaced with variable width
54683+ * bitmask. */
54684+typedef struct reiser4_flags_stat {
54685+ /* 0 */ __le32 flags;
54686+ /* 4 */
54687+} PACKED reiser4_flags_stat;
54688+
54689+typedef struct reiser4_capabilities_stat {
54690+ /* 0 */ __le32 effective;
54691+ /* 8 */ __le32 permitted;
54692+ /* 16 */
54693+} PACKED reiser4_capabilities_stat;
54694+
54695+typedef struct reiser4_cluster_stat {
54696+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
54697+ /* 0 */ d8 cluster_shift;
54698+ /* 1 */
54699+} PACKED reiser4_cluster_stat;
54700+
54701+typedef struct reiser4_crypto_stat {
54702+ /* secret key size, bits */
54703+ /* 0 */ d16 keysize;
54704+ /* secret key id */
54705+ /* 2 */ d8 keyid[0];
54706+ /* 2 */
54707+} PACKED reiser4_crypto_stat;
54708+
54709+typedef struct reiser4_large_times_stat {
54710+ /* access time */
54711+ /* 0 */ d32 atime;
54712+ /* modification time */
54713+ /* 4 */ d32 mtime;
54714+ /* change time */
54715+ /* 8 */ d32 ctime;
54716+ /* 12 */
54717+} PACKED reiser4_large_times_stat;
54718+
54719+/* this structure is filled by sd_item_stat */
54720+typedef struct sd_stat {
54721+ int dirs;
54722+ int files;
54723+ int others;
54724+} sd_stat;
54725+
54726+/* plugin->item.common.* */
54727+extern void print_sd(const char *prefix, coord_t * coord);
54728+extern void item_stat_static_sd(const coord_t * coord, void *vp);
54729+
54730+/* plugin->item.s.sd.* */
54731+extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
54732+extern int save_len_static_sd(struct inode *inode);
54733+extern int save_static_sd(struct inode *inode, char **area);
54734+
54735+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
54736+#endif
54737+
54738+/* Make Linus happy.
54739+ Local variables:
54740+ c-indentation-style: "K&R"
54741+ mode-name: "LC"
54742+ c-basic-offset: 8
54743+ tab-width: 8
54744+ fill-column: 120
54745+ End:
54746+*/
54747diff --git a/fs/reiser4/plugin/item/tail.c b/fs/reiser4/plugin/item/tail.c
54748new file mode 100644
54749index 0000000..281dd36
54750--- /dev/null
54751+++ b/fs/reiser4/plugin/item/tail.c
54752@@ -0,0 +1,812 @@
54753+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54754+
54755+#include "item.h"
54756+#include "../../inode.h"
54757+#include "../../page_cache.h"
54758+#include "../../carry.h"
54759+#include "../../vfs_ops.h"
54760+
54761+#include <linux/quotaops.h>
54762+#include <asm/uaccess.h>
54763+#include <linux/swap.h>
54764+#include <linux/writeback.h>
54765+
54766+/* plugin->u.item.b.max_key_inside */
54767+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
54768+{
54769+ item_key_by_coord(coord, key);
54770+ set_key_offset(key, get_key_offset(reiser4_max_key()));
54771+ return key;
54772+}
54773+
54774+/* plugin->u.item.b.can_contain_key */
54775+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54776+ const reiser4_item_data *data)
54777+{
54778+ reiser4_key item_key;
54779+
54780+ if (item_plugin_by_coord(coord) != data->iplug)
54781+ return 0;
54782+
54783+ item_key_by_coord(coord, &item_key);
54784+ if (get_key_locality(key) != get_key_locality(&item_key) ||
54785+ get_key_objectid(key) != get_key_objectid(&item_key))
54786+ return 0;
54787+
54788+ return 1;
54789+}
54790+
54791+/* plugin->u.item.b.mergeable
54792+ first item is of tail type */
54793+/* Audited by: green(2002.06.14) */
54794+int mergeable_tail(const coord_t *p1, const coord_t *p2)
54795+{
54796+ reiser4_key key1, key2;
54797+
54798+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
54799+ UNIX_FILE_METADATA_ITEM_TYPE));
54800+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54801+
54802+ if (item_id_by_coord(p2) != FORMATTING_ID) {
54803+ /* second item is of another type */
54804+ return 0;
54805+ }
54806+
54807+ item_key_by_coord(p1, &key1);
54808+ item_key_by_coord(p2, &key2);
54809+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
54810+ get_key_objectid(&key1) != get_key_objectid(&key2)
54811+ || get_key_type(&key1) != get_key_type(&key2)) {
54812+ /* items of different objects */
54813+ return 0;
54814+ }
54815+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54816+ /* not adjacent items */
54817+ return 0;
54818+ }
54819+ return 1;
54820+}
54821+
54822+/* plugin->u.item.b.print
54823+ plugin->u.item.b.check */
54824+
54825+/* plugin->u.item.b.nr_units */
54826+pos_in_node_t nr_units_tail(const coord_t * coord)
54827+{
54828+ return item_length_by_coord(coord);
54829+}
54830+
54831+/* plugin->u.item.b.lookup */
54832+lookup_result
54833+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54834+{
54835+ reiser4_key item_key;
54836+ __u64 lookuped, offset;
54837+ unsigned nr_units;
54838+
54839+ item_key_by_coord(coord, &item_key);
54840+ offset = get_key_offset(item_key_by_coord(coord, &item_key));
54841+ nr_units = nr_units_tail(coord);
54842+
54843+ /* key we are looking for must be greater than key of item @coord */
54844+ assert("vs-416", keygt(key, &item_key));
54845+
54846+ /* offset we are looking for */
54847+ lookuped = get_key_offset(key);
54848+
54849+ if (lookuped >= offset && lookuped < offset + nr_units) {
54850+ /* byte we are looking for is in this item */
54851+ coord->unit_pos = lookuped - offset;
54852+ coord->between = AT_UNIT;
54853+ return CBK_COORD_FOUND;
54854+ }
54855+
54856+ /* set coord after last unit */
54857+ coord->unit_pos = nr_units - 1;
54858+ coord->between = AFTER_UNIT;
54859+ return bias ==
54860+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54861+}
54862+
54863+/* plugin->u.item.b.paste */
54864+int
54865+paste_tail(coord_t *coord, reiser4_item_data *data,
54866+ carry_plugin_info *info UNUSED_ARG)
54867+{
54868+ unsigned old_item_length;
54869+ char *item;
54870+
54871+ /* length the item had before resizing has been performed */
54872+ old_item_length = item_length_by_coord(coord) - data->length;
54873+
54874+ /* tail items never get pasted in the middle */
54875+ assert("vs-363",
54876+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54877+ (coord->unit_pos == old_item_length - 1 &&
54878+ coord->between == AFTER_UNIT) ||
54879+ (coord->unit_pos == 0 && old_item_length == 0
54880+ && coord->between == AT_UNIT));
54881+
54882+ item = item_body_by_coord(coord);
54883+ if (coord->unit_pos == 0)
54884+ /* make space for pasted data when pasting at the beginning of
54885+ the item */
54886+ memmove(item + data->length, item, old_item_length);
54887+
54888+ if (coord->between == AFTER_UNIT)
54889+ coord->unit_pos++;
54890+
54891+ if (data->data) {
54892+ assert("vs-554", data->user == 0 || data->user == 1);
54893+ if (data->user) {
54894+ assert("nikita-3035", reiser4_schedulable());
54895+ /* copy from user space */
54896+ if (__copy_from_user(item + coord->unit_pos,
54897+ (const char __user *)data->data,
54898+ (unsigned)data->length))
54899+ return RETERR(-EFAULT);
54900+ } else
54901+ /* copy from kernel space */
54902+ memcpy(item + coord->unit_pos, data->data,
54903+ (unsigned)data->length);
54904+ } else {
54905+ memset(item + coord->unit_pos, 0, (unsigned)data->length);
54906+ }
54907+ return 0;
54908+}
54909+
54910+/* plugin->u.item.b.fast_paste */
54911+
54912+/* plugin->u.item.b.can_shift
54913+ number of units is returned via return value, number of bytes via @size. For
54914+ tail items they coincide */
54915+int
54916+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54917+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54918+ unsigned *size, unsigned want)
54919+{
54920+ /* make sure that that we do not want to shift more than we have */
54921+ assert("vs-364", want > 0
54922+ && want <= (unsigned)item_length_by_coord(source));
54923+
54924+ *size = min(want, free_space);
54925+ return *size;
54926+}
54927+
54928+/* plugin->u.item.b.copy_units */
54929+void
54930+copy_units_tail(coord_t * target, coord_t * source,
54931+ unsigned from, unsigned count,
54932+ shift_direction where_is_free_space,
54933+ unsigned free_space UNUSED_ARG)
54934+{
54935+ /* make sure that item @target is expanded already */
54936+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54937+ assert("vs-370", free_space >= count);
54938+
54939+ if (where_is_free_space == SHIFT_LEFT) {
54940+ /* append item @target with @count first bytes of @source */
54941+ assert("vs-365", from == 0);
54942+
54943+ memcpy((char *)item_body_by_coord(target) +
54944+ item_length_by_coord(target) - count,
54945+ (char *)item_body_by_coord(source), count);
54946+ } else {
54947+ /* target item is moved to right already */
54948+ reiser4_key key;
54949+
54950+ assert("vs-367",
54951+ (unsigned)item_length_by_coord(source) == from + count);
54952+
54953+ memcpy((char *)item_body_by_coord(target),
54954+ (char *)item_body_by_coord(source) + from, count);
54955+
54956+ /* new units are inserted before first unit in an item,
54957+ therefore, we have to update item key */
54958+ item_key_by_coord(source, &key);
54959+ set_key_offset(&key, get_key_offset(&key) + from);
54960+
54961+ node_plugin_by_node(target->node)->update_item_key(target, &key,
54962+ NULL /*info */);
54963+ }
54964+}
54965+
54966+/* plugin->u.item.b.create_hook */
54967+
54968+/* item_plugin->b.kill_hook
54969+ this is called when @count units starting from @from-th one are going to be removed
54970+ */
54971+int
54972+kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54973+ pos_in_node_t count, struct carry_kill_data *kdata)
54974+{
54975+ reiser4_key key;
54976+ loff_t start, end;
54977+
54978+ assert("vs-1577", kdata);
54979+ assert("vs-1579", kdata->inode);
54980+
54981+ item_key_by_coord(coord, &key);
54982+ start = get_key_offset(&key) + from;
54983+ end = start + count;
54984+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54985+ return 0;
54986+}
54987+
54988+/* plugin->u.item.b.shift_hook */
54989+
54990+/* helper for kill_units_tail and cut_units_tail */
54991+static int
54992+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54993+ reiser4_key * smallest_removed, reiser4_key * new_first)
54994+{
54995+ pos_in_node_t count;
54996+
54997+ /* this method is only called to remove part of item */
54998+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54999+ /* tails items are never cut from the middle of an item */
55000+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
55001+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
55002+
55003+ count = to - from + 1;
55004+
55005+ if (smallest_removed) {
55006+ /* store smallest key removed */
55007+ item_key_by_coord(coord, smallest_removed);
55008+ set_key_offset(smallest_removed,
55009+ get_key_offset(smallest_removed) + from);
55010+ }
55011+ if (new_first) {
55012+ /* head of item is cut */
55013+ assert("vs-1529", from == 0);
55014+
55015+ item_key_by_coord(coord, new_first);
55016+ set_key_offset(new_first,
55017+ get_key_offset(new_first) + from + count);
55018+ }
55019+
55020+ if (REISER4_DEBUG)
55021+ memset((char *)item_body_by_coord(coord) + from, 0, count);
55022+ return count;
55023+}
55024+
55025+/* plugin->u.item.b.cut_units */
55026+int
55027+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
55028+ struct carry_cut_data *cdata UNUSED_ARG,
55029+ reiser4_key * smallest_removed, reiser4_key * new_first)
55030+{
55031+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
55032+}
55033+
55034+/* plugin->u.item.b.kill_units */
55035+int
55036+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
55037+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
55038+ reiser4_key * new_first)
55039+{
55040+ kill_hook_tail(coord, from, to - from + 1, kdata);
55041+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
55042+}
55043+
55044+/* plugin->u.item.b.unit_key */
55045+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
55046+{
55047+ assert("vs-375", coord_is_existing_unit(coord));
55048+
55049+ item_key_by_coord(coord, key);
55050+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
55051+
55052+ return key;
55053+}
55054+
55055+/* plugin->u.item.b.estimate
55056+ plugin->u.item.b.item_data_by_flow */
55057+
55058+/* tail redpage function. It is called from readpage_tail(). */
55059+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
55060+{
55061+ tap_t tap;
55062+ int result;
55063+ coord_t coord;
55064+ lock_handle lh;
55065+ int count, mapped;
55066+ struct inode *inode;
55067+ char *pagedata;
55068+
55069+ /* saving passed coord in order to do not move it by tap. */
55070+ init_lh(&lh);
55071+ copy_lh(&lh, uf_coord->lh);
55072+ inode = page->mapping->host;
55073+ coord_dup(&coord, &uf_coord->coord);
55074+
55075+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
55076+
55077+ if ((result = reiser4_tap_load(&tap)))
55078+ goto out_tap_done;
55079+
55080+ /* lookup until page is filled up. */
55081+ for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
55082+ /* number of bytes to be copied to page */
55083+ count = item_length_by_coord(&coord) - coord.unit_pos;
55084+ if (count > PAGE_CACHE_SIZE - mapped)
55085+ count = PAGE_CACHE_SIZE - mapped;
55086+
55087+ /* attach @page to address space and get data address */
55088+ pagedata = kmap_atomic(page, KM_USER0);
55089+
55090+ /* copy tail item to page */
55091+ memcpy(pagedata + mapped,
55092+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
55093+ count);
55094+ mapped += count;
55095+
55096+ flush_dcache_page(page);
55097+
55098+ /* dettach page from address space */
55099+ kunmap_atomic(pagedata, KM_USER0);
55100+
55101+ /* Getting next tail item. */
55102+ if (mapped < PAGE_CACHE_SIZE) {
55103+ /*
55104+ * unlock page in order to avoid keep it locked
55105+ * during tree lookup, which takes long term locks
55106+ */
55107+ unlock_page(page);
55108+
55109+ /* getting right neighbour. */
55110+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
55111+
55112+ /* lock page back */
55113+ lock_page(page);
55114+ if (PageUptodate(page)) {
55115+ /*
55116+ * another thread read the page, we have
55117+ * nothing to do
55118+ */
55119+ result = 0;
55120+ goto out_unlock_page;
55121+ }
55122+
55123+ if (result) {
55124+ if (result == -E_NO_NEIGHBOR) {
55125+ /*
55126+ * rigth neighbor is not a formatted
55127+ * node
55128+ */
55129+ result = 0;
55130+ goto done;
55131+ } else {
55132+ goto out_tap_relse;
55133+ }
55134+ } else {
55135+ if (!inode_file_plugin(inode)->
55136+ owns_item(inode, &coord)) {
55137+ /* item of another file is found */
55138+ result = 0;
55139+ goto done;
55140+ }
55141+ }
55142+ }
55143+ }
55144+
55145+ done:
55146+ if (mapped != PAGE_CACHE_SIZE) {
55147+ pagedata = kmap_atomic(page, KM_USER0);
55148+ memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped);
55149+ flush_dcache_page(page);
55150+ kunmap_atomic(pagedata, KM_USER0);
55151+ }
55152+ SetPageUptodate(page);
55153+ out_unlock_page:
55154+ unlock_page(page);
55155+ out_tap_relse:
55156+ reiser4_tap_relse(&tap);
55157+ out_tap_done:
55158+ reiser4_tap_done(&tap);
55159+ return result;
55160+}
55161+
55162+/*
55163+ plugin->s.file.readpage
55164+ reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
55165+ or
55166+ filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_tail
55167+
55168+ At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
55169+ item. */
55170+int readpage_tail(void *vp, struct page *page)
55171+{
55172+ uf_coord_t *uf_coord = vp;
55173+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
55174+ ON_DEBUG(reiser4_key key);
55175+
55176+ assert("umka-2515", PageLocked(page));
55177+ assert("umka-2516", !PageUptodate(page));
55178+ assert("umka-2517", !jprivate(page) && !PagePrivate(page));
55179+ assert("umka-2518", page->mapping && page->mapping->host);
55180+
55181+ assert("umka-2519", znode_is_loaded(coord->node));
55182+ assert("umka-2520", item_is_tail(coord));
55183+ assert("umka-2521", coord_is_existing_unit(coord));
55184+ assert("umka-2522", znode_is_rlocked(coord->node));
55185+ assert("umka-2523",
55186+ page->mapping->host->i_ino ==
55187+ get_key_objectid(item_key_by_coord(coord, &key)));
55188+
55189+ return do_readpage_tail(uf_coord, page);
55190+}
55191+
55192+/**
55193+ * overwrite_tail
55194+ * @flow:
55195+ * @coord:
55196+ *
55197+ * Overwrites tail item or its part by user data. Returns number of bytes
55198+ * written or error code.
55199+ */
55200+static int overwrite_tail(flow_t *flow, coord_t *coord)
55201+{
55202+ unsigned count;
55203+
55204+ assert("vs-570", flow->user == 1);
55205+ assert("vs-946", flow->data);
55206+ assert("vs-947", coord_is_existing_unit(coord));
55207+ assert("vs-948", znode_is_write_locked(coord->node));
55208+ assert("nikita-3036", reiser4_schedulable());
55209+
55210+ count = item_length_by_coord(coord) - coord->unit_pos;
55211+ if (count > flow->length)
55212+ count = flow->length;
55213+
55214+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
55215+ (const char __user *)flow->data, count))
55216+ return RETERR(-EFAULT);
55217+
55218+ znode_make_dirty(coord->node);
55219+ return count;
55220+}
55221+
55222+/**
55223+ * insert_first_tail
55224+ * @inode:
55225+ * @flow:
55226+ * @coord:
55227+ * @lh:
55228+ *
55229+ * Returns number of bytes written or error code.
55230+ */
55231+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
55232+ coord_t *coord, lock_handle *lh)
55233+{
55234+ int result;
55235+ loff_t to_write;
55236+ unix_file_info_t *uf_info;
55237+
55238+ if (get_key_offset(&flow->key) != 0) {
55239+ /*
55240+ * file is empty and we have to write not to the beginning of
55241+ * file. Create a hole at the beginning of file. On success
55242+ * insert_flow returns 0 as number of written bytes which is
55243+ * what we have to return on padding a file with holes
55244+ */
55245+ flow->data = NULL;
55246+ flow->length = get_key_offset(&flow->key);
55247+ set_key_offset(&flow->key, 0);
55248+ /*
55249+ * holes in files built of tails are stored just like if there
55250+ * were real data which are all zeros. Therefore we have to
55251+ * allocate quota here as well
55252+ */
55253+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55254+ return RETERR(-EDQUOT);
55255+ result = reiser4_insert_flow(coord, lh, flow);
55256+ if (flow->length)
55257+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55258+
55259+ uf_info = unix_file_inode_data(inode);
55260+
55261+ /*
55262+ * first item insertion is only possible when writing to empty
55263+ * file or performing tail conversion
55264+ */
55265+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
55266+ (reiser4_inode_get_flag(inode,
55267+ REISER4_PART_MIXED) &&
55268+ reiser4_inode_get_flag(inode,
55269+ REISER4_PART_IN_CONV))));
55270+ /* if file was empty - update its state */
55271+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
55272+ uf_info->container = UF_CONTAINER_TAILS;
55273+ return result;
55274+ }
55275+
55276+ /* check quota before appending data */
55277+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55278+ return RETERR(-EDQUOT);
55279+
55280+ to_write = flow->length;
55281+ result = reiser4_insert_flow(coord, lh, flow);
55282+ if (flow->length)
55283+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55284+ return (to_write - flow->length) ? (to_write - flow->length) : result;
55285+}
55286+
55287+/**
55288+ * append_tail
55289+ * @inode:
55290+ * @flow:
55291+ * @coord:
55292+ * @lh:
55293+ *
55294+ * Returns number of bytes written or error code.
55295+ */
55296+static ssize_t append_tail(struct inode *inode,
55297+ flow_t *flow, coord_t *coord, lock_handle *lh)
55298+{
55299+ int result;
55300+ reiser4_key append_key;
55301+ loff_t to_write;
55302+
55303+ if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
55304+ flow->data = NULL;
55305+ flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
55306+ set_key_offset(&flow->key, get_key_offset(&append_key));
55307+ /*
55308+ * holes in files built of tails are stored just like if there
55309+ * were real data which are all zeros. Therefore we have to
55310+ * allocate quota here as well
55311+ */
55312+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55313+ return RETERR(-EDQUOT);
55314+ result = reiser4_insert_flow(coord, lh, flow);
55315+ if (flow->length)
55316+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55317+ return result;
55318+ }
55319+
55320+ /* check quota before appending data */
55321+ if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55322+ return RETERR(-EDQUOT);
55323+
55324+ to_write = flow->length;
55325+ result = reiser4_insert_flow(coord, lh, flow);
55326+ if (flow->length)
55327+ DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55328+ return (to_write - flow->length) ? (to_write - flow->length) : result;
55329+}
55330+
55331+/**
55332+ * write_tail_reserve_space - reserve space for tail write operation
55333+ * @inode:
55334+ *
55335+ * Estimates and reserves space which may be required for writing one flow to a
55336+ * file
55337+ */
55338+static int write_extent_reserve_space(struct inode *inode)
55339+{
55340+ __u64 count;
55341+ reiser4_tree *tree;
55342+
55343+ /*
55344+ * to write one flow to a file by tails we have to reserve disk space for:
55345+
55346+ * 1. find_file_item may have to insert empty node to the tree (empty
55347+ * leaf node between two extent items). This requires 1 block and
55348+ * number of blocks which are necessary to perform insertion of an
55349+ * internal item into twig level.
55350+ *
55351+ * 2. flow insertion
55352+ *
55353+ * 3. stat data update
55354+ */
55355+ tree = reiser4_tree_by_inode(inode);
55356+ count = estimate_one_insert_item(tree) +
55357+ estimate_insert_flow(tree->height) +
55358+ estimate_one_insert_item(tree);
55359+ grab_space_enable();
55360+ return reiser4_grab_space(count, 0 /* flags */);
55361+}
55362+
55363+#define PAGE_PER_FLOW 4
55364+
55365+static loff_t faultin_user_pages(const char __user *buf, size_t count)
55366+{
55367+ loff_t faulted;
55368+ int to_fault;
55369+
55370+ if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
55371+ count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
55372+ faulted = 0;
55373+ while (count > 0) {
55374+ to_fault = PAGE_CACHE_SIZE;
55375+ if (count < to_fault)
55376+ to_fault = count;
55377+ fault_in_pages_readable(buf + faulted, to_fault);
55378+ count -= to_fault;
55379+ faulted += to_fault;
55380+ }
55381+ return faulted;
55382+}
55383+
55384+/**
55385+ * reiser4_write_extent - write method of tail item plugin
55386+ * @file: file to write to
55387+ * @buf: address of user-space buffer
55388+ * @count: number of bytes to write
55389+ * @pos: position in file to write to
55390+ *
55391+ * Returns number of written bytes or error code.
55392+ */
55393+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
55394+ size_t count, loff_t *pos)
55395+{
55396+ struct inode *inode;
55397+ struct hint hint;
55398+ int result;
55399+ flow_t flow;
55400+ coord_t *coord;
55401+ lock_handle *lh;
55402+ znode *loaded;
55403+
55404+ inode = file->f_dentry->d_inode;
55405+
55406+ if (write_extent_reserve_space(inode))
55407+ return RETERR(-ENOSPC);
55408+
55409+ result = load_file_hint(file, &hint);
55410+ BUG_ON(result != 0);
55411+
55412+ flow.length = faultin_user_pages(buf, count);
55413+ flow.user = 1;
55414+ memcpy(&flow.data, &buf, sizeof(buf));
55415+ flow.op = WRITE_OP;
55416+ key_by_inode_and_offset_common(inode, *pos, &flow.key);
55417+
55418+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
55419+ if (IS_CBKERR(result))
55420+ return result;
55421+
55422+ coord = &hint.ext_coord.coord;
55423+ lh = hint.ext_coord.lh;
55424+
55425+ result = zload(coord->node);
55426+ BUG_ON(result != 0);
55427+ loaded = coord->node;
55428+
55429+ if (coord->between == AFTER_UNIT) {
55430+ /* append with data or hole */
55431+ result = append_tail(inode, &flow, coord, lh);
55432+ } else if (coord->between == AT_UNIT) {
55433+ /* overwrite */
55434+ result = overwrite_tail(&flow, coord);
55435+ } else {
55436+ /* no items of this file yet. insert data or hole */
55437+ result = insert_first_tail(inode, &flow, coord, lh);
55438+ }
55439+ zrelse(loaded);
55440+ if (result < 0) {
55441+ done_lh(lh);
55442+ return result;
55443+ }
55444+
55445+ /* seal and unlock znode */
55446+ hint.ext_coord.valid = 0;
55447+ if (hint.ext_coord.valid)
55448+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
55449+ else
55450+ reiser4_unset_hint(&hint);
55451+
55452+ save_file_hint(file, &hint);
55453+ return result;
55454+}
55455+
55456+#if REISER4_DEBUG
55457+
55458+static int
55459+coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
55460+{
55461+ reiser4_key item_key;
55462+
55463+ assert("vs-1356", coord_is_existing_unit(coord));
55464+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
55465+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
55466+ return get_key_offset(key) ==
55467+ get_key_offset(&item_key) + coord->unit_pos;
55468+
55469+}
55470+
55471+#endif
55472+
55473+/* plugin->u.item.s.file.read */
55474+int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
55475+{
55476+ unsigned count;
55477+ int item_length;
55478+ coord_t *coord;
55479+ uf_coord_t *uf_coord;
55480+
55481+ uf_coord = &hint->ext_coord;
55482+ coord = &uf_coord->coord;
55483+
55484+ assert("vs-571", f->user == 1);
55485+ assert("vs-571", f->data);
55486+ assert("vs-967", coord && coord->node);
55487+ assert("vs-1117", znode_is_rlocked(coord->node));
55488+ assert("vs-1118", znode_is_loaded(coord->node));
55489+
55490+ assert("nikita-3037", reiser4_schedulable());
55491+ assert("vs-1357", coord_matches_key_tail(coord, &f->key));
55492+
55493+ /* calculate number of bytes to read off the item */
55494+ item_length = item_length_by_coord(coord);
55495+ count = item_length_by_coord(coord) - coord->unit_pos;
55496+ if (count > f->length)
55497+ count = f->length;
55498+
55499+ /* user page has to be brought in so that major page fault does not
55500+ * occur here when a long-term lock is held */
55501+ if (__copy_to_user((char __user *)f->data,
55502+ ((char *)item_body_by_coord(coord) + coord->unit_pos),
55503+ count))
55504+ return RETERR(-EFAULT);
55505+
55506+ /* probably mark_page_accessed() should only be called if
55507+ * coord->unit_pos is zero. */
55508+ mark_page_accessed(znode_page(coord->node));
55509+ move_flow_forward(f, count);
55510+
55511+ coord->unit_pos += count;
55512+ if (item_length == coord->unit_pos) {
55513+ coord->unit_pos--;
55514+ coord->between = AFTER_UNIT;
55515+ }
55516+
55517+ return 0;
55518+}
55519+
55520+/*
55521+ plugin->u.item.s.file.append_key
55522+ key of the first byte which comes next after the last byte addressed by this item
55523+*/
55524+reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
55525+{
55526+ item_key_by_coord(coord, key);
55527+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
55528+ return key;
55529+}
55530+
55531+/* plugin->u.item.s.file.init_coord_extension */
55532+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
55533+{
55534+ uf_coord->valid = 1;
55535+}
55536+
55537+/*
55538+ plugin->u.item.s.file.get_block
55539+*/
55540+int
55541+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
55542+{
55543+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
55544+
55545+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
55546+ /* if node hasn't obtained its block number yet, return 0.
55547+ * Lets avoid upsetting users with some cosmic numbers beyond
55548+ * the device capacity.*/
55549+ *block = 0;
55550+ else
55551+ *block = *znode_get_block(coord->node);
55552+ return 0;
55553+}
55554+
55555+/*
55556+ * Local variables:
55557+ * c-indentation-style: "K&R"
55558+ * mode-name: "LC"
55559+ * c-basic-offset: 8
55560+ * tab-width: 8
55561+ * fill-column: 79
55562+ * scroll-step: 1
55563+ * End:
55564+ */
55565diff --git a/fs/reiser4/plugin/item/tail.h b/fs/reiser4/plugin/item/tail.h
55566new file mode 100644
55567index 0000000..459fa27
55568--- /dev/null
55569+++ b/fs/reiser4/plugin/item/tail.h
55570@@ -0,0 +1,58 @@
55571+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55572+
55573+#if !defined( __REISER4_TAIL_H__ )
55574+#define __REISER4_TAIL_H__
55575+
55576+typedef struct {
55577+ int not_used;
55578+} tail_coord_extension_t;
55579+
55580+struct cut_list;
55581+
55582+/* plugin->u.item.b.* */
55583+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
55584+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
55585+ const reiser4_item_data *);
55586+int mergeable_tail(const coord_t * p1, const coord_t * p2);
55587+pos_in_node_t nr_units_tail(const coord_t *);
55588+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
55589+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
55590+int can_shift_tail(unsigned free_space, coord_t * source,
55591+ znode * target, shift_direction, unsigned *size,
55592+ unsigned want);
55593+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
55594+ unsigned count, shift_direction, unsigned free_space);
55595+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
55596+ struct carry_kill_data *);
55597+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55598+ struct carry_cut_data *, reiser4_key * smallest_removed,
55599+ reiser4_key * new_first);
55600+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55601+ struct carry_kill_data *, reiser4_key * smallest_removed,
55602+ reiser4_key * new_first);
55603+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
55604+
55605+/* plugin->u.item.s.* */
55606+ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
55607+ size_t count, loff_t *pos);
55608+int reiser4_read_tail(struct file *, flow_t *, hint_t *);
55609+int readpage_tail(void *vp, struct page *page);
55610+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
55611+void init_coord_extension_tail(uf_coord_t *, loff_t offset);
55612+int get_block_address_tail(const coord_t *, sector_t, sector_t *);
55613+int item_balance_dirty_pages(struct address_space *, const flow_t *,
55614+ hint_t *, int back_to_dirty, int set_hint);
55615+
55616+/* __REISER4_TAIL_H__ */
55617+#endif
55618+
55619+/* Make Linus happy.
55620+ Local variables:
55621+ c-indentation-style: "K&R"
55622+ mode-name: "LC"
55623+ c-basic-offset: 8
55624+ tab-width: 8
55625+ fill-column: 120
55626+ scroll-step: 1
55627+ End:
55628+*/
55629diff --git a/fs/reiser4/plugin/node/Makefile b/fs/reiser4/plugin/node/Makefile
55630new file mode 100644
55631index 0000000..9400627
55632--- /dev/null
55633+++ b/fs/reiser4/plugin/node/Makefile
55634@@ -0,0 +1,5 @@
55635+obj-$(CONFIG_REISER4_FS) += node_plugins.o
55636+
55637+node_plugins-objs := \
55638+ node.o \
55639+ node40.o
55640diff --git a/fs/reiser4/plugin/node/node.c b/fs/reiser4/plugin/node/node.c
55641new file mode 100644
55642index 0000000..179a4a7
55643--- /dev/null
55644+++ b/fs/reiser4/plugin/node/node.c
55645@@ -0,0 +1,131 @@
55646+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55647+
55648+/* Node plugin interface.
55649+
55650+ Description: The tree provides the abstraction of flows, which it
55651+ internally fragments into items which it stores in nodes.
55652+
55653+ A key_atom is a piece of data bound to a single key.
55654+
55655+ For reasonable space efficiency to be achieved it is often
55656+ necessary to store key_atoms in the nodes in the form of items, where
55657+ an item is a sequence of key_atoms of the same or similar type. It is
55658+ more space-efficient, because the item can implement (very)
55659+ efficient compression of key_atom's bodies using internal knowledge
55660+ about their semantics, and it can often avoid having a key for each
55661+ key_atom. Each type of item has specific operations implemented by its
55662+ item handler (see balance.c).
55663+
55664+ Rationale: the rest of the code (specifically balancing routines)
55665+ accesses leaf level nodes through this interface. This way we can
55666+ implement various block layouts and even combine various layouts
55667+ within the same tree. Balancing/allocating algorithms should not
55668+ care about peculiarities of splitting/merging specific item types,
55669+ but rather should leave that to the item's item handler.
55670+
55671+ Items, including those that provide the abstraction of flows, have
55672+ the property that if you move them in part or in whole to another
55673+ node, the balancing code invokes their is_left_mergeable()
55674+ item_operation to determine if they are mergeable with their new
55675+ neighbor in the node you have moved them to. For some items the
55676+ is_left_mergeable() function always returns null.
55677+
55678+ When moving the bodies of items from one node to another:
55679+
55680+ if a partial item is shifted to another node the balancing code invokes
55681+ an item handler method to handle the item splitting.
55682+
55683+ if the balancing code needs to merge with an item in the node it
55684+ is shifting to, it will invoke an item handler method to handle
55685+ the item merging.
55686+
55687+ if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
55688+ adjusting the item headers after the move is done using the node handler.
55689+*/
55690+
55691+#include "../../forward.h"
55692+#include "../../debug.h"
55693+#include "../../key.h"
55694+#include "../../coord.h"
55695+#include "../plugin_header.h"
55696+#include "../item/item.h"
55697+#include "node.h"
55698+#include "../plugin.h"
55699+#include "../../znode.h"
55700+#include "../../tree.h"
55701+#include "../../super.h"
55702+#include "../../reiser4.h"
55703+
55704+/**
55705+ * leftmost_key_in_node - get the smallest key in node
55706+ * @node:
55707+ * @key: store result here
55708+ *
55709+ * Stores the leftmost key of @node in @key.
55710+ */
55711+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55712+{
55713+ assert("nikita-1634", node != NULL);
55714+ assert("nikita-1635", key != NULL);
55715+
55716+ if (!node_is_empty(node)) {
55717+ coord_t first_item;
55718+
55719+ coord_init_first_unit(&first_item, (znode *) node);
55720+ item_key_by_coord(&first_item, key);
55721+ } else
55722+ *key = *reiser4_max_key();
55723+ return key;
55724+}
55725+
55726+node_plugin node_plugins[LAST_NODE_ID] = {
55727+ [NODE40_ID] = {
55728+ .h = {
55729+ .type_id = REISER4_NODE_PLUGIN_TYPE,
55730+ .id = NODE40_ID,
55731+ .pops = NULL,
55732+ .label = "unified",
55733+ .desc = "unified node layout",
55734+ .linkage = {NULL, NULL}
55735+ },
55736+ .item_overhead = item_overhead_node40,
55737+ .free_space = free_space_node40,
55738+ .lookup = lookup_node40,
55739+ .num_of_items = num_of_items_node40,
55740+ .item_by_coord = item_by_coord_node40,
55741+ .length_by_coord = length_by_coord_node40,
55742+ .plugin_by_coord = plugin_by_coord_node40,
55743+ .key_at = key_at_node40,
55744+ .estimate = estimate_node40,
55745+ .check = check_node40,
55746+ .parse = parse_node40,
55747+ .init = init_node40,
55748+#ifdef GUESS_EXISTS
55749+ .guess = guess_node40,
55750+#endif
55751+ .change_item_size = change_item_size_node40,
55752+ .create_item = create_item_node40,
55753+ .update_item_key = update_item_key_node40,
55754+ .cut_and_kill = kill_node40,
55755+ .cut = cut_node40,
55756+ .shift = shift_node40,
55757+ .shrink_item = shrink_item_node40,
55758+ .fast_insert = fast_insert_node40,
55759+ .fast_paste = fast_paste_node40,
55760+ .fast_cut = fast_cut_node40,
55761+ .max_item_size = max_item_size_node40,
55762+ .prepare_removal = prepare_removal_node40,
55763+ .set_item_plugin = set_item_plugin_node40
55764+ }
55765+};
55766+
55767+/*
55768+ Local variables:
55769+ c-indentation-style: "K&R"
55770+ mode-name: "LC"
55771+ c-basic-offset: 8
55772+ tab-width: 8
55773+ fill-column: 120
55774+ scroll-step: 1
55775+ End:
55776+*/
55777diff --git a/fs/reiser4/plugin/node/node.h b/fs/reiser4/plugin/node/node.h
55778new file mode 100644
55779index 0000000..af0c641
55780--- /dev/null
55781+++ b/fs/reiser4/plugin/node/node.h
55782@@ -0,0 +1,272 @@
55783+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55784+
55785+/* We need a definition of the default node layout here. */
55786+
55787+/* Generally speaking, it is best to have free space in the middle of the
55788+ node so that two sets of things can grow towards it, and to have the
55789+ item bodies on the left so that the last one of them grows into free
55790+ space. We optimize for the case where we append new items to the end
55791+ of the node, or grow the last item, because it hurts nothing to so
55792+ optimize and it is a common special case to do massive insertions in
55793+ increasing key order (and one of cases more likely to have a real user
55794+ notice the delay time for).
55795+
55796+ formatted leaf default layout: (leaf1)
55797+
55798+ |node header:item bodies:free space:key + pluginid + item offset|
55799+
55800+ We grow towards the middle, optimizing layout for the case where we
55801+ append new items to the end of the node. The node header is fixed
55802+ length. Keys, and item offsets plus pluginids for the items
55803+ corresponding to them are in increasing key order, and are fixed
55804+ length. Item offsets are relative to start of node (16 bits creating
55805+ a node size limit of 64k, 12 bits might be a better choice....). Item
55806+ bodies are in decreasing key order. Item bodies have a variable size.
55807+ There is a one to one to one mapping of keys to item offsets to item
55808+ bodies. Item offsets consist of pointers to the zeroth byte of the
55809+ item body. Item length equals the start of the next item minus the
55810+ start of this item, except the zeroth item whose length equals the end
55811+ of the node minus the start of that item (plus a byte). In other
55812+ words, the item length is not recorded anywhere, and it does not need
55813+ to be since it is computable.
55814+
55815+ Leaf variable length items and keys layout : (lvar)
55816+
55817+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55818+
55819+ We grow towards the middle, optimizing layout for the case where we
55820+ append new items to the end of the node. The node header is fixed
55821+ length. Keys and item offsets for the items corresponding to them are
55822+ in increasing key order, and keys are variable length. Item offsets
55823+ are relative to start of node (16 bits). Item bodies are in
55824+ decreasing key order. Item bodies have a variable size. There is a
55825+ one to one to one mapping of keys to item offsets to item bodies.
55826+ Item offsets consist of pointers to the zeroth byte of the item body.
55827+ Item length equals the start of the next item's key minus the start of
55828+ this item, except the zeroth item whose length equals the end of the
55829+ node minus the start of that item (plus a byte).
55830+
55831+ leaf compressed keys layout: (lcomp)
55832+
55833+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55834+
55835+ We grow towards the middle, optimizing layout for the case where we
55836+ append new items to the end of the node. The node header is fixed
55837+ length. Keys and item offsets for the items corresponding to them are
55838+ in increasing key order, and keys are variable length. The "key
55839+ inherit" field indicates how much of the key prefix is identical to
55840+ the previous key (stem compression as described in "Managing
55841+ Gigabytes" is used). key_inherit is a one byte integer. The
55842+ intra-node searches performed through this layout are linear searches,
55843+ and this is theorized to not hurt performance much due to the high
55844+ cost of processor stalls on modern CPUs, and the small number of keys
55845+ in a single node. Item offsets are relative to start of node (16
55846+ bits). Item bodies are in decreasing key order. Item bodies have a
55847+ variable size. There is a one to one to one mapping of keys to item
55848+ offsets to item bodies. Item offsets consist of pointers to the
55849+ zeroth byte of the item body. Item length equals the start of the
55850+ next item minus the start of this item, except the zeroth item whose
55851+ length equals the end of the node minus the start of that item (plus a
55852+ byte). In other words, item length and key length is not recorded
55853+ anywhere, and it does not need to be since it is computable.
55854+
55855+ internal node default layout: (idef1)
55856+
55857+ just like ldef1 except that item bodies are either blocknrs of
55858+ children or extents, and moving them may require updating parent
55859+ pointers in the nodes that they point to.
55860+*/
55861+
55862+/* There is an inherent 3-way tradeoff between optimizing and
55863+ exchanging disks between different architectures and code
55864+ complexity. This is optimal and simple and inexchangeable.
55865+ Someone else can do the code for exchanging disks and make it
55866+ complex. It would not be that hard. Using other than the PAGE_SIZE
55867+ might be suboptimal.
55868+*/
55869+
55870+#if !defined( __REISER4_NODE_H__ )
55871+#define __REISER4_NODE_H__
55872+
55873+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55874+
55875+#include "../../dformat.h"
55876+#include "../plugin_header.h"
55877+
55878+#include <linux/types.h>
55879+
55880+typedef enum {
55881+ NS_FOUND = 0,
55882+ NS_NOT_FOUND = -ENOENT
55883+} node_search_result;
55884+
55885+/* Maximal possible space overhead for creation of new item in a node */
55886+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55887+
55888+typedef enum {
55889+ REISER4_NODE_DKEYS = (1 << 0),
55890+ REISER4_NODE_TREE_STABLE = (1 << 1)
55891+} reiser4_node_check_flag;
55892+
55893+/* cut and cut_and_kill have too long list of parameters. This structure is just to save some space on the stack */
55894+struct cut_list {
55895+ coord_t *from;
55896+ coord_t *to;
55897+ const reiser4_key *from_key;
55898+ const reiser4_key *to_key;
55899+ reiser4_key *smallest_removed;
55900+ carry_plugin_info *info;
55901+ __u32 flags;
55902+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55903+ lock_handle *left;
55904+ lock_handle *right;
55905+};
55906+
55907+struct carry_cut_data;
55908+struct carry_kill_data;
55909+
55910+/* The responsibility of the node plugin is to store and give access
55911+ to the sequence of items within the node. */
55912+typedef struct node_plugin {
55913+ /* generic plugin fields */
55914+ plugin_header h;
55915+
55916+ /* calculates the amount of space that will be required to store an
55917+ item which is in addition to the space consumed by the item body.
55918+ (the space consumed by the item body can be gotten by calling
55919+ item->estimate) */
55920+ size_t(*item_overhead) (const znode * node, flow_t * f);
55921+
55922+ /* returns free space by looking into node (i.e., without using
55923+ znode->free_space). */
55924+ size_t(*free_space) (znode * node);
55925+ /* search within the node for the one item which might
55926+ contain the key, invoking item->search_within to search within
55927+ that item to see if it is in there */
55928+ node_search_result(*lookup) (znode * node, const reiser4_key * key,
55929+ lookup_bias bias, coord_t * coord);
55930+ /* number of items in node */
55931+ int (*num_of_items) (const znode * node);
55932+
55933+ /* store information about item in @coord in @data */
55934+ /* break into several node ops, don't add any more uses of this before doing so */
55935+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55936+ char *(*item_by_coord) (const coord_t * coord);
55937+ int (*length_by_coord) (const coord_t * coord);
55938+ item_plugin *(*plugin_by_coord) (const coord_t * coord);
55939+
55940+ /* store item key in @key */
55941+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55942+ /* conservatively estimate whether unit of what size can fit
55943+ into node. This estimation should be performed without
55944+ actually looking into the node's content (free space is saved in
55945+ znode). */
55946+ size_t(*estimate) (znode * node);
55947+
55948+ /* performs every consistency check the node plugin author could
55949+ imagine. Optional. */
55950+ int (*check) (const znode * node, __u32 flags, const char **error);
55951+
55952+ /* Called when node is read into memory and node plugin is
55953+ already detected. This should read some data into znode (like free
55954+ space counter) and, optionally, check data consistency.
55955+ */
55956+ int (*parse) (znode * node);
55957+ /* This method is called on a new node to initialise plugin specific
55958+ data (header, etc.) */
55959+ int (*init) (znode * node);
55960+ /* Check whether @node content conforms to this plugin format.
55961+ Probably only useful after support for old V3.x formats is added.
55962+ Uncomment after 4.0 only.
55963+ */
55964+ /* int ( *guess )( const znode *node ); */
55965+#if REISER4_DEBUG
55966+ void (*print) (const char *prefix, const znode * node, __u32 flags);
55967+#endif
55968+ /* change size of @item by @by bytes. @item->node has enough free
55969+ space. When @by > 0 - free space is appended to end of item. When
55970+ @by < 0 - item is truncated - it is assumed that last @by bytes if
55971+ the item are freed already */
55972+ void (*change_item_size) (coord_t * item, int by);
55973+
55974+ /* create new item @length bytes long in coord @target */
55975+ int (*create_item) (coord_t * target, const reiser4_key * key,
55976+ reiser4_item_data * data, carry_plugin_info * info);
55977+
55978+ /* update key of item. */
55979+ void (*update_item_key) (coord_t * target, const reiser4_key * key,
55980+ carry_plugin_info * info);
55981+
55982+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
55983+ int (*cut) (struct carry_cut_data *, carry_plugin_info *);
55984+
55985+ /*
55986+ * shrink item pointed to by @coord by @delta bytes.
55987+ */
55988+ int (*shrink_item) (coord_t * coord, int delta);
55989+
55990+ /* copy as much as possible but not more than up to @stop from
55991+ @stop->node to @target. If (pend == append) then data from beginning of
55992+ @stop->node are copied to the end of @target. If (pend == prepend) then
55993+ data from the end of @stop->node are copied to the beginning of
55994+ @target. Copied data are removed from @stop->node. Information
55995+ about what to do on upper level is stored in @todo */
55996+ int (*shift) (coord_t * stop, znode * target, shift_direction pend,
55997+ int delete_node, int including_insert_coord,
55998+ carry_plugin_info * info);
55999+ /* return true if this node allows skip carry() in some situations
56000+ (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56001+ emulation doesn't.
56002+
56003+ This will speed up insertions that don't require updates to the
56004+ parent, by bypassing initialisation of carry() structures. It's
56005+ believed that majority of insertions will fit there.
56006+
56007+ */
56008+ int (*fast_insert) (const coord_t * coord);
56009+ int (*fast_paste) (const coord_t * coord);
56010+ int (*fast_cut) (const coord_t * coord);
56011+ /* this limits max size of item which can be inserted into a node and
56012+ number of bytes item in a node may be appended with */
56013+ int (*max_item_size) (void);
56014+ int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56015+ /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
56016+ * files */
56017+ int (*set_item_plugin) (coord_t * coord, item_id);
56018+} node_plugin;
56019+
56020+typedef enum {
56021+ /* standard unified node layout used for both leaf and internal
56022+ nodes */
56023+ NODE40_ID,
56024+ LAST_NODE_ID
56025+} reiser4_node_id;
56026+
56027+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56028+#if REISER4_DEBUG
56029+extern void print_node_content(const char *prefix, const znode * node,
56030+ __u32 flags);
56031+#endif
56032+
56033+extern void indent_znode(const znode * node);
56034+
56035+typedef struct common_node_header {
56036+ /*
56037+ * identifier of node plugin. Must be located at the very beginning of
56038+ * a node.
56039+ */
56040+ __le16 plugin_id;
56041+} common_node_header;
56042+
56043+/* __REISER4_NODE_H__ */
56044+#endif
56045+/*
56046+ * Local variables:
56047+ * c-indentation-style: "K&R"
56048+ * mode-name: "LC"
56049+ * c-basic-offset: 8
56050+ * tab-width: 8
56051+ * fill-column: 79
56052+ * scroll-step: 1
56053+ * End:
56054+ */
56055diff --git a/fs/reiser4/plugin/node/node40.c b/fs/reiser4/plugin/node/node40.c
56056new file mode 100644
56057index 0000000..6a9cc73
56058--- /dev/null
56059+++ b/fs/reiser4/plugin/node/node40.c
56060@@ -0,0 +1,2924 @@
56061+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
56062+
56063+#include "../../debug.h"
56064+#include "../../key.h"
56065+#include "../../coord.h"
56066+#include "../plugin_header.h"
56067+#include "../item/item.h"
56068+#include "node.h"
56069+#include "node40.h"
56070+#include "../plugin.h"
56071+#include "../../jnode.h"
56072+#include "../../znode.h"
56073+#include "../../pool.h"
56074+#include "../../carry.h"
56075+#include "../../tap.h"
56076+#include "../../tree.h"
56077+#include "../../super.h"
56078+#include "../../reiser4.h"
56079+
56080+#include <asm/uaccess.h>
56081+#include <linux/types.h>
56082+#include <linux/prefetch.h>
56083+
56084+/* leaf 40 format:
56085+
56086+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
56087+ plugin_id (16) key
56088+ free_space (16) pluginid (16)
56089+ free_space_start (16) offset (16)
56090+ level (8)
56091+ num_items (16)
56092+ magic (32)
56093+ flush_time (32)
56094+*/
56095+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
56096+/* magic number that is stored in ->magic field of node header */
56097+static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
56098+
56099+static int prepare_for_update(znode * left, znode * right,
56100+ carry_plugin_info * info);
56101+
56102+/* header of node of reiser40 format is at the beginning of node */
56103+static inline node40_header *node40_node_header(const znode * node /* node to
56104+ * query */ )
56105+{
56106+ assert("nikita-567", node != NULL);
56107+ assert("nikita-568", znode_page(node) != NULL);
56108+ assert("nikita-569", zdata(node) != NULL);
56109+ return (node40_header *) zdata(node);
56110+}
56111+
56112+/* functions to get/set fields of node40_header */
56113+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
56114+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
56115+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
56116+#define nh40_get_level(nh) get_unaligned(&(nh)->level)
56117+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
56118+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
56119+
56120+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
56121+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
56122+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
56123+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
56124+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
56125+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
56126+
56127+/* plugin field of node header should be read/set by
56128+ plugin_by_disk_id/save_disk_plugin */
56129+
56130+/* array of item headers is at the end of node */
56131+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
56132+{
56133+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
56134+}
56135+
56136+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
56137+ */
56138+static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
56139+{
56140+ return (item_header40 *) (zdata(coord->node) +
56141+ znode_size(coord->node)) - (coord->item_pos) -
56142+ 1;
56143+}
56144+
56145+/* functions to get/set fields of item_header40 */
56146+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
56147+
56148+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
56149+
56150+/* plugin field of item header should be read/set by
56151+ plugin_by_disk_id/save_disk_plugin */
56152+
56153+/* plugin methods */
56154+
56155+/* plugin->u.node.item_overhead
56156+ look for description of this method in plugin/node/node.h */
56157+size_t
56158+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
56159+{
56160+ return sizeof(item_header40);
56161+}
56162+
56163+/* plugin->u.node.free_space
56164+ look for description of this method in plugin/node/node.h */
56165+size_t free_space_node40(znode * node)
56166+{
56167+ assert("nikita-577", node != NULL);
56168+ assert("nikita-578", znode_is_loaded(node));
56169+ assert("nikita-579", zdata(node) != NULL);
56170+
56171+ return nh40_get_free_space(node40_node_header(node));
56172+}
56173+
56174+/* private inline version of node40_num_of_items() for use in this file. This
56175+ is necessary, because address of node40_num_of_items() is taken and it is
56176+ never inlined as a result. */
56177+static inline short node40_num_of_items_internal(const znode * node)
56178+{
56179+ return nh40_get_num_items(node40_node_header(node));
56180+}
56181+
56182+#if REISER4_DEBUG
56183+static inline void check_num_items(const znode * node)
56184+{
56185+ assert("nikita-2749",
56186+ node40_num_of_items_internal(node) == node->nr_items);
56187+ assert("nikita-2746", znode_is_write_locked(node));
56188+}
56189+#else
56190+#define check_num_items(node) noop
56191+#endif
56192+
56193+/* plugin->u.node.num_of_items
56194+ look for description of this method in plugin/node/node.h */
56195+int num_of_items_node40(const znode * node)
56196+{
56197+ return node40_num_of_items_internal(node);
56198+}
56199+
56200+static void
56201+node40_set_num_items(znode * node, node40_header * nh, unsigned value)
56202+{
56203+ assert("nikita-2751", node != NULL);
56204+ assert("nikita-2750", nh == node40_node_header(node));
56205+
56206+ check_num_items(node);
56207+ nh40_set_num_items(nh, value);
56208+ node->nr_items = value;
56209+ check_num_items(node);
56210+}
56211+
56212+/* plugin->u.node.item_by_coord
56213+ look for description of this method in plugin/node/node.h */
56214+char *item_by_coord_node40(const coord_t * coord)
56215+{
56216+ item_header40 *ih;
56217+ char *p;
56218+
56219+ /* @coord is set to existing item */
56220+ assert("nikita-596", coord != NULL);
56221+ assert("vs-255", coord_is_existing_item(coord));
56222+
56223+ ih = node40_ih_at_coord(coord);
56224+ p = zdata(coord->node) + ih40_get_offset(ih);
56225+ return p;
56226+}
56227+
56228+/* plugin->u.node.length_by_coord
56229+ look for description of this method in plugin/node/node.h */
56230+int length_by_coord_node40(const coord_t * coord)
56231+{
56232+ item_header40 *ih;
56233+ int result;
56234+
56235+ /* @coord is set to existing item */
56236+ assert("vs-256", coord != NULL);
56237+ assert("vs-257", coord_is_existing_item(coord));
56238+
56239+ ih = node40_ih_at_coord(coord);
56240+ if ((int)coord->item_pos ==
56241+ node40_num_of_items_internal(coord->node) - 1)
56242+ result =
56243+ nh40_get_free_space_start(node40_node_header(coord->node)) -
56244+ ih40_get_offset(ih);
56245+ else
56246+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
56247+
56248+ return result;
56249+}
56250+
56251+static pos_in_node_t
56252+node40_item_length(const znode * node, pos_in_node_t item_pos)
56253+{
56254+ item_header40 *ih;
56255+ pos_in_node_t result;
56256+
56257+ /* @coord is set to existing item */
56258+ assert("vs-256", node != NULL);
56259+ assert("vs-257", node40_num_of_items_internal(node) > item_pos);
56260+
56261+ ih = node40_ih_at(node, item_pos);
56262+ if (item_pos == node40_num_of_items_internal(node) - 1)
56263+ result =
56264+ nh40_get_free_space_start(node40_node_header(node)) -
56265+ ih40_get_offset(ih);
56266+ else
56267+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
56268+
56269+ return result;
56270+}
56271+
56272+/* plugin->u.node.plugin_by_coord
56273+ look for description of this method in plugin/node/node.h */
56274+item_plugin *plugin_by_coord_node40(const coord_t * coord)
56275+{
56276+ item_header40 *ih;
56277+ item_plugin *result;
56278+
56279+ /* @coord is set to existing item */
56280+ assert("vs-258", coord != NULL);
56281+ assert("vs-259", coord_is_existing_item(coord));
56282+
56283+ ih = node40_ih_at_coord(coord);
56284+ /* pass NULL in stead of current tree. This is time critical call. */
56285+ result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
56286+ return result;
56287+}
56288+
56289+/* plugin->u.node.key_at
56290+ look for description of this method in plugin/node/node.h */
56291+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
56292+{
56293+ item_header40 *ih;
56294+
56295+ assert("nikita-1765", coord_is_existing_item(coord));
56296+
56297+ /* @coord is set to existing item */
56298+ ih = node40_ih_at_coord(coord);
56299+ memcpy(key, &ih->key, sizeof(reiser4_key));
56300+ return key;
56301+}
56302+
56303+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
56304+
56305+#define NODE_INCSTAT(n, counter) \
56306+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
56307+
56308+#define NODE_ADDSTAT(n, counter, val) \
56309+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
56310+
56311+/* plugin->u.node.lookup
56312+ look for description of this method in plugin/node/node.h */
56313+node_search_result lookup_node40(znode * node /* node to query */ ,
56314+ const reiser4_key * key /* key to look for */ ,
56315+ lookup_bias bias /* search bias */ ,
56316+ coord_t * coord /* resulting coord */ )
56317+{
56318+ int left;
56319+ int right;
56320+ int found;
56321+ int items;
56322+
56323+ item_header40 *lefth;
56324+ item_header40 *righth;
56325+
56326+ item_plugin *iplug;
56327+ item_header40 *bstop;
56328+ item_header40 *ih;
56329+ cmp_t order;
56330+
56331+ assert("nikita-583", node != NULL);
56332+ assert("nikita-584", key != NULL);
56333+ assert("nikita-585", coord != NULL);
56334+ assert("nikita-2693", znode_is_any_locked(node));
56335+ cassert(REISER4_SEQ_SEARCH_BREAK > 2);
56336+
56337+ items = node_num_items(node);
56338+
56339+ if (unlikely(items == 0)) {
56340+ coord_init_first_unit(coord, node);
56341+ return NS_NOT_FOUND;
56342+ }
56343+
56344+ /* binary search for item that can contain given key */
56345+ left = 0;
56346+ right = items - 1;
56347+ coord->node = node;
56348+ coord_clear_iplug(coord);
56349+ found = 0;
56350+
56351+ lefth = node40_ih_at(node, left);
56352+ righth = node40_ih_at(node, right);
56353+
56354+ /* It is known that for small arrays sequential search is on average
56355+ more efficient than binary. This is because sequential search is
56356+ coded as tight loop that can be better optimized by compilers and
56357+ for small array size gain from this optimization makes sequential
56358+ search the winner. Another, maybe more important, reason for this,
56359+ is that sequential array is more CPU cache friendly, whereas binary
56360+ search effectively destroys CPU caching.
56361+
56362+ Critical here is the notion of "smallness". Reasonable value of
56363+ REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
56364+ fs/reiser4/ulevel/ulevel.c:test_search().
56365+
56366+ Don't try to further optimize sequential search by scanning from
56367+ right to left in attempt to use more efficient loop termination
56368+ condition (comparison with 0). This doesn't work.
56369+
56370+ */
56371+
56372+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
56373+ int median;
56374+ item_header40 *medianh;
56375+
56376+ median = (left + right) / 2;
56377+ medianh = node40_ih_at(node, median);
56378+
56379+ assert("nikita-1084", median >= 0);
56380+ assert("nikita-1085", median < items);
56381+ switch (keycmp(key, &medianh->key)) {
56382+ case LESS_THAN:
56383+ right = median;
56384+ righth = medianh;
56385+ break;
56386+ default:
56387+ wrong_return_value("nikita-586", "keycmp");
56388+ case GREATER_THAN:
56389+ left = median;
56390+ lefth = medianh;
56391+ break;
56392+ case EQUAL_TO:
56393+ do {
56394+ --median;
56395+ /* headers are ordered from right to left */
56396+ ++medianh;
56397+ } while (median >= 0 && keyeq(key, &medianh->key));
56398+ right = left = median + 1;
56399+ ih = lefth = righth = medianh - 1;
56400+ found = 1;
56401+ break;
56402+ }
56403+ }
56404+ /* sequential scan. Item headers, and, therefore, keys are stored at
56405+ the rightmost part of a node from right to left. We are trying to
56406+ access memory from left to right, and hence, scan in _descending_
56407+ order of item numbers.
56408+ */
56409+ if (!found) {
56410+ for (left = right, ih = righth; left >= 0; ++ih, --left) {
56411+ cmp_t comparison;
56412+
56413+ prefetchkey(&(ih + 1)->key);
56414+ comparison = keycmp(&ih->key, key);
56415+ if (comparison == GREATER_THAN)
56416+ continue;
56417+ if (comparison == EQUAL_TO) {
56418+ found = 1;
56419+ do {
56420+ --left;
56421+ ++ih;
56422+ } while (left >= 0 && keyeq(&ih->key, key));
56423+ ++left;
56424+ --ih;
56425+ } else {
56426+ assert("nikita-1256", comparison == LESS_THAN);
56427+ }
56428+ break;
56429+ }
56430+ if (unlikely(left < 0))
56431+ left = 0;
56432+ }
56433+
56434+ assert("nikita-3212", right >= left);
56435+ assert("nikita-3214",
56436+ equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
56437+
56438+ coord_set_item_pos(coord, left);
56439+ coord->unit_pos = 0;
56440+ coord->between = AT_UNIT;
56441+
56442+ /* key < leftmost key in a mode or node is corrupted and keys
56443+ are not sorted */
56444+ bstop = node40_ih_at(node, (unsigned)left);
56445+ order = keycmp(&bstop->key, key);
56446+ if (unlikely(order == GREATER_THAN)) {
56447+ if (unlikely(left != 0)) {
56448+ /* screw up */
56449+ warning("nikita-587", "Key less than %i key in a node",
56450+ left);
56451+ reiser4_print_key("key", key);
56452+ reiser4_print_key("min", &bstop->key);
56453+ print_coord_content("coord", coord);
56454+ return RETERR(-EIO);
56455+ } else {
56456+ coord->between = BEFORE_UNIT;
56457+ return NS_NOT_FOUND;
56458+ }
56459+ }
56460+ /* left <= key, ok */
56461+ iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
56462+
56463+ if (unlikely(iplug == NULL)) {
56464+ warning("nikita-588", "Unknown plugin %i",
56465+ le16_to_cpu(get_unaligned(&bstop->plugin_id)));
56466+ reiser4_print_key("key", key);
56467+ print_coord_content("coord", coord);
56468+ return RETERR(-EIO);
56469+ }
56470+
56471+ coord_set_iplug(coord, iplug);
56472+
56473+ /* if exact key from item header was found by binary search, no
56474+ further checks are necessary. */
56475+ if (found) {
56476+ assert("nikita-1259", order == EQUAL_TO);
56477+ return NS_FOUND;
56478+ }
56479+ if (iplug->b.max_key_inside != NULL) {
56480+ reiser4_key max_item_key;
56481+
56482+ /* key > max_item_key --- outside of an item */
56483+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
56484+ coord->unit_pos = 0;
56485+ coord->between = AFTER_ITEM;
56486+ /* FIXME-VS: key we are looking for does not fit into
56487+ found item. Return NS_NOT_FOUND then. Without that
56488+ the following case does not work: there is extent of
56489+ file 10000, 10001. File 10000, 10002 has been just
56490+ created. When writing to position 0 in that file -
56491+ traverse_tree will stop here on twig level. When we
56492+ want it to go down to leaf level
56493+ */
56494+ return NS_NOT_FOUND;
56495+ }
56496+ }
56497+
56498+ if (iplug->b.lookup != NULL) {
56499+ return iplug->b.lookup(key, bias, coord);
56500+ } else {
56501+ assert("nikita-1260", order == LESS_THAN);
56502+ coord->between = AFTER_UNIT;
56503+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
56504+ }
56505+}
56506+
56507+#undef NODE_ADDSTAT
56508+#undef NODE_INCSTAT
56509+
56510+/* plugin->u.node.estimate
56511+ look for description of this method in plugin/node/node.h */
56512+size_t estimate_node40(znode * node)
56513+{
56514+ size_t result;
56515+
56516+ assert("nikita-597", node != NULL);
56517+
56518+ result = free_space_node40(node) - sizeof(item_header40);
56519+
56520+ return (result > 0) ? result : 0;
56521+}
56522+
56523+/* plugin->u.node.check
56524+ look for description of this method in plugin/node/node.h */
56525+int check_node40(const znode * node /* node to check */ ,
56526+ __u32 flags /* check flags */ ,
56527+ const char **error /* where to store error message */ )
56528+{
56529+ int nr_items;
56530+ int i;
56531+ reiser4_key prev;
56532+ unsigned old_offset;
56533+ tree_level level;
56534+ coord_t coord;
56535+ int result;
56536+
56537+ assert("nikita-580", node != NULL);
56538+ assert("nikita-581", error != NULL);
56539+ assert("nikita-2948", znode_is_loaded(node));
56540+
56541+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
56542+ return 0;
56543+
56544+ assert("nikita-582", zdata(node) != NULL);
56545+
56546+ nr_items = node40_num_of_items_internal(node);
56547+ if (nr_items < 0) {
56548+ *error = "Negative number of items";
56549+ return -1;
56550+ }
56551+
56552+ if (flags & REISER4_NODE_DKEYS)
56553+ prev = *znode_get_ld_key((znode *) node);
56554+ else
56555+ prev = *reiser4_min_key();
56556+
56557+ old_offset = 0;
56558+ coord_init_zero(&coord);
56559+ coord.node = (znode *) node;
56560+ coord.unit_pos = 0;
56561+ coord.between = AT_UNIT;
56562+ level = znode_get_level(node);
56563+ for (i = 0; i < nr_items; i++) {
56564+ item_header40 *ih;
56565+ reiser4_key unit_key;
56566+ unsigned j;
56567+
56568+ ih = node40_ih_at(node, (unsigned)i);
56569+ coord_set_item_pos(&coord, i);
56570+ if ((ih40_get_offset(ih) >=
56571+ znode_size(node) - nr_items * sizeof(item_header40)) ||
56572+ (ih40_get_offset(ih) < sizeof(node40_header))) {
56573+ *error = "Offset is out of bounds";
56574+ return -1;
56575+ }
56576+ if (ih40_get_offset(ih) <= old_offset) {
56577+ *error = "Offsets are in wrong order";
56578+ return -1;
56579+ }
56580+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
56581+ *error = "Wrong offset of first item";
56582+ return -1;
56583+ }
56584+ old_offset = ih40_get_offset(ih);
56585+
56586+ if (keygt(&prev, &ih->key)) {
56587+ *error = "Keys are in wrong order";
56588+ return -1;
56589+ }
56590+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
56591+ *error = "Wrong key of first unit";
56592+ return -1;
56593+ }
56594+ prev = ih->key;
56595+ for (j = 0; j < coord_num_units(&coord); ++j) {
56596+ coord.unit_pos = j;
56597+ unit_key_by_coord(&coord, &unit_key);
56598+ if (keygt(&prev, &unit_key)) {
56599+ *error = "Unit keys are in wrong order";
56600+ return -1;
56601+ }
56602+ prev = unit_key;
56603+ }
56604+ coord.unit_pos = 0;
56605+ if (level != TWIG_LEVEL && item_is_extent(&coord)) {
56606+ *error = "extent on the wrong level";
56607+ return -1;
56608+ }
56609+ if (level == LEAF_LEVEL && item_is_internal(&coord)) {
56610+ *error = "internal item on the wrong level";
56611+ return -1;
56612+ }
56613+ if (level != LEAF_LEVEL &&
56614+ !item_is_internal(&coord) && !item_is_extent(&coord)) {
56615+ *error = "wrong item on the internal level";
56616+ return -1;
56617+ }
56618+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
56619+ *error = "non-internal item on the internal level";
56620+ return -1;
56621+ }
56622+#if REISER4_DEBUG
56623+ if (item_plugin_by_coord(&coord)->b.check
56624+ && item_plugin_by_coord(&coord)->b.check(&coord, error))
56625+ return -1;
56626+#endif
56627+ if (i) {
56628+ coord_t prev_coord;
56629+ /* two neighboring items can not be mergeable */
56630+ coord_dup(&prev_coord, &coord);
56631+ coord_prev_item(&prev_coord);
56632+ if (are_items_mergeable(&prev_coord, &coord)) {
56633+ *error = "mergeable items in one node";
56634+ return -1;
56635+ }
56636+
56637+ }
56638+ }
56639+
56640+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
56641+ coord_t coord;
56642+ item_plugin *iplug;
56643+
56644+ coord_init_last_unit(&coord, node);
56645+ iplug = item_plugin_by_coord(&coord);
56646+ if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
56647+ iplug->s.file.append_key != NULL) {
56648+ reiser4_key mkey;
56649+
56650+ iplug->s.file.append_key(&coord, &mkey);
56651+ set_key_offset(&mkey, get_key_offset(&mkey) - 1);
56652+ read_lock_dk(current_tree);
56653+ result = keygt(&mkey, znode_get_rd_key((znode *) node));
56654+ read_unlock_dk(current_tree);
56655+ if (result) {
56656+ *error = "key of rightmost item is too large";
56657+ return -1;
56658+ }
56659+ }
56660+ }
56661+ if (flags & REISER4_NODE_DKEYS) {
56662+ read_lock_tree(current_tree);
56663+ read_lock_dk(current_tree);
56664+
56665+ flags |= REISER4_NODE_TREE_STABLE;
56666+
56667+ if (keygt(&prev, znode_get_rd_key((znode *) node))) {
56668+ if (flags & REISER4_NODE_TREE_STABLE) {
56669+ *error = "Last key is greater than rdkey";
56670+ read_unlock_dk(current_tree);
56671+ read_unlock_tree(current_tree);
56672+ return -1;
56673+ }
56674+ }
56675+ if (keygt
56676+ (znode_get_ld_key((znode *) node),
56677+ znode_get_rd_key((znode *) node))) {
56678+ *error = "ldkey is greater than rdkey";
56679+ read_unlock_dk(current_tree);
56680+ read_unlock_tree(current_tree);
56681+ return -1;
56682+ }
56683+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
56684+ (node->left != NULL) &&
56685+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
56686+ ergo(flags & REISER4_NODE_TREE_STABLE,
56687+ !keyeq(znode_get_rd_key(node->left),
56688+ znode_get_ld_key((znode *) node)))
56689+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56690+ keygt(znode_get_rd_key(node->left),
56691+ znode_get_ld_key((znode *) node)))) {
56692+ *error = "left rdkey or ldkey is wrong";
56693+ read_unlock_dk(current_tree);
56694+ read_unlock_tree(current_tree);
56695+ return -1;
56696+ }
56697+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
56698+ (node->right != NULL) &&
56699+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
56700+ ergo(flags & REISER4_NODE_TREE_STABLE,
56701+ !keyeq(znode_get_rd_key((znode *) node),
56702+ znode_get_ld_key(node->right)))
56703+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56704+ keygt(znode_get_rd_key((znode *) node),
56705+ znode_get_ld_key(node->right)))) {
56706+ *error = "rdkey or right ldkey is wrong";
56707+ read_unlock_dk(current_tree);
56708+ read_unlock_tree(current_tree);
56709+ return -1;
56710+ }
56711+
56712+ read_unlock_dk(current_tree);
56713+ read_unlock_tree(current_tree);
56714+ }
56715+
56716+ return 0;
56717+}
56718+
56719+/* plugin->u.node.parse
56720+ look for description of this method in plugin/node/node.h */
56721+int parse_node40(znode * node /* node to parse */ )
56722+{
56723+ node40_header *header;
56724+ int result;
56725+ d8 level;
56726+
56727+ header = node40_node_header((znode *) node);
56728+ result = -EIO;
56729+ level = nh40_get_level(header);
56730+ if (unlikely(((__u8) znode_get_level(node)) != level))
56731+ warning("nikita-494", "Wrong level found in node: %i != %i",
56732+ znode_get_level(node), level);
56733+ else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
56734+ warning("nikita-495",
56735+ "Wrong magic in tree node: want %x, got %x",
56736+ REISER4_NODE_MAGIC, nh40_get_magic(header));
56737+ else {
56738+ node->nr_items = node40_num_of_items_internal(node);
56739+ result = 0;
56740+ }
56741+ return RETERR(result);
56742+}
56743+
56744+/* plugin->u.node.init
56745+ look for description of this method in plugin/node/node.h */
56746+int init_node40(znode * node /* node to initialise */ )
56747+{
56748+ node40_header *header;
56749+
56750+ assert("nikita-570", node != NULL);
56751+ assert("nikita-572", zdata(node) != NULL);
56752+
56753+ header = node40_node_header(node);
56754+ memset(header, 0, sizeof(node40_header));
56755+ nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
56756+ nh40_set_free_space_start(header, sizeof(node40_header));
56757+ /* sane hypothesis: 0 in CPU format is 0 in disk format */
56758+ /* items: 0 */
56759+ save_plugin_id(node_plugin_to_plugin(node->nplug),
56760+ &header->common_header.plugin_id);
56761+ nh40_set_level(header, znode_get_level(node));
56762+ nh40_set_magic(header, REISER4_NODE_MAGIC);
56763+ node->nr_items = 0;
56764+ nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
56765+
56766+ /* flags: 0 */
56767+ return 0;
56768+}
56769+
56770+#ifdef GUESS_EXISTS
56771+int guess_node40(const znode * node /* node to guess plugin of */ )
56772+{
56773+ node40_header *nethack;
56774+
56775+ assert("nikita-1058", node != NULL);
56776+ nethack = node40_node_header(node);
56777+ return
56778+ (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
56779+ (plugin_by_disk_id(znode_get_tree(node),
56780+ REISER4_NODE_PLUGIN_TYPE,
56781+ &nethack->common_header.plugin_id)->h.id ==
56782+ NODE40_ID);
56783+}
56784+#endif
56785+
56786+/* plugin->u.node.chage_item_size
56787+ look for description of this method in plugin/node/node.h */
56788+void change_item_size_node40(coord_t * coord, int by)
56789+{
56790+ node40_header *nh;
56791+ item_header40 *ih;
56792+ char *item_data;
56793+ int item_length;
56794+ unsigned i;
56795+
56796+ /* make sure that @item is coord of existing item */
56797+ assert("vs-210", coord_is_existing_item(coord));
56798+
56799+ nh = node40_node_header(coord->node);
56800+
56801+ item_data = item_by_coord_node40(coord);
56802+ item_length = length_by_coord_node40(coord);
56803+
56804+ /* move item bodies */
56805+ ih = node40_ih_at_coord(coord);
56806+ memmove(item_data + item_length + by, item_data + item_length,
56807+ nh40_get_free_space_start(node40_node_header(coord->node)) -
56808+ (ih40_get_offset(ih) + item_length));
56809+
56810+ /* update offsets of moved items */
56811+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
56812+ ih = node40_ih_at(coord->node, i);
56813+ ih40_set_offset(ih, ih40_get_offset(ih) + by);
56814+ }
56815+
56816+ /* update node header */
56817+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
56818+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
56819+}
56820+
56821+static int should_notify_parent(const znode * node)
56822+{
56823+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
56824+ return !disk_addr_eq(znode_get_block(node),
56825+ &znode_get_tree(node)->root_block);
56826+}
56827+
56828+/* plugin->u.node.create_item
56829+ look for description of this method in plugin/node/node.h */
56830+int
56831+create_item_node40(coord_t *target, const reiser4_key *key,
56832+ reiser4_item_data *data, carry_plugin_info *info)
56833+{
56834+ node40_header *nh;
56835+ item_header40 *ih;
56836+ unsigned offset;
56837+ unsigned i;
56838+
56839+ nh = node40_node_header(target->node);
56840+
56841+ assert("vs-212", coord_is_between_items(target));
56842+ /* node must have enough free space */
56843+ assert("vs-254",
56844+ free_space_node40(target->node) >=
56845+ data->length + sizeof(item_header40));
56846+ assert("vs-1410", data->length >= 0);
56847+
56848+ if (coord_set_to_right(target))
56849+ /* there are not items to the right of @target, so, new item
56850+ will be inserted after last one */
56851+ coord_set_item_pos(target, nh40_get_num_items(nh));
56852+
56853+ if (target->item_pos < nh40_get_num_items(nh)) {
56854+ /* there are items to be moved to prepare space for new
56855+ item */
56856+ ih = node40_ih_at_coord(target);
56857+ /* new item will start at this offset */
56858+ offset = ih40_get_offset(ih);
56859+
56860+ memmove(zdata(target->node) + offset + data->length,
56861+ zdata(target->node) + offset,
56862+ nh40_get_free_space_start(nh) - offset);
56863+ /* update headers of moved items */
56864+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
56865+ ih = node40_ih_at(target->node, i);
56866+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
56867+ }
56868+
56869+ /* @ih is set to item header of the last item, move item headers */
56870+ memmove(ih - 1, ih,
56871+ sizeof(item_header40) * (nh40_get_num_items(nh) -
56872+ target->item_pos));
56873+ } else {
56874+ /* new item will start at this offset */
56875+ offset = nh40_get_free_space_start(nh);
56876+ }
56877+
56878+ /* make item header for the new item */
56879+ ih = node40_ih_at_coord(target);
56880+ memcpy(&ih->key, key, sizeof(reiser4_key));
56881+ ih40_set_offset(ih, offset);
56882+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
56883+
56884+ /* update node header */
56885+ nh40_set_free_space(nh,
56886+ nh40_get_free_space(nh) - data->length -
56887+ sizeof(item_header40));
56888+ nh40_set_free_space_start(nh,
56889+ nh40_get_free_space_start(nh) + data->length);
56890+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
56891+
56892+ /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */
56893+ target->unit_pos = 0;
56894+ target->between = AT_UNIT;
56895+ coord_clear_iplug(target);
56896+
56897+ /* initialize item */
56898+ if (data->iplug->b.init != NULL) {
56899+ data->iplug->b.init(target, NULL, data);
56900+ }
56901+ /* copy item body */
56902+ if (data->iplug->b.paste != NULL) {
56903+ data->iplug->b.paste(target, data, info);
56904+ } else if (data->data != NULL) {
56905+ if (data->user) {
56906+ /* AUDIT: Are we really should not check that pointer
56907+ from userspace was valid and data bytes were
56908+ available? How will we return -EFAULT of some kind
56909+ without this check? */
56910+ assert("nikita-3038", reiser4_schedulable());
56911+ /* copy data from user space */
56912+ __copy_from_user(zdata(target->node) + offset,
56913+ (const char __user *)data->data,
56914+ (unsigned)data->length);
56915+ } else
56916+ /* copy from kernel space */
56917+ memcpy(zdata(target->node) + offset, data->data,
56918+ (unsigned)data->length);
56919+ }
56920+
56921+ if (target->item_pos == 0) {
56922+ /* left delimiting key has to be updated */
56923+ prepare_for_update(NULL, target->node, info);
56924+ }
56925+
56926+ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
56927+ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
56928+ }
56929+
56930+ return 0;
56931+}
56932+
56933+/* plugin->u.node.update_item_key
56934+ look for description of this method in plugin/node/node.h */
56935+void
56936+update_item_key_node40(coord_t * target, const reiser4_key * key,
56937+ carry_plugin_info * info)
56938+{
56939+ item_header40 *ih;
56940+
56941+ ih = node40_ih_at_coord(target);
56942+ memcpy(&ih->key, key, sizeof(reiser4_key));
56943+
56944+ if (target->item_pos == 0) {
56945+ prepare_for_update(NULL, target->node, info);
56946+ }
56947+}
56948+
56949+/* this bits encode cut mode */
56950+#define CMODE_TAIL 1
56951+#define CMODE_WHOLE 2
56952+#define CMODE_HEAD 4
56953+
56954+struct cut40_info {
56955+ int mode;
56956+ pos_in_node_t tail_removed; /* position of item which gets tail removed */
56957+ pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */
56958+ pos_in_node_t removed_count; /* number of items removed completely */
56959+ pos_in_node_t head_removed; /* position of item which gets head removed */
56960+
56961+ pos_in_node_t freed_space_start;
56962+ pos_in_node_t freed_space_end;
56963+ pos_in_node_t first_moved;
56964+ pos_in_node_t head_removed_location;
56965+};
56966+
56967+static void init_cinfo(struct cut40_info *cinfo)
56968+{
56969+ cinfo->mode = 0;
56970+ cinfo->tail_removed = MAX_POS_IN_NODE;
56971+ cinfo->first_removed = MAX_POS_IN_NODE;
56972+ cinfo->removed_count = MAX_POS_IN_NODE;
56973+ cinfo->head_removed = MAX_POS_IN_NODE;
56974+ cinfo->freed_space_start = MAX_POS_IN_NODE;
56975+ cinfo->freed_space_end = MAX_POS_IN_NODE;
56976+ cinfo->first_moved = MAX_POS_IN_NODE;
56977+ cinfo->head_removed_location = MAX_POS_IN_NODE;
56978+}
56979+
56980+/* complete cut_node40/kill_node40 content by removing the gap created by */
56981+static void compact(znode * node, struct cut40_info *cinfo)
56982+{
56983+ node40_header *nh;
56984+ item_header40 *ih;
56985+ pos_in_node_t freed;
56986+ pos_in_node_t pos, nr_items;
56987+
56988+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
56989+ cinfo->freed_space_end != MAX_POS_IN_NODE &&
56990+ cinfo->first_moved != MAX_POS_IN_NODE));
56991+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
56992+
56993+ nh = node40_node_header(node);
56994+ nr_items = nh40_get_num_items(nh);
56995+
56996+ /* remove gap made up by removal */
56997+ memmove(zdata(node) + cinfo->freed_space_start,
56998+ zdata(node) + cinfo->freed_space_end,
56999+ nh40_get_free_space_start(nh) - cinfo->freed_space_end);
57000+
57001+ /* update item headers of moved items - change their locations */
57002+ pos = cinfo->first_moved;
57003+ ih = node40_ih_at(node, pos);
57004+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
57005+ assert("vs-1580", pos == cinfo->head_removed);
57006+ ih40_set_offset(ih, cinfo->head_removed_location);
57007+ pos++;
57008+ ih--;
57009+ }
57010+
57011+ freed = cinfo->freed_space_end - cinfo->freed_space_start;
57012+ for (; pos < nr_items; pos++, ih--) {
57013+ assert("vs-1581", ih == node40_ih_at(node, pos));
57014+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
57015+ }
57016+
57017+ /* free space start moved to right */
57018+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
57019+
57020+ if (cinfo->removed_count != MAX_POS_IN_NODE) {
57021+ /* number of items changed. Remove item headers of those items */
57022+ ih = node40_ih_at(node, nr_items - 1);
57023+ memmove(ih + cinfo->removed_count, ih,
57024+ sizeof(item_header40) * (nr_items -
57025+ cinfo->removed_count -
57026+ cinfo->first_removed));
57027+ freed += sizeof(item_header40) * cinfo->removed_count;
57028+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
57029+ }
57030+
57031+ /* total amount of free space increased */
57032+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
57033+}
57034+
57035+int shrink_item_node40(coord_t * coord, int delta)
57036+{
57037+ node40_header *nh;
57038+ item_header40 *ih;
57039+ pos_in_node_t pos;
57040+ pos_in_node_t nr_items;
57041+ char *end;
57042+ znode *node;
57043+ int off;
57044+
57045+ assert("nikita-3487", coord != NULL);
57046+ assert("nikita-3488", delta >= 0);
57047+
57048+ node = coord->node;
57049+ nh = node40_node_header(node);
57050+ nr_items = nh40_get_num_items(nh);
57051+
57052+ ih = node40_ih_at_coord(coord);
57053+ assert("nikita-3489", delta <= length_by_coord_node40(coord));
57054+ off = ih40_get_offset(ih) + length_by_coord_node40(coord);
57055+ end = zdata(node) + off;
57056+
57057+ /* remove gap made up by removal */
57058+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
57059+
57060+ /* update item headers of moved items - change their locations */
57061+ pos = coord->item_pos + 1;
57062+ ih = node40_ih_at(node, pos);
57063+ for (; pos < nr_items; pos++, ih--) {
57064+ assert("nikita-3490", ih == node40_ih_at(node, pos));
57065+ ih40_set_offset(ih, ih40_get_offset(ih) - delta);
57066+ }
57067+
57068+ /* free space start moved to left */
57069+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
57070+ /* total amount of free space increased */
57071+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
57072+ /*
57073+ * This method does _not_ changes number of items. Hence, it cannot
57074+ * make node empty. Also it doesn't remove items at all, which means
57075+ * that no keys have to be updated either.
57076+ */
57077+ return 0;
57078+}
57079+
57080+/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types
57081+ of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the
57082+ rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item
57083+ getting head cut. Function returns 0 in this case */
57084+static int
57085+parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
57086+{
57087+ reiser4_key left_key, right_key;
57088+ reiser4_key min_from_key, max_to_key;
57089+ const reiser4_key *from_key, *to_key;
57090+
57091+ init_cinfo(cinfo);
57092+
57093+ /* calculate minimal key stored in first item of items to be cut (params->from) */
57094+ item_key_by_coord(params->from, &min_from_key);
57095+ /* and max key stored in last item of items to be cut (params->to) */
57096+ max_item_key_by_coord(params->to, &max_to_key);
57097+
57098+ /* if cut key range is not defined in input parameters - define it using cut coord range */
57099+ if (params->from_key == NULL) {
57100+ assert("vs-1513", params->to_key == NULL);
57101+ unit_key_by_coord(params->from, &left_key);
57102+ from_key = &left_key;
57103+ max_unit_key_by_coord(params->to, &right_key);
57104+ to_key = &right_key;
57105+ } else {
57106+ from_key = params->from_key;
57107+ to_key = params->to_key;
57108+ }
57109+
57110+ if (params->from->item_pos == params->to->item_pos) {
57111+ if (keylt(&min_from_key, from_key)
57112+ && keylt(to_key, &max_to_key))
57113+ return 1;
57114+
57115+ if (keygt(from_key, &min_from_key)) {
57116+ /* tail of item is to be cut cut */
57117+ cinfo->tail_removed = params->from->item_pos;
57118+ cinfo->mode |= CMODE_TAIL;
57119+ } else if (keylt(to_key, &max_to_key)) {
57120+ /* head of item is to be cut */
57121+ cinfo->head_removed = params->from->item_pos;
57122+ cinfo->mode |= CMODE_HEAD;
57123+ } else {
57124+ /* item is removed completely */
57125+ cinfo->first_removed = params->from->item_pos;
57126+ cinfo->removed_count = 1;
57127+ cinfo->mode |= CMODE_WHOLE;
57128+ }
57129+ } else {
57130+ cinfo->first_removed = params->from->item_pos + 1;
57131+ cinfo->removed_count =
57132+ params->to->item_pos - params->from->item_pos - 1;
57133+
57134+ if (keygt(from_key, &min_from_key)) {
57135+ /* first item is not cut completely */
57136+ cinfo->tail_removed = params->from->item_pos;
57137+ cinfo->mode |= CMODE_TAIL;
57138+ } else {
57139+ cinfo->first_removed--;
57140+ cinfo->removed_count++;
57141+ }
57142+ if (keylt(to_key, &max_to_key)) {
57143+ /* last item is not cut completely */
57144+ cinfo->head_removed = params->to->item_pos;
57145+ cinfo->mode |= CMODE_HEAD;
57146+ } else {
57147+ cinfo->removed_count++;
57148+ }
57149+ if (cinfo->removed_count)
57150+ cinfo->mode |= CMODE_WHOLE;
57151+ }
57152+
57153+ return 0;
57154+}
57155+
57156+static void
57157+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
57158+ carry_kill_data * kdata)
57159+{
57160+ coord_t coord;
57161+ item_plugin *iplug;
57162+ pos_in_node_t pos;
57163+
57164+ coord.node = node;
57165+ coord.unit_pos = 0;
57166+ coord.between = AT_UNIT;
57167+ for (pos = 0; pos < count; pos++) {
57168+ coord_set_item_pos(&coord, from + pos);
57169+ coord.unit_pos = 0;
57170+ coord.between = AT_UNIT;
57171+ iplug = item_plugin_by_coord(&coord);
57172+ if (iplug->b.kill_hook) {
57173+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
57174+ kdata);
57175+ }
57176+ }
57177+}
57178+
57179+/* this is used to kill item partially */
57180+static pos_in_node_t
57181+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
57182+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
57183+{
57184+ struct carry_kill_data *kdata;
57185+ item_plugin *iplug;
57186+
57187+ kdata = data;
57188+ iplug = item_plugin_by_coord(coord);
57189+
57190+ assert("vs-1524", iplug->b.kill_units);
57191+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
57192+ new_first_key);
57193+}
57194+
57195+/* call item plugin to cut tail of file */
57196+static pos_in_node_t
57197+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
57198+{
57199+ struct carry_kill_data *kdata;
57200+ pos_in_node_t to;
57201+
57202+ kdata = data;
57203+ to = coord_last_unit_pos(coord);
57204+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
57205+ NULL);
57206+}
57207+
57208+/* call item plugin to cut head of item */
57209+static pos_in_node_t
57210+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
57211+ reiser4_key * new_first_key)
57212+{
57213+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
57214+ new_first_key);
57215+}
57216+
57217+/* this is used to cut item partially */
57218+static pos_in_node_t
57219+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
57220+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
57221+{
57222+ carry_cut_data *cdata;
57223+ item_plugin *iplug;
57224+
57225+ cdata = data;
57226+ iplug = item_plugin_by_coord(coord);
57227+ assert("vs-302", iplug->b.cut_units);
57228+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
57229+ new_first_key);
57230+}
57231+
57232+/* call item plugin to cut tail of file */
57233+static pos_in_node_t
57234+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
57235+{
57236+ carry_cut_data *cdata;
57237+ pos_in_node_t to;
57238+
57239+ cdata = data;
57240+ to = coord_last_unit_pos(cdata->params.from);
57241+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
57242+}
57243+
57244+/* call item plugin to cut head of item */
57245+static pos_in_node_t
57246+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
57247+ reiser4_key * new_first_key)
57248+{
57249+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
57250+ new_first_key);
57251+}
57252+
57253+/* this returns 1 of key of first item changed, 0 - if it did not */
57254+static int
57255+prepare_for_compact(struct cut40_info *cinfo,
57256+ const struct cut_kill_params *params, int is_cut,
57257+ void *data, carry_plugin_info * info)
57258+{
57259+ znode *node;
57260+ item_header40 *ih;
57261+ pos_in_node_t freed;
57262+ pos_in_node_t item_pos;
57263+ coord_t coord;
57264+ reiser4_key new_first_key;
57265+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
57266+ void *, reiser4_key *, reiser4_key *);
57267+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
57268+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
57269+ reiser4_key *);
57270+ int retval;
57271+
57272+ retval = 0;
57273+
57274+ node = params->from->node;
57275+
57276+ assert("vs-184", node == params->to->node);
57277+ assert("vs-312", !node_is_empty(node));
57278+ assert("vs-297",
57279+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
57280+
57281+ if (is_cut) {
57282+ kill_units_f = cut_units;
57283+ kill_tail_f = cut_tail;
57284+ kill_head_f = cut_head;
57285+ } else {
57286+ kill_units_f = kill_units;
57287+ kill_tail_f = kill_tail;
57288+ kill_head_f = kill_head;
57289+ }
57290+
57291+ if (parse_cut(cinfo, params) == 1) {
57292+ /* cut from the middle of item */
57293+ freed =
57294+ kill_units_f(params->from, params->from->unit_pos,
57295+ params->to->unit_pos, data,
57296+ params->smallest_removed, NULL);
57297+
57298+ item_pos = params->from->item_pos;
57299+ ih = node40_ih_at(node, item_pos);
57300+ cinfo->freed_space_start =
57301+ ih40_get_offset(ih) + node40_item_length(node,
57302+ item_pos) - freed;
57303+ cinfo->freed_space_end = cinfo->freed_space_start + freed;
57304+ cinfo->first_moved = item_pos + 1;
57305+ } else {
57306+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
57307+ cinfo->first_removed != MAX_POS_IN_NODE ||
57308+ cinfo->head_removed != MAX_POS_IN_NODE));
57309+
57310+ switch (cinfo->mode) {
57311+ case CMODE_TAIL:
57312+ /* one item gets cut partially from its end */
57313+ assert("vs-1562",
57314+ cinfo->tail_removed == params->from->item_pos);
57315+
57316+ freed =
57317+ kill_tail_f(params->from, data,
57318+ params->smallest_removed);
57319+
57320+ item_pos = cinfo->tail_removed;
57321+ ih = node40_ih_at(node, item_pos);
57322+ cinfo->freed_space_start =
57323+ ih40_get_offset(ih) + node40_item_length(node,
57324+ item_pos) -
57325+ freed;
57326+ cinfo->freed_space_end =
57327+ cinfo->freed_space_start + freed;
57328+ cinfo->first_moved = cinfo->tail_removed + 1;
57329+ break;
57330+
57331+ case CMODE_WHOLE:
57332+ /* one or more items get removed completely */
57333+ assert("vs-1563",
57334+ cinfo->first_removed == params->from->item_pos);
57335+ assert("vs-1564", cinfo->removed_count > 0
57336+ && cinfo->removed_count != MAX_POS_IN_NODE);
57337+
57338+ /* call kill hook for all items removed completely */
57339+ if (is_cut == 0)
57340+ call_kill_hooks(node, cinfo->first_removed,
57341+ cinfo->removed_count, data);
57342+
57343+ item_pos = cinfo->first_removed;
57344+ ih = node40_ih_at(node, item_pos);
57345+
57346+ if (params->smallest_removed)
57347+ memcpy(params->smallest_removed, &ih->key,
57348+ sizeof(reiser4_key));
57349+
57350+ cinfo->freed_space_start = ih40_get_offset(ih);
57351+
57352+ item_pos += (cinfo->removed_count - 1);
57353+ ih -= (cinfo->removed_count - 1);
57354+ cinfo->freed_space_end =
57355+ ih40_get_offset(ih) + node40_item_length(node,
57356+ item_pos);
57357+ cinfo->first_moved = item_pos + 1;
57358+ if (cinfo->first_removed == 0)
57359+ /* key of first item of the node changes */
57360+ retval = 1;
57361+ break;
57362+
57363+ case CMODE_HEAD:
57364+ /* one item gets cut partially from its head */
57365+ assert("vs-1565",
57366+ cinfo->head_removed == params->from->item_pos);
57367+
57368+ freed =
57369+ kill_head_f(params->to, data,
57370+ params->smallest_removed,
57371+ &new_first_key);
57372+
57373+ item_pos = cinfo->head_removed;
57374+ ih = node40_ih_at(node, item_pos);
57375+ cinfo->freed_space_start = ih40_get_offset(ih);
57376+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
57377+ cinfo->first_moved = cinfo->head_removed + 1;
57378+
57379+ /* item head is removed, therefore, item key changed */
57380+ coord.node = node;
57381+ coord_set_item_pos(&coord, item_pos);
57382+ coord.unit_pos = 0;
57383+ coord.between = AT_UNIT;
57384+ update_item_key_node40(&coord, &new_first_key, NULL);
57385+ if (item_pos == 0)
57386+ /* key of first item of the node changes */
57387+ retval = 1;
57388+ break;
57389+
57390+ case CMODE_TAIL | CMODE_WHOLE:
57391+ /* one item gets cut from its end and one or more items get removed completely */
57392+ assert("vs-1566",
57393+ cinfo->tail_removed == params->from->item_pos);
57394+ assert("vs-1567",
57395+ cinfo->first_removed == cinfo->tail_removed + 1);
57396+ assert("vs-1564", cinfo->removed_count > 0
57397+ && cinfo->removed_count != MAX_POS_IN_NODE);
57398+
57399+ freed =
57400+ kill_tail_f(params->from, data,
57401+ params->smallest_removed);
57402+
57403+ item_pos = cinfo->tail_removed;
57404+ ih = node40_ih_at(node, item_pos);
57405+ cinfo->freed_space_start =
57406+ ih40_get_offset(ih) + node40_item_length(node,
57407+ item_pos) -
57408+ freed;
57409+
57410+ /* call kill hook for all items removed completely */
57411+ if (is_cut == 0)
57412+ call_kill_hooks(node, cinfo->first_removed,
57413+ cinfo->removed_count, data);
57414+
57415+ item_pos += cinfo->removed_count;
57416+ ih -= cinfo->removed_count;
57417+ cinfo->freed_space_end =
57418+ ih40_get_offset(ih) + node40_item_length(node,
57419+ item_pos);
57420+ cinfo->first_moved = item_pos + 1;
57421+ break;
57422+
57423+ case CMODE_WHOLE | CMODE_HEAD:
57424+ /* one or more items get removed completely and one item gets cut partially from its head */
57425+ assert("vs-1568",
57426+ cinfo->first_removed == params->from->item_pos);
57427+ assert("vs-1564", cinfo->removed_count > 0
57428+ && cinfo->removed_count != MAX_POS_IN_NODE);
57429+ assert("vs-1569",
57430+ cinfo->head_removed ==
57431+ cinfo->first_removed + cinfo->removed_count);
57432+
57433+ /* call kill hook for all items removed completely */
57434+ if (is_cut == 0)
57435+ call_kill_hooks(node, cinfo->first_removed,
57436+ cinfo->removed_count, data);
57437+
57438+ item_pos = cinfo->first_removed;
57439+ ih = node40_ih_at(node, item_pos);
57440+
57441+ if (params->smallest_removed)
57442+ memcpy(params->smallest_removed, &ih->key,
57443+ sizeof(reiser4_key));
57444+
57445+ freed =
57446+ kill_head_f(params->to, data, NULL, &new_first_key);
57447+
57448+ cinfo->freed_space_start = ih40_get_offset(ih);
57449+
57450+ ih = node40_ih_at(node, cinfo->head_removed);
57451+ /* this is the most complex case. Item which got head removed and items which are to be moved
57452+ intact change their location differently. */
57453+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
57454+ cinfo->first_moved = cinfo->head_removed;
57455+ cinfo->head_removed_location = cinfo->freed_space_start;
57456+
57457+ /* item head is removed, therefore, item key changed */
57458+ coord.node = node;
57459+ coord_set_item_pos(&coord, cinfo->head_removed);
57460+ coord.unit_pos = 0;
57461+ coord.between = AT_UNIT;
57462+ update_item_key_node40(&coord, &new_first_key, NULL);
57463+
57464+ assert("vs-1579", cinfo->first_removed == 0);
57465+ /* key of first item of the node changes */
57466+ retval = 1;
57467+ break;
57468+
57469+ case CMODE_TAIL | CMODE_HEAD:
57470+ /* one item get cut from its end and its neighbor gets cut from its tail */
57471+ impossible("vs-1576", "this can not happen currently");
57472+ break;
57473+
57474+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
57475+ impossible("vs-1577", "this can not happen currently");
57476+ break;
57477+ default:
57478+ impossible("vs-1578", "unexpected cut mode");
57479+ break;
57480+ }
57481+ }
57482+ return retval;
57483+}
57484+
57485+/* plugin->u.node.kill
57486+ return value is number of items removed completely */
57487+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
57488+{
57489+ znode *node;
57490+ struct cut40_info cinfo;
57491+ int first_key_changed;
57492+
57493+ node = kdata->params.from->node;
57494+
57495+ first_key_changed =
57496+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
57497+ info);
57498+ compact(node, &cinfo);
57499+
57500+ if (info) {
57501+ /* it is not called by node40_shift, so we have to take care
57502+ of changes on upper levels */
57503+ if (node_is_empty(node)
57504+ && !(kdata->flags & DELETE_RETAIN_EMPTY))
57505+ /* all contents of node is deleted */
57506+ prepare_removal_node40(node, info);
57507+ else if (first_key_changed) {
57508+ prepare_for_update(NULL, node, info);
57509+ }
57510+ }
57511+
57512+ coord_clear_iplug(kdata->params.from);
57513+ coord_clear_iplug(kdata->params.to);
57514+
57515+ znode_make_dirty(node);
57516+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57517+}
57518+
57519+/* plugin->u.node.cut
57520+ return value is number of items removed completely */
57521+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
57522+{
57523+ znode *node;
57524+ struct cut40_info cinfo;
57525+ int first_key_changed;
57526+
57527+ node = cdata->params.from->node;
57528+
57529+ first_key_changed =
57530+ prepare_for_compact(&cinfo, &cdata->params, 1 /* not cut */ , cdata,
57531+ info);
57532+ compact(node, &cinfo);
57533+
57534+ if (info) {
57535+ /* it is not called by node40_shift, so we have to take care
57536+ of changes on upper levels */
57537+ if (node_is_empty(node))
57538+ /* all contents of node is deleted */
57539+ prepare_removal_node40(node, info);
57540+ else if (first_key_changed) {
57541+ prepare_for_update(NULL, node, info);
57542+ }
57543+ }
57544+
57545+ coord_clear_iplug(cdata->params.from);
57546+ coord_clear_iplug(cdata->params.to);
57547+
57548+ znode_make_dirty(node);
57549+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57550+}
57551+
57552+/* this structure is used by shift method of node40 plugin */
57553+struct shift_params {
57554+ shift_direction pend; /* when @pend == append - we are shifting to
57555+ left, when @pend == prepend - to right */
57556+ coord_t wish_stop; /* when shifting to left this is last unit we
57557+ want shifted, when shifting to right - this
57558+ is set to unit we want to start shifting
57559+ from */
57560+ znode *target;
57561+ int everything; /* it is set to 1 if everything we have to shift is
57562+ shifted, 0 - otherwise */
57563+
57564+ /* FIXME-VS: get rid of read_stop */
57565+
57566+ /* these are set by estimate_shift */
57567+ coord_t real_stop; /* this will be set to last unit which will be
57568+ really shifted */
57569+
57570+ /* coordinate in source node before operation of unit which becomes
57571+ first after shift to left of last after shift to right */
57572+ union {
57573+ coord_t future_first;
57574+ coord_t future_last;
57575+ } u;
57576+
57577+ unsigned merging_units; /* number of units of first item which have to
57578+ be merged with last item of target node */
57579+ unsigned merging_bytes; /* number of bytes in those units */
57580+
57581+ unsigned entire; /* items shifted in their entirety */
57582+ unsigned entire_bytes; /* number of bytes in those items */
57583+
57584+ unsigned part_units; /* number of units of partially copied item */
57585+ unsigned part_bytes; /* number of bytes in those units */
57586+
57587+ unsigned shift_bytes; /* total number of bytes in items shifted (item
57588+ headers not included) */
57589+
57590+};
57591+
57592+static int item_creation_overhead(coord_t *item)
57593+{
57594+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
57595+}
57596+
57597+/* how many units are there in @source starting from source->unit_pos
57598+ but not further than @stop_coord */
57599+static int
57600+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
57601+{
57602+ if (pend == SHIFT_LEFT) {
57603+ assert("vs-181", source->unit_pos == 0);
57604+ } else {
57605+ assert("vs-182",
57606+ source->unit_pos == coord_last_unit_pos(source));
57607+ }
57608+
57609+ if (source->item_pos != stop_coord->item_pos) {
57610+ /* @source and @stop_coord are different items */
57611+ return coord_last_unit_pos(source) + 1;
57612+ }
57613+
57614+ if (pend == SHIFT_LEFT) {
57615+ return stop_coord->unit_pos + 1;
57616+ } else {
57617+ return source->unit_pos - stop_coord->unit_pos + 1;
57618+ }
57619+}
57620+
57621+/* this calculates what can be copied from @shift->wish_stop.node to
57622+ @shift->target */
57623+static void
57624+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
57625+{
57626+ unsigned target_free_space, size;
57627+ pos_in_node_t stop_item; /* item which estimating should not consider */
57628+ unsigned want; /* number of units of item we want shifted */
57629+ coord_t source; /* item being estimated */
57630+ item_plugin *iplug;
57631+
57632+ /* shifting to left/right starts from first/last units of
57633+ @shift->wish_stop.node */
57634+ if (shift->pend == SHIFT_LEFT) {
57635+ coord_init_first_unit(&source, shift->wish_stop.node);
57636+ } else {
57637+ coord_init_last_unit(&source, shift->wish_stop.node);
57638+ }
57639+ shift->real_stop = source;
57640+
57641+ /* free space in target node and number of items in source */
57642+ target_free_space = znode_free_space(shift->target);
57643+
57644+ shift->everything = 0;
57645+ if (!node_is_empty(shift->target)) {
57646+ /* target node is not empty, check for boundary items
57647+ mergeability */
57648+ coord_t to;
57649+
57650+ /* item we try to merge @source with */
57651+ if (shift->pend == SHIFT_LEFT) {
57652+ coord_init_last_unit(&to, shift->target);
57653+ } else {
57654+ coord_init_first_unit(&to, shift->target);
57655+ }
57656+
57657+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
57658+ &source) :
57659+ are_items_mergeable(&source, &to)) {
57660+ /* how many units of @source do we want to merge to
57661+ item @to */
57662+ want =
57663+ wanted_units(&source, &shift->wish_stop,
57664+ shift->pend);
57665+
57666+ /* how many units of @source we can merge to item
57667+ @to */
57668+ iplug = item_plugin_by_coord(&source);
57669+ if (iplug->b.can_shift != NULL)
57670+ shift->merging_units =
57671+ iplug->b.can_shift(target_free_space,
57672+ &source, shift->target,
57673+ shift->pend, &size,
57674+ want);
57675+ else {
57676+ shift->merging_units = 0;
57677+ size = 0;
57678+ }
57679+ shift->merging_bytes = size;
57680+ shift->shift_bytes += size;
57681+ /* update stop coord to be set to last unit of @source
57682+ we can merge to @target */
57683+ if (shift->merging_units)
57684+ /* at least one unit can be shifted */
57685+ shift->real_stop.unit_pos =
57686+ (shift->merging_units - source.unit_pos -
57687+ 1) * shift->pend;
57688+ else {
57689+ /* nothing can be shifted */
57690+ if (shift->pend == SHIFT_LEFT)
57691+ coord_init_before_first_item(&shift->
57692+ real_stop,
57693+ source.
57694+ node);
57695+ else
57696+ coord_init_after_last_item(&shift->
57697+ real_stop,
57698+ source.node);
57699+ }
57700+ assert("nikita-2081", shift->real_stop.unit_pos + 1);
57701+
57702+ if (shift->merging_units != want) {
57703+ /* we could not copy as many as we want, so,
57704+ there is no reason for estimating any
57705+ longer */
57706+ return;
57707+ }
57708+
57709+ target_free_space -= size;
57710+ coord_add_item_pos(&source, shift->pend);
57711+ }
57712+ }
57713+
57714+ /* number of item nothing of which we want to shift */
57715+ stop_item = shift->wish_stop.item_pos + shift->pend;
57716+
57717+ /* calculate how many items can be copied into given free
57718+ space as whole */
57719+ for (; source.item_pos != stop_item;
57720+ coord_add_item_pos(&source, shift->pend)) {
57721+ if (shift->pend == SHIFT_RIGHT)
57722+ source.unit_pos = coord_last_unit_pos(&source);
57723+
57724+ /* how many units of @source do we want to copy */
57725+ want = wanted_units(&source, &shift->wish_stop, shift->pend);
57726+
57727+ if (want == coord_last_unit_pos(&source) + 1) {
57728+ /* we want this item to be copied entirely */
57729+ size =
57730+ item_length_by_coord(&source) +
57731+ item_creation_overhead(&source);
57732+ if (size <= target_free_space) {
57733+ /* item fits into target node as whole */
57734+ target_free_space -= size;
57735+ shift->shift_bytes +=
57736+ size - item_creation_overhead(&source);
57737+ shift->entire_bytes +=
57738+ size - item_creation_overhead(&source);
57739+ shift->entire++;
57740+
57741+ /* update shift->real_stop coord to be set to
57742+ last unit of @source we can merge to
57743+ @target */
57744+ shift->real_stop = source;
57745+ if (shift->pend == SHIFT_LEFT)
57746+ shift->real_stop.unit_pos =
57747+ coord_last_unit_pos(&shift->
57748+ real_stop);
57749+ else
57750+ shift->real_stop.unit_pos = 0;
57751+ continue;
57752+ }
57753+ }
57754+
57755+ /* we reach here only for an item which does not fit into
57756+ target node in its entirety. This item may be either
57757+ partially shifted, or not shifted at all. We will have to
57758+ create new item in target node, so decrease amout of free
57759+ space by an item creation overhead. We can reach here also
57760+ if stop coord is in this item */
57761+ if (target_free_space >=
57762+ (unsigned)item_creation_overhead(&source)) {
57763+ target_free_space -= item_creation_overhead(&source);
57764+ iplug = item_plugin_by_coord(&source);
57765+ if (iplug->b.can_shift) {
57766+ shift->part_units = iplug->b.can_shift(target_free_space,
57767+ &source,
57768+ NULL, /* target */
57769+ shift->pend,
57770+ &size,
57771+ want);
57772+ } else {
57773+ target_free_space = 0;
57774+ shift->part_units = 0;
57775+ size = 0;
57776+ }
57777+ } else {
57778+ target_free_space = 0;
57779+ shift->part_units = 0;
57780+ size = 0;
57781+ }
57782+ shift->part_bytes = size;
57783+ shift->shift_bytes += size;
57784+
57785+ /* set @shift->real_stop to last unit of @source we can merge
57786+ to @shift->target */
57787+ if (shift->part_units) {
57788+ shift->real_stop = source;
57789+ shift->real_stop.unit_pos =
57790+ (shift->part_units - source.unit_pos -
57791+ 1) * shift->pend;
57792+ assert("nikita-2082", shift->real_stop.unit_pos + 1);
57793+ }
57794+
57795+ if (want != shift->part_units)
57796+ /* not everything wanted were shifted */
57797+ return;
57798+ break;
57799+ }
57800+
57801+ shift->everything = 1;
57802+}
57803+
57804+static void
57805+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
57806+ shift_direction dir, unsigned free_space)
57807+{
57808+ item_plugin *iplug;
57809+
57810+ assert("nikita-1463", target != NULL);
57811+ assert("nikita-1464", source != NULL);
57812+ assert("nikita-1465", from + count <= coord_num_units(source));
57813+
57814+ iplug = item_plugin_by_coord(source);
57815+ assert("nikita-1468", iplug == item_plugin_by_coord(target));
57816+ iplug->b.copy_units(target, source, from, count, dir, free_space);
57817+
57818+ if (dir == SHIFT_RIGHT) {
57819+ /* FIXME-VS: this looks not necessary. update_item_key was
57820+ called already by copy_units method */
57821+ reiser4_key split_key;
57822+
57823+ assert("nikita-1469", target->unit_pos == 0);
57824+
57825+ unit_key_by_coord(target, &split_key);
57826+ node_plugin_by_coord(target)->update_item_key(target,
57827+ &split_key, NULL);
57828+ }
57829+}
57830+
57831+/* copy part of @shift->real_stop.node starting either from its beginning or
57832+ from its end and ending at @shift->real_stop to either the end or the
57833+ beginning of @shift->target */
57834+static void copy(struct shift_params *shift)
57835+{
57836+ node40_header *nh;
57837+ coord_t from;
57838+ coord_t to;
57839+ item_header40 *from_ih, *to_ih;
57840+ int free_space_start;
57841+ int new_items;
57842+ unsigned old_items;
57843+ int old_offset;
57844+ unsigned i;
57845+
57846+ nh = node40_node_header(shift->target);
57847+ free_space_start = nh40_get_free_space_start(nh);
57848+ old_items = nh40_get_num_items(nh);
57849+ new_items = shift->entire + (shift->part_units ? 1 : 0);
57850+ assert("vs-185",
57851+ shift->shift_bytes ==
57852+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
57853+
57854+ from = shift->wish_stop;
57855+
57856+ coord_init_first_unit(&to, shift->target);
57857+
57858+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
57859+ hence to.between is set to EMPTY_NODE above. Looks like we want it
57860+ to be AT_UNIT.
57861+
57862+ Oh, wonders of ->betweeness...
57863+
57864+ */
57865+ to.between = AT_UNIT;
57866+
57867+ if (shift->pend == SHIFT_LEFT) {
57868+ /* copying to left */
57869+
57870+ coord_set_item_pos(&from, 0);
57871+ from_ih = node40_ih_at(from.node, 0);
57872+
57873+ coord_set_item_pos(&to,
57874+ node40_num_of_items_internal(to.node) - 1);
57875+ if (shift->merging_units) {
57876+ /* expand last item, so that plugin methods will see
57877+ correct data */
57878+ free_space_start += shift->merging_bytes;
57879+ nh40_set_free_space_start(nh,
57880+ (unsigned)free_space_start);
57881+ nh40_set_free_space(nh,
57882+ nh40_get_free_space(nh) -
57883+ shift->merging_bytes);
57884+
57885+ /* appending last item of @target */
57886+ copy_units(&to, &from, 0, /* starting from 0-th unit */
57887+ shift->merging_units, SHIFT_LEFT,
57888+ shift->merging_bytes);
57889+ coord_inc_item_pos(&from);
57890+ from_ih--;
57891+ coord_inc_item_pos(&to);
57892+ }
57893+
57894+ to_ih = node40_ih_at(shift->target, old_items);
57895+ if (shift->entire) {
57896+ /* copy @entire items entirely */
57897+
57898+ /* copy item headers */
57899+ memcpy(to_ih - shift->entire + 1,
57900+ from_ih - shift->entire + 1,
57901+ shift->entire * sizeof(item_header40));
57902+ /* update item header offset */
57903+ old_offset = ih40_get_offset(from_ih);
57904+ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
57905+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
57906+ ih40_set_offset(to_ih,
57907+ ih40_get_offset(from_ih) -
57908+ old_offset + free_space_start);
57909+
57910+ /* copy item bodies */
57911+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
57912+ shift->entire_bytes);
57913+
57914+ coord_add_item_pos(&from, (int)shift->entire);
57915+ coord_add_item_pos(&to, (int)shift->entire);
57916+ }
57917+
57918+ nh40_set_free_space_start(nh,
57919+ free_space_start +
57920+ shift->shift_bytes -
57921+ shift->merging_bytes);
57922+ nh40_set_free_space(nh,
57923+ nh40_get_free_space(nh) -
57924+ (shift->shift_bytes - shift->merging_bytes +
57925+ sizeof(item_header40) * new_items));
57926+
57927+ /* update node header */
57928+ node40_set_num_items(shift->target, nh, old_items + new_items);
57929+ assert("vs-170",
57930+ nh40_get_free_space(nh) < znode_size(shift->target));
57931+
57932+ if (shift->part_units) {
57933+ /* copy heading part (@part units) of @source item as
57934+ a new item into @target->node */
57935+
57936+ /* copy item header of partially copied item */
57937+ coord_set_item_pos(&to,
57938+ node40_num_of_items_internal(to.node)
57939+ - 1);
57940+ memcpy(to_ih, from_ih, sizeof(item_header40));
57941+ ih40_set_offset(to_ih,
57942+ nh40_get_free_space_start(nh) -
57943+ shift->part_bytes);
57944+ if (item_plugin_by_coord(&to)->b.init)
57945+ item_plugin_by_coord(&to)->b.init(&to, &from,
57946+ NULL);
57947+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
57948+ shift->part_bytes);
57949+ }
57950+
57951+ } else {
57952+ /* copying to right */
57953+
57954+ coord_set_item_pos(&from,
57955+ node40_num_of_items_internal(from.node) - 1);
57956+ from_ih = node40_ih_at_coord(&from);
57957+
57958+ coord_set_item_pos(&to, 0);
57959+
57960+ /* prepare space for new items */
57961+ memmove(zdata(to.node) + sizeof(node40_header) +
57962+ shift->shift_bytes,
57963+ zdata(to.node) + sizeof(node40_header),
57964+ free_space_start - sizeof(node40_header));
57965+ /* update item headers of moved items */
57966+ to_ih = node40_ih_at(to.node, 0);
57967+ /* first item gets @merging_bytes longer. free space appears
57968+ at its beginning */
57969+ if (!node_is_empty(to.node))
57970+ ih40_set_offset(to_ih,
57971+ ih40_get_offset(to_ih) +
57972+ shift->shift_bytes -
57973+ shift->merging_bytes);
57974+
57975+ for (i = 1; i < old_items; i++)
57976+ ih40_set_offset(to_ih - i,
57977+ ih40_get_offset(to_ih - i) +
57978+ shift->shift_bytes);
57979+
57980+ /* move item headers to make space for new items */
57981+ memmove(to_ih - old_items + 1 - new_items,
57982+ to_ih - old_items + 1,
57983+ sizeof(item_header40) * old_items);
57984+ to_ih -= (new_items - 1);
57985+
57986+ nh40_set_free_space_start(nh,
57987+ free_space_start +
57988+ shift->shift_bytes);
57989+ nh40_set_free_space(nh,
57990+ nh40_get_free_space(nh) -
57991+ (shift->shift_bytes +
57992+ sizeof(item_header40) * new_items));
57993+
57994+ /* update node header */
57995+ node40_set_num_items(shift->target, nh, old_items + new_items);
57996+ assert("vs-170",
57997+ nh40_get_free_space(nh) < znode_size(shift->target));
57998+
57999+ if (shift->merging_units) {
58000+ coord_add_item_pos(&to, new_items);
58001+ to.unit_pos = 0;
58002+ to.between = AT_UNIT;
58003+ /* prepend first item of @to */
58004+ copy_units(&to, &from,
58005+ coord_last_unit_pos(&from) -
58006+ shift->merging_units + 1,
58007+ shift->merging_units, SHIFT_RIGHT,
58008+ shift->merging_bytes);
58009+ coord_dec_item_pos(&from);
58010+ from_ih++;
58011+ }
58012+
58013+ if (shift->entire) {
58014+ /* copy @entire items entirely */
58015+
58016+ /* copy item headers */
58017+ memcpy(to_ih, from_ih,
58018+ shift->entire * sizeof(item_header40));
58019+
58020+ /* update item header offset */
58021+ old_offset =
58022+ ih40_get_offset(from_ih + shift->entire - 1);
58023+ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
58024+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
58025+ ih40_set_offset(to_ih,
58026+ ih40_get_offset(from_ih) -
58027+ old_offset +
58028+ sizeof(node40_header) +
58029+ shift->part_bytes);
58030+ /* copy item bodies */
58031+ coord_add_item_pos(&from, -(int)(shift->entire - 1));
58032+ memcpy(zdata(to.node) + sizeof(node40_header) +
58033+ shift->part_bytes, item_by_coord_node40(&from),
58034+ shift->entire_bytes);
58035+ coord_dec_item_pos(&from);
58036+ }
58037+
58038+ if (shift->part_units) {
58039+ coord_set_item_pos(&to, 0);
58040+ to.unit_pos = 0;
58041+ to.between = AT_UNIT;
58042+ /* copy heading part (@part units) of @source item as
58043+ a new item into @target->node */
58044+
58045+ /* copy item header of partially copied item */
58046+ memcpy(to_ih, from_ih, sizeof(item_header40));
58047+ ih40_set_offset(to_ih, sizeof(node40_header));
58048+ if (item_plugin_by_coord(&to)->b.init)
58049+ item_plugin_by_coord(&to)->b.init(&to, &from,
58050+ NULL);
58051+ copy_units(&to, &from,
58052+ coord_last_unit_pos(&from) -
58053+ shift->part_units + 1, shift->part_units,
58054+ SHIFT_RIGHT, shift->part_bytes);
58055+ }
58056+ }
58057+}
58058+
58059+/* remove everything either before or after @fact_stop. Number of items
58060+ removed completely is returned */
58061+static int delete_copied(struct shift_params *shift)
58062+{
58063+ coord_t from;
58064+ coord_t to;
58065+ struct carry_cut_data cdata;
58066+
58067+ if (shift->pend == SHIFT_LEFT) {
58068+ /* we were shifting to left, remove everything from the
58069+ beginning of @shift->wish_stop->node upto
58070+ @shift->wish_stop */
58071+ coord_init_first_unit(&from, shift->real_stop.node);
58072+ to = shift->real_stop;
58073+
58074+ /* store old coordinate of unit which will be first after
58075+ shift to left */
58076+ shift->u.future_first = to;
58077+ coord_next_unit(&shift->u.future_first);
58078+ } else {
58079+ /* we were shifting to right, remove everything from
58080+ @shift->stop_coord upto to end of
58081+ @shift->stop_coord->node */
58082+ from = shift->real_stop;
58083+ coord_init_last_unit(&to, from.node);
58084+
58085+ /* store old coordinate of unit which will be last after
58086+ shift to right */
58087+ shift->u.future_last = from;
58088+ coord_prev_unit(&shift->u.future_last);
58089+ }
58090+
58091+ cdata.params.from = &from;
58092+ cdata.params.to = &to;
58093+ cdata.params.from_key = NULL;
58094+ cdata.params.to_key = NULL;
58095+ cdata.params.smallest_removed = NULL;
58096+ return cut_node40(&cdata, NULL);
58097+}
58098+
58099+/* something was moved between @left and @right. Add carry operation to @info
58100+ list to have carry to update delimiting key between them */
58101+static int
58102+prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
58103+{
58104+ carry_op *op;
58105+ carry_node *cn;
58106+
58107+ if (info == NULL)
58108+ /* nowhere to send operation to. */
58109+ return 0;
58110+
58111+ if (!should_notify_parent(right))
58112+ return 0;
58113+
58114+ op = node_post_carry(info, COP_UPDATE, right, 1);
58115+ if (IS_ERR(op) || op == NULL)
58116+ return op ? PTR_ERR(op) : -EIO;
58117+
58118+ if (left != NULL) {
58119+ carry_node *reference;
58120+
58121+ if (info->doing)
58122+ reference = insert_carry_node(info->doing,
58123+ info->todo, left);
58124+ else
58125+ reference = op->node;
58126+ assert("nikita-2992", reference != NULL);
58127+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
58128+ if (IS_ERR(cn))
58129+ return PTR_ERR(cn);
58130+ cn->parent = 1;
58131+ cn->node = left;
58132+ if (ZF_ISSET(left, JNODE_ORPHAN))
58133+ cn->left_before = 1;
58134+ op->u.update.left = cn;
58135+ } else
58136+ op->u.update.left = NULL;
58137+ return 0;
58138+}
58139+
58140+/* plugin->u.node.prepare_removal
58141+ to delete a pointer to @empty from the tree add corresponding carry
58142+ operation (delete) to @info list */
58143+int prepare_removal_node40(znode * empty, carry_plugin_info * info)
58144+{
58145+ carry_op *op;
58146+ reiser4_tree *tree;
58147+
58148+ if (!should_notify_parent(empty))
58149+ return 0;
58150+ /* already on a road to Styx */
58151+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
58152+ return 0;
58153+ op = node_post_carry(info, COP_DELETE, empty, 1);
58154+ if (IS_ERR(op) || op == NULL)
58155+ return RETERR(op ? PTR_ERR(op) : -EIO);
58156+
58157+ op->u.delete.child = NULL;
58158+ op->u.delete.flags = 0;
58159+
58160+ /* fare thee well */
58161+ tree = znode_get_tree(empty);
58162+ read_lock_tree(tree);
58163+ write_lock_dk(tree);
58164+ znode_set_ld_key(empty, znode_get_rd_key(empty));
58165+ if (znode_is_left_connected(empty) && empty->left)
58166+ znode_set_rd_key(empty->left, znode_get_rd_key(empty));
58167+ write_unlock_dk(tree);
58168+ read_unlock_tree(tree);
58169+
58170+ ZF_SET(empty, JNODE_HEARD_BANSHEE);
58171+ return 0;
58172+}
58173+
58174+/* something were shifted from @insert_coord->node to @shift->target, update
58175+ @insert_coord correspondingly */
58176+static void
58177+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
58178+ int including_insert_coord)
58179+{
58180+ /* item plugin was invalidated by shifting */
58181+ coord_clear_iplug(insert_coord);
58182+
58183+ if (node_is_empty(shift->wish_stop.node)) {
58184+ assert("vs-242", shift->everything);
58185+ if (including_insert_coord) {
58186+ if (shift->pend == SHIFT_RIGHT) {
58187+ /* set @insert_coord before first unit of
58188+ @shift->target node */
58189+ coord_init_before_first_item(insert_coord,
58190+ shift->target);
58191+ } else {
58192+ /* set @insert_coord after last in target node */
58193+ coord_init_after_last_item(insert_coord,
58194+ shift->target);
58195+ }
58196+ } else {
58197+ /* set @insert_coord inside of empty node. There is
58198+ only one possible coord within an empty
58199+ node. init_first_unit will set that coord */
58200+ coord_init_first_unit(insert_coord,
58201+ shift->wish_stop.node);
58202+ }
58203+ return;
58204+ }
58205+
58206+ if (shift->pend == SHIFT_RIGHT) {
58207+ /* there was shifting to right */
58208+ if (shift->everything) {
58209+ /* everything wanted was shifted */
58210+ if (including_insert_coord) {
58211+ /* @insert_coord is set before first unit of
58212+ @to node */
58213+ coord_init_before_first_item(insert_coord,
58214+ shift->target);
58215+ insert_coord->between = BEFORE_UNIT;
58216+ } else {
58217+ /* @insert_coord is set after last unit of
58218+ @insert->node */
58219+ coord_init_last_unit(insert_coord,
58220+ shift->wish_stop.node);
58221+ insert_coord->between = AFTER_UNIT;
58222+ }
58223+ }
58224+ return;
58225+ }
58226+
58227+ /* there was shifting to left */
58228+ if (shift->everything) {
58229+ /* everything wanted was shifted */
58230+ if (including_insert_coord) {
58231+ /* @insert_coord is set after last unit in @to node */
58232+ coord_init_after_last_item(insert_coord, shift->target);
58233+ } else {
58234+ /* @insert_coord is set before first unit in the same
58235+ node */
58236+ coord_init_before_first_item(insert_coord,
58237+ shift->wish_stop.node);
58238+ }
58239+ return;
58240+ }
58241+
58242+ /* FIXME-VS: the code below is complicated because with between ==
58243+ AFTER_ITEM unit_pos is set to 0 */
58244+
58245+ if (!removed) {
58246+ /* no items were shifted entirely */
58247+ assert("vs-195", shift->merging_units == 0
58248+ || shift->part_units == 0);
58249+
58250+ if (shift->real_stop.item_pos == insert_coord->item_pos) {
58251+ if (shift->merging_units) {
58252+ if (insert_coord->between == AFTER_UNIT) {
58253+ assert("nikita-1441",
58254+ insert_coord->unit_pos >=
58255+ shift->merging_units);
58256+ insert_coord->unit_pos -=
58257+ shift->merging_units;
58258+ } else if (insert_coord->between == BEFORE_UNIT) {
58259+ assert("nikita-2090",
58260+ insert_coord->unit_pos >
58261+ shift->merging_units);
58262+ insert_coord->unit_pos -=
58263+ shift->merging_units;
58264+ }
58265+
58266+ assert("nikita-2083",
58267+ insert_coord->unit_pos + 1);
58268+ } else {
58269+ if (insert_coord->between == AFTER_UNIT) {
58270+ assert("nikita-1442",
58271+ insert_coord->unit_pos >=
58272+ shift->part_units);
58273+ insert_coord->unit_pos -=
58274+ shift->part_units;
58275+ } else if (insert_coord->between == BEFORE_UNIT) {
58276+ assert("nikita-2089",
58277+ insert_coord->unit_pos >
58278+ shift->part_units);
58279+ insert_coord->unit_pos -=
58280+ shift->part_units;
58281+ }
58282+
58283+ assert("nikita-2084",
58284+ insert_coord->unit_pos + 1);
58285+ }
58286+ }
58287+ return;
58288+ }
58289+
58290+ /* we shifted to left and there was no enough space for everything */
58291+ switch (insert_coord->between) {
58292+ case AFTER_UNIT:
58293+ case BEFORE_UNIT:
58294+ if (shift->real_stop.item_pos == insert_coord->item_pos)
58295+ insert_coord->unit_pos -= shift->part_units;
58296+ case AFTER_ITEM:
58297+ coord_add_item_pos(insert_coord, -removed);
58298+ break;
58299+ default:
58300+ impossible("nikita-2087", "not ready");
58301+ }
58302+ assert("nikita-2085", insert_coord->unit_pos + 1);
58303+}
58304+
58305+static int call_shift_hooks(struct shift_params *shift)
58306+{
58307+ unsigned i, shifted;
58308+ coord_t coord;
58309+ item_plugin *iplug;
58310+
58311+ assert("vs-275", !node_is_empty(shift->target));
58312+
58313+ /* number of items shift touches */
58314+ shifted =
58315+ shift->entire + (shift->merging_units ? 1 : 0) +
58316+ (shift->part_units ? 1 : 0);
58317+
58318+ if (shift->pend == SHIFT_LEFT) {
58319+ /* moved items are at the end */
58320+ coord_init_last_unit(&coord, shift->target);
58321+ coord.unit_pos = 0;
58322+
58323+ assert("vs-279", shift->pend == 1);
58324+ for (i = 0; i < shifted; i++) {
58325+ unsigned from, count;
58326+
58327+ iplug = item_plugin_by_coord(&coord);
58328+ if (i == 0 && shift->part_units) {
58329+ assert("vs-277",
58330+ coord_num_units(&coord) ==
58331+ shift->part_units);
58332+ count = shift->part_units;
58333+ from = 0;
58334+ } else if (i == shifted - 1 && shift->merging_units) {
58335+ count = shift->merging_units;
58336+ from = coord_num_units(&coord) - count;
58337+ } else {
58338+ count = coord_num_units(&coord);
58339+ from = 0;
58340+ }
58341+
58342+ if (iplug->b.shift_hook) {
58343+ iplug->b.shift_hook(&coord, from, count,
58344+ shift->wish_stop.node);
58345+ }
58346+ coord_add_item_pos(&coord, -shift->pend);
58347+ }
58348+ } else {
58349+ /* moved items are at the beginning */
58350+ coord_init_first_unit(&coord, shift->target);
58351+
58352+ assert("vs-278", shift->pend == -1);
58353+ for (i = 0; i < shifted; i++) {
58354+ unsigned from, count;
58355+
58356+ iplug = item_plugin_by_coord(&coord);
58357+ if (i == 0 && shift->part_units) {
58358+ assert("vs-277",
58359+ coord_num_units(&coord) ==
58360+ shift->part_units);
58361+ count = coord_num_units(&coord);
58362+ from = 0;
58363+ } else if (i == shifted - 1 && shift->merging_units) {
58364+ count = shift->merging_units;
58365+ from = 0;
58366+ } else {
58367+ count = coord_num_units(&coord);
58368+ from = 0;
58369+ }
58370+
58371+ if (iplug->b.shift_hook) {
58372+ iplug->b.shift_hook(&coord, from, count,
58373+ shift->wish_stop.node);
58374+ }
58375+ coord_add_item_pos(&coord, -shift->pend);
58376+ }
58377+ }
58378+
58379+ return 0;
58380+}
58381+
58382+/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
58383+static int
58384+unit_moved_left(const struct shift_params *shift, const coord_t * old)
58385+{
58386+ assert("vs-944", shift->real_stop.node == old->node);
58387+
58388+ if (shift->real_stop.item_pos < old->item_pos)
58389+ return 0;
58390+ if (shift->real_stop.item_pos == old->item_pos) {
58391+ if (shift->real_stop.unit_pos < old->unit_pos)
58392+ return 0;
58393+ }
58394+ return 1;
58395+}
58396+
58397+/* shift to right is completed. Return 1 if unit @old was moved to right
58398+ neighbor */
58399+static int
58400+unit_moved_right(const struct shift_params *shift, const coord_t * old)
58401+{
58402+ assert("vs-944", shift->real_stop.node == old->node);
58403+
58404+ if (shift->real_stop.item_pos > old->item_pos)
58405+ return 0;
58406+ if (shift->real_stop.item_pos == old->item_pos) {
58407+ if (shift->real_stop.unit_pos > old->unit_pos)
58408+ return 0;
58409+ }
58410+ return 1;
58411+}
58412+
58413+/* coord @old was set in node from which shift was performed. What was shifted
58414+ is stored in @shift. Update @old correspondingly to performed shift */
58415+static coord_t *adjust_coord2(const struct shift_params *shift,
58416+ const coord_t * old, coord_t * new)
58417+{
58418+ coord_clear_iplug(new);
58419+ new->between = old->between;
58420+
58421+ coord_clear_iplug(new);
58422+ if (old->node == shift->target) {
58423+ if (shift->pend == SHIFT_LEFT) {
58424+ /* coord which is set inside of left neighbor does not
58425+ change during shift to left */
58426+ coord_dup(new, old);
58427+ return new;
58428+ }
58429+ new->node = old->node;
58430+ coord_set_item_pos(new,
58431+ old->item_pos + shift->entire +
58432+ (shift->part_units ? 1 : 0));
58433+ new->unit_pos = old->unit_pos;
58434+ if (old->item_pos == 0 && shift->merging_units)
58435+ new->unit_pos += shift->merging_units;
58436+ return new;
58437+ }
58438+
58439+ assert("vs-977", old->node == shift->wish_stop.node);
58440+ if (shift->pend == SHIFT_LEFT) {
58441+ if (unit_moved_left(shift, old)) {
58442+ /* unit @old moved to left neighbor. Calculate its
58443+ coordinate there */
58444+ new->node = shift->target;
58445+ coord_set_item_pos(new,
58446+ node_num_items(shift->target) -
58447+ shift->entire -
58448+ (shift->part_units ? 1 : 0) +
58449+ old->item_pos);
58450+
58451+ new->unit_pos = old->unit_pos;
58452+ if (shift->merging_units) {
58453+ coord_dec_item_pos(new);
58454+ if (old->item_pos == 0) {
58455+ /* unit_pos only changes if item got
58456+ merged */
58457+ new->unit_pos =
58458+ coord_num_units(new) -
58459+ (shift->merging_units -
58460+ old->unit_pos);
58461+ }
58462+ }
58463+ } else {
58464+ /* unit @old did not move to left neighbor.
58465+
58466+ Use _nocheck, because @old is outside of its node.
58467+ */
58468+ coord_dup_nocheck(new, old);
58469+ coord_add_item_pos(new,
58470+ -shift->u.future_first.item_pos);
58471+ if (new->item_pos == 0)
58472+ new->unit_pos -= shift->u.future_first.unit_pos;
58473+ }
58474+ } else {
58475+ if (unit_moved_right(shift, old)) {
58476+ /* unit @old moved to right neighbor */
58477+ new->node = shift->target;
58478+ coord_set_item_pos(new,
58479+ old->item_pos -
58480+ shift->real_stop.item_pos);
58481+ if (new->item_pos == 0) {
58482+ /* unit @old might change unit pos */
58483+ coord_set_item_pos(new,
58484+ old->unit_pos -
58485+ shift->real_stop.unit_pos);
58486+ }
58487+ } else {
58488+ /* unit @old did not move to right neighbor, therefore
58489+ it did not change */
58490+ coord_dup(new, old);
58491+ }
58492+ }
58493+ coord_set_iplug(new, item_plugin_by_coord(new));
58494+ return new;
58495+}
58496+
58497+/* this is called when shift is completed (something of source node is copied
58498+ to target and deleted in source) to update all taps set in current
58499+ context */
58500+static void update_taps(const struct shift_params *shift)
58501+{
58502+ tap_t *tap;
58503+ coord_t new;
58504+
58505+ for_all_taps(tap) {
58506+ /* update only taps set to nodes participating in shift */
58507+ if (tap->coord->node == shift->wish_stop.node
58508+ || tap->coord->node == shift->target)
58509+ tap_to_coord(tap,
58510+ adjust_coord2(shift, tap->coord, &new));
58511+ }
58512+}
58513+
58514+#if REISER4_DEBUG
58515+
58516+struct shift_check {
58517+ reiser4_key key;
58518+ __u16 plugin_id;
58519+ union {
58520+ __u64 bytes;
58521+ __u64 entries;
58522+ void *unused;
58523+ } u;
58524+};
58525+
58526+void *shift_check_prepare(const znode * left, const znode * right)
58527+{
58528+ pos_in_node_t i, nr_items;
58529+ int mergeable;
58530+ struct shift_check *data;
58531+ item_header40 *ih;
58532+
58533+ if (node_is_empty(left) || node_is_empty(right))
58534+ mergeable = 0;
58535+ else {
58536+ coord_t l, r;
58537+
58538+ coord_init_last_unit(&l, left);
58539+ coord_init_first_unit(&r, right);
58540+ mergeable = are_items_mergeable(&l, &r);
58541+ }
58542+ nr_items =
58543+ node40_num_of_items_internal(left) +
58544+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58545+ data =
58546+ kmalloc(sizeof(struct shift_check) * nr_items,
58547+ reiser4_ctx_gfp_mask_get());
58548+ if (data != NULL) {
58549+ coord_t coord;
58550+ pos_in_node_t item_pos;
58551+
58552+ coord_init_first_unit(&coord, left);
58553+ i = 0;
58554+
58555+ for (item_pos = 0;
58556+ item_pos < node40_num_of_items_internal(left);
58557+ item_pos++) {
58558+
58559+ coord_set_item_pos(&coord, item_pos);
58560+ ih = node40_ih_at_coord(&coord);
58561+
58562+ data[i].key = ih->key;
58563+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58564+ switch (data[i].plugin_id) {
58565+ case CTAIL_ID:
58566+ case FORMATTING_ID:
58567+ data[i].u.bytes = coord_num_units(&coord);
58568+ break;
58569+ case EXTENT_POINTER_ID:
58570+ data[i].u.bytes =
58571+ reiser4_extent_size(&coord,
58572+ coord_num_units(&coord));
58573+ break;
58574+ case COMPOUND_DIR_ID:
58575+ data[i].u.entries = coord_num_units(&coord);
58576+ break;
58577+ default:
58578+ data[i].u.unused = NULL;
58579+ break;
58580+ }
58581+ i++;
58582+ }
58583+
58584+ coord_init_first_unit(&coord, right);
58585+
58586+ if (mergeable) {
58587+ assert("vs-1609", i != 0);
58588+
58589+ ih = node40_ih_at_coord(&coord);
58590+
58591+ assert("vs-1589",
58592+ data[i - 1].plugin_id ==
58593+ le16_to_cpu(get_unaligned(&ih->plugin_id)));
58594+ switch (data[i - 1].plugin_id) {
58595+ case CTAIL_ID:
58596+ case FORMATTING_ID:
58597+ data[i - 1].u.bytes += coord_num_units(&coord);
58598+ break;
58599+ case EXTENT_POINTER_ID:
58600+ data[i - 1].u.bytes +=
58601+ reiser4_extent_size(&coord,
58602+ coord_num_units(&coord));
58603+ break;
58604+ case COMPOUND_DIR_ID:
58605+ data[i - 1].u.entries +=
58606+ coord_num_units(&coord);
58607+ break;
58608+ default:
58609+ impossible("vs-1605", "wrong mergeable item");
58610+ break;
58611+ }
58612+ item_pos = 1;
58613+ } else
58614+ item_pos = 0;
58615+ for (; item_pos < node40_num_of_items_internal(right);
58616+ item_pos++) {
58617+
58618+ assert("vs-1604", i < nr_items);
58619+ coord_set_item_pos(&coord, item_pos);
58620+ ih = node40_ih_at_coord(&coord);
58621+
58622+ data[i].key = ih->key;
58623+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58624+ switch (data[i].plugin_id) {
58625+ case CTAIL_ID:
58626+ case FORMATTING_ID:
58627+ data[i].u.bytes = coord_num_units(&coord);
58628+ break;
58629+ case EXTENT_POINTER_ID:
58630+ data[i].u.bytes =
58631+ reiser4_extent_size(&coord,
58632+ coord_num_units(&coord));
58633+ break;
58634+ case COMPOUND_DIR_ID:
58635+ data[i].u.entries = coord_num_units(&coord);
58636+ break;
58637+ default:
58638+ data[i].u.unused = NULL;
58639+ break;
58640+ }
58641+ i++;
58642+ }
58643+ assert("vs-1606", i == nr_items);
58644+ }
58645+ return data;
58646+}
58647+
58648+void shift_check(void *vp, const znode * left, const znode * right)
58649+{
58650+ pos_in_node_t i, nr_items;
58651+ coord_t coord;
58652+ __u64 last_bytes;
58653+ int mergeable;
58654+ item_header40 *ih;
58655+ pos_in_node_t item_pos;
58656+ struct shift_check *data;
58657+
58658+ data = (struct shift_check *)vp;
58659+
58660+ if (data == NULL)
58661+ return;
58662+
58663+ if (node_is_empty(left) || node_is_empty(right))
58664+ mergeable = 0;
58665+ else {
58666+ coord_t l, r;
58667+
58668+ coord_init_last_unit(&l, left);
58669+ coord_init_first_unit(&r, right);
58670+ mergeable = are_items_mergeable(&l, &r);
58671+ }
58672+
58673+ nr_items =
58674+ node40_num_of_items_internal(left) +
58675+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58676+
58677+ i = 0;
58678+ last_bytes = 0;
58679+
58680+ coord_init_first_unit(&coord, left);
58681+
58682+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
58683+ item_pos++) {
58684+
58685+ coord_set_item_pos(&coord, item_pos);
58686+ ih = node40_ih_at_coord(&coord);
58687+
58688+ assert("vs-1611", i == item_pos);
58689+ assert("vs-1590", keyeq(&ih->key, &data[i].key));
58690+ assert("vs-1591",
58691+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58692+ if ((i < (node40_num_of_items_internal(left) - 1))
58693+ || !mergeable) {
58694+ switch (data[i].plugin_id) {
58695+ case CTAIL_ID:
58696+ case FORMATTING_ID:
58697+ assert("vs-1592",
58698+ data[i].u.bytes ==
58699+ coord_num_units(&coord));
58700+ break;
58701+ case EXTENT_POINTER_ID:
58702+ assert("vs-1593",
58703+ data[i].u.bytes ==
58704+ reiser4_extent_size(&coord,
58705+ coord_num_units
58706+ (&coord)));
58707+ break;
58708+ case COMPOUND_DIR_ID:
58709+ assert("vs-1594",
58710+ data[i].u.entries ==
58711+ coord_num_units(&coord));
58712+ break;
58713+ default:
58714+ break;
58715+ }
58716+ }
58717+ if (item_pos == (node40_num_of_items_internal(left) - 1)
58718+ && mergeable) {
58719+ switch (data[i].plugin_id) {
58720+ case CTAIL_ID:
58721+ case FORMATTING_ID:
58722+ last_bytes = coord_num_units(&coord);
58723+ break;
58724+ case EXTENT_POINTER_ID:
58725+ last_bytes =
58726+ reiser4_extent_size(&coord,
58727+ coord_num_units(&coord));
58728+ break;
58729+ case COMPOUND_DIR_ID:
58730+ last_bytes = coord_num_units(&coord);
58731+ break;
58732+ default:
58733+ impossible("vs-1595", "wrong mergeable item");
58734+ break;
58735+ }
58736+ }
58737+ i++;
58738+ }
58739+
58740+ coord_init_first_unit(&coord, right);
58741+ if (mergeable) {
58742+ ih = node40_ih_at_coord(&coord);
58743+
58744+ assert("vs-1589",
58745+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
58746+ assert("vs-1608", last_bytes != 0);
58747+ switch (data[i - 1].plugin_id) {
58748+ case CTAIL_ID:
58749+ case FORMATTING_ID:
58750+ assert("vs-1596",
58751+ data[i - 1].u.bytes ==
58752+ last_bytes + coord_num_units(&coord));
58753+ break;
58754+
58755+ case EXTENT_POINTER_ID:
58756+ assert("vs-1597",
58757+ data[i - 1].u.bytes ==
58758+ last_bytes + reiser4_extent_size(&coord,
58759+ coord_num_units
58760+ (&coord)));
58761+ break;
58762+
58763+ case COMPOUND_DIR_ID:
58764+ assert("vs-1598",
58765+ data[i - 1].u.bytes ==
58766+ last_bytes + coord_num_units(&coord));
58767+ break;
58768+ default:
58769+ impossible("vs-1599", "wrong mergeable item");
58770+ break;
58771+ }
58772+ item_pos = 1;
58773+ } else
58774+ item_pos = 0;
58775+
58776+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
58777+
58778+ coord_set_item_pos(&coord, item_pos);
58779+ ih = node40_ih_at_coord(&coord);
58780+
58781+ assert("vs-1612", keyeq(&ih->key, &data[i].key));
58782+ assert("vs-1613",
58783+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58784+ switch (data[i].plugin_id) {
58785+ case CTAIL_ID:
58786+ case FORMATTING_ID:
58787+ assert("vs-1600",
58788+ data[i].u.bytes == coord_num_units(&coord));
58789+ break;
58790+ case EXTENT_POINTER_ID:
58791+ assert("vs-1601",
58792+ data[i].u.bytes ==
58793+ reiser4_extent_size(&coord,
58794+ coord_num_units
58795+ (&coord)));
58796+ break;
58797+ case COMPOUND_DIR_ID:
58798+ assert("vs-1602",
58799+ data[i].u.entries == coord_num_units(&coord));
58800+ break;
58801+ default:
58802+ break;
58803+ }
58804+ i++;
58805+ }
58806+
58807+ assert("vs-1603", i == nr_items);
58808+ kfree(data);
58809+}
58810+
58811+#endif
58812+
58813+/* plugin->u.node.shift
58814+ look for description of this method in plugin/node/node.h */
58815+int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child, /* if @from->node becomes empty - it will be
58816+ deleted from the tree if this is set to 1 */
58817+ int including_stop_coord, carry_plugin_info * info)
58818+{
58819+ struct shift_params shift;
58820+ int result;
58821+ znode *left, *right;
58822+ znode *source;
58823+ int target_empty;
58824+
58825+ assert("nikita-2161", coord_check(from));
58826+
58827+ memset(&shift, 0, sizeof(shift));
58828+ shift.pend = pend;
58829+ shift.wish_stop = *from;
58830+ shift.target = to;
58831+
58832+ assert("nikita-1473", znode_is_write_locked(from->node));
58833+ assert("nikita-1474", znode_is_write_locked(to));
58834+
58835+ source = from->node;
58836+
58837+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want
58838+ shifted */
58839+ if (pend == SHIFT_LEFT) {
58840+ result = coord_set_to_left(&shift.wish_stop);
58841+ left = to;
58842+ right = from->node;
58843+ } else {
58844+ result = coord_set_to_right(&shift.wish_stop);
58845+ left = from->node;
58846+ right = to;
58847+ }
58848+
58849+ if (result) {
58850+ /* move insertion coord even if there is nothing to move */
58851+ if (including_stop_coord) {
58852+ /* move insertion coord (@from) */
58853+ if (pend == SHIFT_LEFT) {
58854+ /* after last item in target node */
58855+ coord_init_after_last_item(from, to);
58856+ } else {
58857+ /* before first item in target node */
58858+ coord_init_before_first_item(from, to);
58859+ }
58860+ }
58861+
58862+ if (delete_child && node_is_empty(shift.wish_stop.node))
58863+ result =
58864+ prepare_removal_node40(shift.wish_stop.node, info);
58865+ else
58866+ result = 0;
58867+ /* there is nothing to shift */
58868+ assert("nikita-2078", coord_check(from));
58869+ return result;
58870+ }
58871+
58872+ target_empty = node_is_empty(to);
58873+
58874+ /* when first node plugin with item body compression is implemented,
58875+ this must be changed to call node specific plugin */
58876+
58877+ /* shift->stop_coord is updated to last unit which really will be
58878+ shifted */
58879+ estimate_shift(&shift, get_current_context());
58880+ if (!shift.shift_bytes) {
58881+ /* we could not shift anything */
58882+ assert("nikita-2079", coord_check(from));
58883+ return 0;
58884+ }
58885+
58886+ copy(&shift);
58887+
58888+ /* result value of this is important. It is used by adjust_coord below */
58889+ result = delete_copied(&shift);
58890+
58891+ assert("vs-1610", result >= 0);
58892+ assert("vs-1471",
58893+ ((reiser4_context *) current->journal_info)->magic ==
58894+ context_magic);
58895+
58896+ /* item which has been moved from one node to another might want to do
58897+ something on that event. This can be done by item's shift_hook
58898+ method, which will be now called for every moved items */
58899+ call_shift_hooks(&shift);
58900+
58901+ assert("vs-1472",
58902+ ((reiser4_context *) current->journal_info)->magic ==
58903+ context_magic);
58904+
58905+ update_taps(&shift);
58906+
58907+ assert("vs-1473",
58908+ ((reiser4_context *) current->journal_info)->magic ==
58909+ context_magic);
58910+
58911+ /* adjust @from pointer in accordance with @including_stop_coord flag
58912+ and amount of data which was really shifted */
58913+ adjust_coord(from, &shift, result, including_stop_coord);
58914+
58915+ if (target_empty)
58916+ /*
58917+ * items were shifted into empty node. Update delimiting key.
58918+ */
58919+ result = prepare_for_update(NULL, left, info);
58920+
58921+ /* add update operation to @info, which is the list of operations to
58922+ be performed on a higher level */
58923+ result = prepare_for_update(left, right, info);
58924+ if (!result && node_is_empty(source) && delete_child) {
58925+ /* all contents of @from->node is moved to @to and @from->node
58926+ has to be removed from the tree, so, on higher level we
58927+ will be removing the pointer to node @from->node */
58928+ result = prepare_removal_node40(source, info);
58929+ }
58930+ assert("nikita-2080", coord_check(from));
58931+ return result ? result : (int)shift.shift_bytes;
58932+}
58933+
58934+/* plugin->u.node.fast_insert()
58935+ look for description of this method in plugin/node/node.h */
58936+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58937+{
58938+ return 1;
58939+}
58940+
58941+/* plugin->u.node.fast_paste()
58942+ look for description of this method in plugin/node/node.h */
58943+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58944+{
58945+ return 1;
58946+}
58947+
58948+/* plugin->u.node.fast_cut()
58949+ look for description of this method in plugin/node/node.h */
58950+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58951+{
58952+ return 1;
58953+}
58954+
58955+/* plugin->u.node.modify - not defined */
58956+
58957+/* plugin->u.node.max_item_size */
58958+int max_item_size_node40(void)
58959+{
58960+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
58961+ sizeof(item_header40);
58962+}
58963+
58964+/* plugin->u.node.set_item_plugin */
58965+int set_item_plugin_node40(coord_t *coord, item_id id)
58966+{
58967+ item_header40 *ih;
58968+
58969+ ih = node40_ih_at_coord(coord);
58970+ put_unaligned(cpu_to_le16(id), &ih->plugin_id);
58971+ coord->iplugid = id;
58972+ return 0;
58973+}
58974+
58975+/*
58976+ Local variables:
58977+ c-indentation-style: "K&R"
58978+ mode-name: "LC"
58979+ c-basic-offset: 8
58980+ tab-width: 8
58981+ fill-column: 120
58982+ scroll-step: 1
58983+ End:
58984+*/
58985diff --git a/fs/reiser4/plugin/node/node40.h b/fs/reiser4/plugin/node/node40.h
58986new file mode 100644
58987index 0000000..8ae375b
58988--- /dev/null
58989+++ b/fs/reiser4/plugin/node/node40.h
58990@@ -0,0 +1,125 @@
58991+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58992+
58993+#if !defined( __REISER4_NODE40_H__ )
58994+#define __REISER4_NODE40_H__
58995+
58996+#include "../../forward.h"
58997+#include "../../dformat.h"
58998+#include "node.h"
58999+
59000+#include <linux/types.h>
59001+
59002+/* format of node header for 40 node layouts. Keep bloat out of this struct. */
59003+typedef struct node40_header {
59004+ /* identifier of node plugin. Must be located at the very beginning
59005+ of a node. */
59006+ common_node_header common_header; /* this is 16 bits */
59007+ /* number of items. Should be first element in the node header,
59008+ because we haven't yet finally decided whether it shouldn't go into
59009+ common_header.
59010+ */
59011+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
59012+ * node format at compile time, and it is this one, accesses do not function dereference when
59013+ * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
59014+ d16 nr_items;
59015+ /* free space in node measured in bytes */
59016+ d16 free_space;
59017+ /* offset to start of free space in node */
59018+ d16 free_space_start;
59019+ /* for reiser4_fsck. When information about what is a free
59020+ block is corrupted, and we try to recover everything even
59021+ if marked as freed, then old versions of data may
59022+ duplicate newer versions, and this field allows us to
59023+ restore the newer version. Also useful for when users
59024+ who don't have the new trashcan installed on their linux distro
59025+ delete the wrong files and send us desperate emails
59026+ offering $25 for them back. */
59027+
59028+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
59029+ d32 magic;
59030+ /* flushstamp is made of mk_id and write_counter. mk_id is an
59031+ id generated randomly at mkreiserfs time. So we can just
59032+ skip all nodes with different mk_id. write_counter is d64
59033+ incrementing counter of writes on disk. It is used for
59034+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
59035+
59036+ d32 mkfs_id;
59037+ d64 flush_id;
59038+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
59039+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
59040+ d16 flags;
59041+
59042+ /* 1 is leaf level, 2 is twig level, root is the numerically
59043+ largest level */
59044+ d8 level;
59045+
59046+ d8 pad;
59047+} PACKED node40_header;
59048+
59049+/* item headers are not standard across all node layouts, pass
59050+ pos_in_node to functions instead */
59051+typedef struct item_header40 {
59052+ /* key of item */
59053+ /* 0 */ reiser4_key key;
59054+ /* offset from start of a node measured in 8-byte chunks */
59055+ /* 24 */ d16 offset;
59056+ /* 26 */ d16 flags;
59057+ /* 28 */ d16 plugin_id;
59058+} PACKED item_header40;
59059+
59060+size_t item_overhead_node40(const znode * node, flow_t * aflow);
59061+size_t free_space_node40(znode * node);
59062+node_search_result lookup_node40(znode * node, const reiser4_key * key,
59063+ lookup_bias bias, coord_t * coord);
59064+int num_of_items_node40(const znode * node);
59065+char *item_by_coord_node40(const coord_t * coord);
59066+int length_by_coord_node40(const coord_t * coord);
59067+item_plugin *plugin_by_coord_node40(const coord_t * coord);
59068+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
59069+size_t estimate_node40(znode * node);
59070+int check_node40(const znode * node, __u32 flags, const char **error);
59071+int parse_node40(znode * node);
59072+int init_node40(znode * node);
59073+#ifdef GUESS_EXISTS
59074+int guess_node40(const znode * node);
59075+#endif
59076+void change_item_size_node40(coord_t * coord, int by);
59077+int create_item_node40(coord_t * target, const reiser4_key * key,
59078+ reiser4_item_data * data, carry_plugin_info * info);
59079+void update_item_key_node40(coord_t * target, const reiser4_key * key,
59080+ carry_plugin_info * info);
59081+int kill_node40(struct carry_kill_data *, carry_plugin_info *);
59082+int cut_node40(struct carry_cut_data *, carry_plugin_info *);
59083+int shift_node40(coord_t * from, znode * to, shift_direction pend,
59084+ /* if @from->node becomes
59085+ empty - it will be deleted from
59086+ the tree if this is set to 1
59087+ */
59088+ int delete_child, int including_stop_coord,
59089+ carry_plugin_info * info);
59090+
59091+int fast_insert_node40(const coord_t * coord);
59092+int fast_paste_node40(const coord_t * coord);
59093+int fast_cut_node40(const coord_t * coord);
59094+int max_item_size_node40(void);
59095+int prepare_removal_node40(znode * empty, carry_plugin_info * info);
59096+int set_item_plugin_node40(coord_t * coord, item_id id);
59097+int shrink_item_node40(coord_t * coord, int delta);
59098+
59099+#if REISER4_DEBUG
59100+void *shift_check_prepare(const znode *left, const znode *right);
59101+void shift_check(void *vp, const znode *left, const znode *right);
59102+#endif
59103+
59104+/* __REISER4_NODE40_H__ */
59105+#endif
59106+/*
59107+ Local variables:
59108+ c-indentation-style: "K&R"
59109+ mode-name: "LC"
59110+ c-basic-offset: 8
59111+ tab-width: 8
59112+ fill-column: 120
59113+ scroll-step: 1
59114+ End:
59115+*/
59116diff --git a/fs/reiser4/plugin/object.c b/fs/reiser4/plugin/object.c
59117new file mode 100644
59118index 0000000..ae999e3
59119--- /dev/null
59120+++ b/fs/reiser4/plugin/object.c
59121@@ -0,0 +1,516 @@
59122+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59123+ * reiser4/README */
59124+
59125+/*
59126+ * Examples of object plugins: file, directory, symlink, special file.
59127+ *
59128+ * Plugins associated with inode:
59129+ *
59130+ * Plugin of inode is plugin referenced by plugin-id field of on-disk
59131+ * stat-data. How we store this plugin in in-core inode is not
59132+ * important. Currently pointers are used, another variant is to store offsets
59133+ * and do array lookup on each access.
59134+ *
59135+ * Now, each inode has one selected plugin: object plugin that
59136+ * determines what type of file this object is: directory, regular etc.
59137+ *
59138+ * This main plugin can use other plugins that are thus subordinated to
59139+ * it. Directory instance of object plugin uses hash; regular file
59140+ * instance uses tail policy plugin.
59141+ *
59142+ * Object plugin is either taken from id in stat-data or guessed from
59143+ * i_mode bits. Once it is established we ask it to install its
59144+ * subordinate plugins, by looking again in stat-data or inheriting them
59145+ * from parent.
59146+ *
59147+ * How new inode is initialized during ->read_inode():
59148+ * 1 read stat-data and initialize inode fields: i_size, i_mode,
59149+ * i_generation, capabilities etc.
59150+ * 2 read plugin id from stat data or try to guess plugin id
59151+ * from inode->i_mode bits if plugin id is missing.
59152+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
59153+ *
59154+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
59155+ * if stat data does contain i_size, etc., due to it being an unusual plugin?
59156+ *
59157+ * 4 Call ->activate() method of object's plugin. Plugin is either read from
59158+ * from stat-data or guessed from mode bits
59159+ * 5 Call ->inherit() method of object plugin to inherit as yet un initialized
59160+ * plugins from parent.
59161+ *
59162+ * Easy induction proves that on last step all plugins of inode would be
59163+ * initialized.
59164+ *
59165+ * When creating new object:
59166+ * 1 obtain object plugin id (see next period)
59167+ * NIKITA-FIXME-HANS: period?
59168+ * 2 ->install() this plugin
59169+ * 3 ->inherit() the rest from the parent
59170+ *
59171+ * We need some examples of creating an object with default and non-default
59172+ * plugin ids. Nikita, please create them.
59173+ */
59174+
59175+#include "../inode.h"
59176+
59177+static int _bugop(void)
59178+{
59179+ BUG_ON(1);
59180+ return 0;
59181+}
59182+
59183+#define bugop ((void *)_bugop)
59184+
59185+static int _dummyop(void)
59186+{
59187+ return 0;
59188+}
59189+
59190+#define dummyop ((void *)_dummyop)
59191+
59192+static int change_file(struct inode *inode,
59193+ reiser4_plugin * plugin,
59194+ pset_member memb)
59195+{
59196+ /* cannot change object plugin of already existing object */
59197+ if (memb == PSET_FILE)
59198+ return RETERR(-EINVAL);
59199+
59200+ /* Change PSET_CREATE */
59201+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
59202+}
59203+
59204+static reiser4_plugin_ops file_plugin_ops = {
59205+ .change = change_file
59206+};
59207+
59208+/*
59209+ * Definitions of object plugins.
59210+ */
59211+
59212+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
59213+ [UNIX_FILE_PLUGIN_ID] = {
59214+ .h = {
59215+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59216+ .id = UNIX_FILE_PLUGIN_ID,
59217+ .groups = (1 << REISER4_REGULAR_FILE),
59218+ .pops = &file_plugin_ops,
59219+ .label = "reg",
59220+ .desc = "regular file",
59221+ .linkage = {NULL, NULL},
59222+ },
59223+ .inode_ops = {
59224+ .permission = reiser4_permission_common,
59225+ .setattr = setattr_unix_file,
59226+ .getattr = reiser4_getattr_common
59227+ },
59228+ .file_ops = {
59229+ .llseek = generic_file_llseek,
59230+ .read = read_unix_file,
59231+ .write = write_unix_file,
59232+ .aio_read = generic_file_aio_read,
59233+ .ioctl = ioctl_unix_file,
59234+ .mmap = mmap_unix_file,
59235+ .open = open_unix_file,
59236+ .release = release_unix_file,
59237+ .fsync = sync_unix_file,
59238+ .sendfile = sendfile_unix_file
59239+ },
59240+ .as_ops = {
59241+ .writepage = reiser4_writepage,
59242+ .readpage = readpage_unix_file,
59243+ .sync_page = block_sync_page,
59244+ .writepages = writepages_unix_file,
59245+ .set_page_dirty = reiser4_set_page_dirty,
59246+ .readpages = readpages_unix_file,
59247+ .prepare_write = prepare_write_unix_file,
59248+ .commit_write = commit_write_unix_file,
59249+ .bmap = bmap_unix_file,
59250+ .invalidatepage = reiser4_invalidatepage,
59251+ .releasepage = reiser4_releasepage
59252+ },
59253+ .write_sd_by_inode = write_sd_by_inode_common,
59254+ .flow_by_inode = flow_by_inode_unix_file,
59255+ .key_by_inode = key_by_inode_and_offset_common,
59256+ .set_plug_in_inode = set_plug_in_inode_common,
59257+ .adjust_to_parent = adjust_to_parent_common,
59258+ .create_object = reiser4_create_object_common,
59259+ .delete_object = delete_object_unix_file,
59260+ .add_link = reiser4_add_link_common,
59261+ .rem_link = reiser4_rem_link_common,
59262+ .owns_item = owns_item_unix_file,
59263+ .can_add_link = can_add_link_common,
59264+ .detach = dummyop,
59265+ .bind = dummyop,
59266+ .safelink = safelink_common,
59267+ .estimate = {
59268+ .create = estimate_create_common,
59269+ .update = estimate_update_common,
59270+ .unlink = estimate_unlink_common
59271+ },
59272+ .init_inode_data = init_inode_data_unix_file,
59273+ .cut_tree_worker = cut_tree_worker_common,
59274+ .wire = {
59275+ .write = wire_write_common,
59276+ .read = wire_read_common,
59277+ .get = wire_get_common,
59278+ .size = wire_size_common,
59279+ .done = wire_done_common
59280+ }
59281+ },
59282+ [DIRECTORY_FILE_PLUGIN_ID] = {
59283+ .h = {
59284+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59285+ .id = DIRECTORY_FILE_PLUGIN_ID,
59286+ .groups = (1 << REISER4_DIRECTORY_FILE),
59287+ .pops = &file_plugin_ops,
59288+ .label = "dir",
59289+ .desc = "directory",
59290+ .linkage = {NULL, NULL}
59291+ },
59292+ .inode_ops = {.create = NULL},
59293+ .file_ops = {.owner = NULL},
59294+ .as_ops = {.writepage = NULL},
59295+
59296+ .write_sd_by_inode = write_sd_by_inode_common,
59297+ .flow_by_inode = bugop,
59298+ .key_by_inode = bugop,
59299+ .set_plug_in_inode = set_plug_in_inode_common,
59300+ .adjust_to_parent = adjust_to_parent_common_dir,
59301+ .create_object = reiser4_create_object_common,
59302+ .delete_object = reiser4_delete_dir_common,
59303+ .add_link = reiser4_add_link_common,
59304+ .rem_link = rem_link_common_dir,
59305+ .owns_item = owns_item_common_dir,
59306+ .can_add_link = can_add_link_common,
59307+ .can_rem_link = can_rem_link_common_dir,
59308+ .detach = reiser4_detach_common_dir,
59309+ .bind = reiser4_bind_common_dir,
59310+ .safelink = safelink_common,
59311+ .estimate = {
59312+ .create = estimate_create_common_dir,
59313+ .update = estimate_update_common,
59314+ .unlink = estimate_unlink_common_dir
59315+ },
59316+ .wire = {
59317+ .write = wire_write_common,
59318+ .read = wire_read_common,
59319+ .get = wire_get_common,
59320+ .size = wire_size_common,
59321+ .done = wire_done_common
59322+ },
59323+ .init_inode_data = init_inode_ordering,
59324+ .cut_tree_worker = cut_tree_worker_common,
59325+ },
59326+ [SYMLINK_FILE_PLUGIN_ID] = {
59327+ .h = {
59328+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59329+ .id = SYMLINK_FILE_PLUGIN_ID,
59330+ .groups = (1 << REISER4_SYMLINK_FILE),
59331+ .pops = &file_plugin_ops,
59332+ .label = "symlink",
59333+ .desc = "symbolic link",
59334+ .linkage = {NULL,NULL}
59335+ },
59336+ .inode_ops = {
59337+ .readlink = generic_readlink,
59338+ .follow_link = reiser4_follow_link_common,
59339+ .permission = reiser4_permission_common,
59340+ .setattr = reiser4_setattr_common,
59341+ .getattr = reiser4_getattr_common
59342+ },
59343+ /* inode->i_fop of symlink is initialized by NULL in setup_inode_ops */
59344+ .file_ops = {.owner = NULL},
59345+ .as_ops = {.writepage = NULL},
59346+
59347+ .write_sd_by_inode = write_sd_by_inode_common,
59348+ .set_plug_in_inode = set_plug_in_inode_common,
59349+ .adjust_to_parent = adjust_to_parent_common,
59350+ .create_object = reiser4_create_symlink,
59351+ .delete_object = reiser4_delete_object_common,
59352+ .add_link = reiser4_add_link_common,
59353+ .rem_link = reiser4_rem_link_common,
59354+ .can_add_link = can_add_link_common,
59355+ .detach = dummyop,
59356+ .bind = dummyop,
59357+ .safelink = safelink_common,
59358+ .estimate = {
59359+ .create = estimate_create_common,
59360+ .update = estimate_update_common,
59361+ .unlink = estimate_unlink_common
59362+ },
59363+ .init_inode_data = init_inode_ordering,
59364+ .cut_tree_worker = cut_tree_worker_common,
59365+ .destroy_inode = destroy_inode_symlink,
59366+ .wire = {
59367+ .write = wire_write_common,
59368+ .read = wire_read_common,
59369+ .get = wire_get_common,
59370+ .size = wire_size_common,
59371+ .done = wire_done_common
59372+ }
59373+ },
59374+ [SPECIAL_FILE_PLUGIN_ID] = {
59375+ .h = {
59376+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59377+ .id = SPECIAL_FILE_PLUGIN_ID,
59378+ .groups = (1 << REISER4_SPECIAL_FILE),
59379+ .pops = &file_plugin_ops,
59380+ .label = "special",
59381+ .desc =
59382+ "special: fifo, device or socket",
59383+ .linkage = {NULL, NULL}
59384+ },
59385+ .inode_ops = {
59386+ .permission = reiser4_permission_common,
59387+ .setattr = reiser4_setattr_common,
59388+ .getattr = reiser4_getattr_common
59389+ },
59390+ /* file_ops of special files (sockets, block, char, fifo) are
59391+ initialized by init_special_inode. */
59392+ .file_ops = {.owner = NULL},
59393+ .as_ops = {.writepage = NULL},
59394+
59395+ .write_sd_by_inode = write_sd_by_inode_common,
59396+ .set_plug_in_inode = set_plug_in_inode_common,
59397+ .adjust_to_parent = adjust_to_parent_common,
59398+ .create_object = reiser4_create_object_common,
59399+ .delete_object = reiser4_delete_object_common,
59400+ .add_link = reiser4_add_link_common,
59401+ .rem_link = reiser4_rem_link_common,
59402+ .owns_item = owns_item_common,
59403+ .can_add_link = can_add_link_common,
59404+ .detach = dummyop,
59405+ .bind = dummyop,
59406+ .safelink = safelink_common,
59407+ .estimate = {
59408+ .create = estimate_create_common,
59409+ .update = estimate_update_common,
59410+ .unlink = estimate_unlink_common
59411+ },
59412+ .init_inode_data = init_inode_ordering,
59413+ .cut_tree_worker = cut_tree_worker_common,
59414+ .wire = {
59415+ .write = wire_write_common,
59416+ .read = wire_read_common,
59417+ .get = wire_get_common,
59418+ .size = wire_size_common,
59419+ .done = wire_done_common
59420+ }
59421+ },
59422+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
59423+ .h = {
59424+ .type_id = REISER4_FILE_PLUGIN_TYPE,
59425+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
59426+ .groups = (1 << REISER4_REGULAR_FILE),
59427+ .pops = &file_plugin_ops,
59428+ .label = "cryptcompress",
59429+ .desc = "cryptcompress file",
59430+ .linkage = {NULL, NULL}
59431+ },
59432+ .inode_ops = {
59433+ .permission = reiser4_permission_common,
59434+ .setattr = prot_setattr_cryptcompress,
59435+ .getattr = reiser4_getattr_common
59436+ },
59437+ .file_ops = {
59438+ .llseek = generic_file_llseek,
59439+ .read = prot_read_cryptcompress,
59440+ .write = prot_write_cryptcompress,
59441+ .aio_read = generic_file_aio_read,
59442+ .mmap = prot_mmap_cryptcompress,
59443+ .release = prot_release_cryptcompress,
59444+ .fsync = reiser4_sync_common,
59445+ .sendfile = prot_sendfile_cryptcompress
59446+ },
59447+ .as_ops = {
59448+ .writepage = reiser4_writepage,
59449+ .readpage = readpage_cryptcompress,
59450+ .sync_page = block_sync_page,
59451+ .writepages = writepages_cryptcompress,
59452+ .set_page_dirty = reiser4_set_page_dirty,
59453+ .readpages = readpages_cryptcompress,
59454+ .prepare_write = prepare_write_common,
59455+ .invalidatepage = reiser4_invalidatepage,
59456+ .releasepage = reiser4_releasepage
59457+ },
59458+ .write_sd_by_inode = write_sd_by_inode_common,
59459+ .flow_by_inode = flow_by_inode_cryptcompress,
59460+ .key_by_inode = key_by_inode_cryptcompress,
59461+ .set_plug_in_inode = set_plug_in_inode_common,
59462+ .adjust_to_parent = adjust_to_parent_cryptcompress,
59463+ .create_object = create_cryptcompress,
59464+ .open_object = open_object_cryptcompress,
59465+ .delete_object = delete_object_cryptcompress,
59466+ .add_link = reiser4_add_link_common,
59467+ .rem_link = reiser4_rem_link_common,
59468+ .owns_item = owns_item_common,
59469+ .can_add_link = can_add_link_common,
59470+ .detach = dummyop,
59471+ .bind = dummyop,
59472+ .safelink = safelink_common,
59473+ .estimate = {
59474+ .create = estimate_create_common,
59475+ .update = estimate_update_common,
59476+ .unlink = estimate_unlink_common
59477+ },
59478+ .init_inode_data = init_inode_data_cryptcompress,
59479+ .cut_tree_worker = cut_tree_worker_cryptcompress,
59480+ .destroy_inode = destroy_inode_cryptcompress,
59481+ .wire = {
59482+ .write = wire_write_common,
59483+ .read = wire_read_common,
59484+ .get = wire_get_common,
59485+ .size = wire_size_common,
59486+ .done = wire_done_common
59487+ }
59488+ }
59489+};
59490+
59491+static int change_dir(struct inode *inode,
59492+ reiser4_plugin * plugin,
59493+ pset_member memb)
59494+{
59495+ /* cannot change dir plugin of already existing object */
59496+ return RETERR(-EINVAL);
59497+}
59498+
59499+static reiser4_plugin_ops dir_plugin_ops = {
59500+ .change = change_dir
59501+};
59502+
59503+/*
59504+ * definition of directory plugins
59505+ */
59506+
59507+dir_plugin dir_plugins[LAST_DIR_ID] = {
59508+ /* standard hashed directory plugin */
59509+ [HASHED_DIR_PLUGIN_ID] = {
59510+ .h = {
59511+ .type_id = REISER4_DIR_PLUGIN_TYPE,
59512+ .id = HASHED_DIR_PLUGIN_ID,
59513+ .pops = &dir_plugin_ops,
59514+ .label = "dir",
59515+ .desc = "hashed directory",
59516+ .linkage = {NULL, NULL}
59517+ },
59518+ .inode_ops = {
59519+ .create = reiser4_create_common,
59520+ .lookup = reiser4_lookup_common,
59521+ .link = reiser4_link_common,
59522+ .unlink = reiser4_unlink_common,
59523+ .symlink = reiser4_symlink_common,
59524+ .mkdir = reiser4_mkdir_common,
59525+ .rmdir = reiser4_unlink_common,
59526+ .mknod = reiser4_mknod_common,
59527+ .rename = reiser4_rename_common,
59528+ .permission = reiser4_permission_common,
59529+ .setattr = reiser4_setattr_common,
59530+ .getattr = reiser4_getattr_common
59531+ },
59532+ .file_ops = {
59533+ .llseek = reiser4_llseek_dir_common,
59534+ .read = generic_read_dir,
59535+ .readdir = reiser4_readdir_common,
59536+ .release = reiser4_release_dir_common,
59537+ .fsync = reiser4_sync_common
59538+ },
59539+ .as_ops = {
59540+ .writepage = bugop,
59541+ .sync_page = bugop,
59542+ .writepages = dummyop,
59543+ .set_page_dirty = bugop,
59544+ .readpages = bugop,
59545+ .prepare_write = bugop,
59546+ .commit_write = bugop,
59547+ .bmap = bugop,
59548+ .invalidatepage = bugop,
59549+ .releasepage = bugop
59550+ },
59551+ .get_parent = get_parent_common,
59552+ .is_name_acceptable = is_name_acceptable_common,
59553+ .build_entry_key = build_entry_key_hashed,
59554+ .build_readdir_key = build_readdir_key_common,
59555+ .add_entry = reiser4_add_entry_common,
59556+ .rem_entry = reiser4_rem_entry_common,
59557+ .init = reiser4_dir_init_common,
59558+ .done = reiser4_dir_done_common,
59559+ .attach = reiser4_attach_common,
59560+ .detach = reiser4_detach_common,
59561+ .estimate = {
59562+ .add_entry = estimate_add_entry_common,
59563+ .rem_entry = estimate_rem_entry_common,
59564+ .unlink = dir_estimate_unlink_common
59565+ }
59566+ },
59567+ /* hashed directory for which seekdir/telldir are guaranteed to
59568+ * work. Brain-damage. */
59569+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
59570+ .h = {
59571+ .type_id = REISER4_DIR_PLUGIN_TYPE,
59572+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
59573+ .pops = &dir_plugin_ops,
59574+ .label = "dir32",
59575+ .desc = "directory hashed with 31 bit hash",
59576+ .linkage = {NULL, NULL}
59577+ },
59578+ .inode_ops = {
59579+ .create = reiser4_create_common,
59580+ .lookup = reiser4_lookup_common,
59581+ .link = reiser4_link_common,
59582+ .unlink = reiser4_unlink_common,
59583+ .symlink = reiser4_symlink_common,
59584+ .mkdir = reiser4_mkdir_common,
59585+ .rmdir = reiser4_unlink_common,
59586+ .mknod = reiser4_mknod_common,
59587+ .rename = reiser4_rename_common,
59588+ .permission = reiser4_permission_common,
59589+ .setattr = reiser4_setattr_common,
59590+ .getattr = reiser4_getattr_common
59591+ },
59592+ .file_ops = {
59593+ .llseek = reiser4_llseek_dir_common,
59594+ .read = generic_read_dir,
59595+ .readdir = reiser4_readdir_common,
59596+ .release = reiser4_release_dir_common,
59597+ .fsync = reiser4_sync_common
59598+ },
59599+ .as_ops = {
59600+ .writepage = bugop,
59601+ .sync_page = bugop,
59602+ .writepages = dummyop,
59603+ .set_page_dirty = bugop,
59604+ .readpages = bugop,
59605+ .prepare_write = bugop,
59606+ .commit_write = bugop,
59607+ .bmap = bugop,
59608+ .invalidatepage = bugop,
59609+ .releasepage = bugop
59610+ },
59611+ .get_parent = get_parent_common,
59612+ .is_name_acceptable = is_name_acceptable_common,
59613+ .build_entry_key = build_entry_key_seekable,
59614+ .build_readdir_key = build_readdir_key_common,
59615+ .add_entry = reiser4_add_entry_common,
59616+ .rem_entry = reiser4_rem_entry_common,
59617+ .init = reiser4_dir_init_common,
59618+ .done = reiser4_dir_done_common,
59619+ .attach = reiser4_attach_common,
59620+ .detach = reiser4_detach_common,
59621+ .estimate = {
59622+ .add_entry = estimate_add_entry_common,
59623+ .rem_entry = estimate_rem_entry_common,
59624+ .unlink = dir_estimate_unlink_common
59625+ }
59626+ }
59627+};
59628+
59629+/* Make Linus happy.
59630+ Local variables:
59631+ c-indentation-style: "K&R"
59632+ mode-name: "LC"
59633+ c-basic-offset: 8
59634+ tab-width: 8
59635+ fill-column: 120
59636+ End:
59637+*/
59638diff --git a/fs/reiser4/plugin/object.h b/fs/reiser4/plugin/object.h
59639new file mode 100644
59640index 0000000..440c369
59641--- /dev/null
59642+++ b/fs/reiser4/plugin/object.h
59643@@ -0,0 +1,121 @@
59644+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
59645+ * reiser4/README */
59646+
59647+/* Declaration of object plugin functions. */
59648+
59649+#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
59650+#define __FS_REISER4_PLUGIN_OBJECT_H__
59651+
59652+#include "../type_safe_hash.h"
59653+
59654+/* common implementations of inode operations */
59655+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
59656+ int mode, struct nameidata *);
59657+struct dentry * reiser4_lookup_common(struct inode *parent,
59658+ struct dentry *dentry,
59659+ struct nameidata *nameidata);
59660+int reiser4_link_common(struct dentry *existing, struct inode *parent,
59661+ struct dentry *newname);
59662+int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
59663+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
59664+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
59665+ const char *linkname);
59666+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
59667+ int mode, dev_t rdev);
59668+int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
59669+ struct inode *new_dir, struct dentry *new_name);
59670+void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
59671+int reiser4_permission_common(struct inode *, int mask,
59672+ struct nameidata *nameidata);
59673+int reiser4_setattr_common(struct dentry *, struct iattr *);
59674+int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
59675+ struct kstat *);
59676+
59677+/* common implementations of file operations */
59678+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
59679+int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
59680+int reiser4_release_dir_common(struct inode *, struct file *);
59681+int reiser4_sync_common(struct file *, struct dentry *, int datasync);
59682+
59683+/* common implementations of address space operations */
59684+int prepare_write_common(struct file *, struct page *, unsigned from,
59685+ unsigned to);
59686+
59687+/* file plugin operations: common implementations */
59688+int write_sd_by_inode_common(struct inode *);
59689+int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
59690+int set_plug_in_inode_common(struct inode *object, struct inode *parent,
59691+ reiser4_object_create_data *);
59692+int adjust_to_parent_common(struct inode *object, struct inode *parent,
59693+ struct inode *root);
59694+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
59695+ struct inode *root);
59696+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
59697+ struct inode *root);
59698+int reiser4_create_object_common(struct inode *object, struct inode *parent,
59699+ reiser4_object_create_data *);
59700+int reiser4_delete_object_common(struct inode *);
59701+int reiser4_delete_dir_common(struct inode *);
59702+int reiser4_add_link_common(struct inode *object, struct inode *parent);
59703+int reiser4_rem_link_common(struct inode *object, struct inode *parent);
59704+int rem_link_common_dir(struct inode *object, struct inode *parent);
59705+int owns_item_common(const struct inode *, const coord_t *);
59706+int owns_item_common_dir(const struct inode *, const coord_t *);
59707+int can_add_link_common(const struct inode *);
59708+int can_rem_link_common_dir(const struct inode *);
59709+int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
59710+int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
59711+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
59712+reiser4_block_nr estimate_create_common(const struct inode *);
59713+reiser4_block_nr estimate_create_common_dir(const struct inode *);
59714+reiser4_block_nr estimate_update_common(const struct inode *);
59715+reiser4_block_nr estimate_unlink_common(const struct inode *,
59716+ const struct inode *);
59717+reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
59718+ const struct inode *);
59719+char *wire_write_common(struct inode *, char *start);
59720+char *wire_read_common(char *addr, reiser4_object_on_wire *);
59721+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
59722+int wire_size_common(struct inode *);
59723+void wire_done_common(reiser4_object_on_wire *);
59724+
59725+/* dir plugin operations: common implementations */
59726+struct dentry *get_parent_common(struct inode *child);
59727+int is_name_acceptable_common(const struct inode *, const char *name, int len);
59728+void build_entry_key_common(const struct inode *,
59729+ const struct qstr *qname, reiser4_key *);
59730+int build_readdir_key_common(struct file *dir, reiser4_key *);
59731+int reiser4_add_entry_common(struct inode *object, struct dentry *where,
59732+ reiser4_object_create_data *, reiser4_dir_entry_desc *);
59733+int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
59734+ reiser4_dir_entry_desc *);
59735+int reiser4_dir_init_common(struct inode *object, struct inode *parent,
59736+ reiser4_object_create_data *);
59737+int reiser4_dir_done_common(struct inode *);
59738+int reiser4_attach_common(struct inode *child, struct inode *parent);
59739+int reiser4_detach_common(struct inode *object, struct inode *parent);
59740+reiser4_block_nr estimate_add_entry_common(const struct inode *);
59741+reiser4_block_nr estimate_rem_entry_common(const struct inode *);
59742+reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
59743+ const struct inode *);
59744+
59745+/* these are essential parts of common implementations, they are to make
59746+ customized implementations easier */
59747+int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
59748+
59749+/* merely useful functions */
59750+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
59751+ const reiser4_key *, int silent);
59752+
59753+/* __FS_REISER4_PLUGIN_OBJECT_H__ */
59754+#endif
59755+
59756+/* Make Linus happy.
59757+ Local variables:
59758+ c-indentation-style: "K&R"
59759+ mode-name: "LC"
59760+ c-basic-offset: 8
59761+ tab-width: 8
59762+ fill-column: 120
59763+ End:
59764+*/
59765diff --git a/fs/reiser4/plugin/plugin.c b/fs/reiser4/plugin/plugin.c
59766new file mode 100644
59767index 0000000..8261878
59768--- /dev/null
59769+++ b/fs/reiser4/plugin/plugin.c
59770@@ -0,0 +1,578 @@
59771+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59772+ * reiser4/README */
59773+
59774+/* Basic plugin infrastructure, lookup etc. */
59775+
59776+/* PLUGINS:
59777+
59778+ Plugins are internal Reiser4 "modules" or "objects" used to increase
59779+ extensibility and allow external users to easily adapt reiser4 to
59780+ their needs.
59781+
59782+ Plugins are classified into several disjoint "types". Plugins
59783+ belonging to the particular plugin type are termed "instances" of
59784+ this type. Currently the following types are present:
59785+
59786+ . object plugin
59787+ . hash plugin
59788+ . tail plugin
59789+ . perm plugin
59790+ . item plugin
59791+ . node layout plugin
59792+
59793+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59794+
59795+ Object (file) plugin determines how given file-system object serves
59796+ standard VFS requests for read, write, seek, mmap etc. Instances of
59797+ file plugins are: regular file, directory, symlink. Another example
59798+ of file plugin is audit plugin, that optionally records accesses to
59799+ underlying object and forwards requests to it.
59800+
59801+ Hash plugins compute hashes used by reiser4 to store and locate
59802+ files within directories. Instances of hash plugin type are: r5,
59803+ tea, rupasov.
59804+
59805+ Tail plugins (or, more precisely, tail policy plugins) determine
59806+ when last part of the file should be stored in a formatted item.
59807+
59808+ Perm plugins control permissions granted for a process accessing a file.
59809+
59810+ Scope and lookup:
59811+
59812+ label such that pair ( type_label, plugin_label ) is unique. This
59813+ pair is a globally persistent and user-visible plugin
59814+ identifier. Internally kernel maintains plugins and plugin types in
59815+ arrays using an index into those arrays as plugin and plugin type
59816+ identifiers. File-system in turn, also maintains persistent
59817+ "dictionary" which is mapping from plugin label to numerical
59818+ identifier which is stored in file-system objects. That is, we
59819+ store the offset into the plugin array for that plugin type as the
59820+ plugin id in the stat data of the filesystem object.
59821+
59822+ plugin_labels have meaning for the user interface that assigns
59823+ plugins to files, and may someday have meaning for dynamic loading of
59824+ plugins and for copying of plugins from one fs instance to
59825+ another by utilities like cp and tar.
59826+
59827+ Internal kernel plugin type identifier (index in plugins[] array) is
59828+ of type reiser4_plugin_type. Set of available plugin types is
59829+ currently static, but dynamic loading doesn't seem to pose
59830+ insurmountable problems.
59831+
59832+ Within each type plugins are addressed by the identifiers of type
59833+ reiser4_plugin_id (indices in
59834+ reiser4_plugin_type_data.builtin[]). Such identifiers are only
59835+ required to be unique within one type, not globally.
59836+
59837+ Thus, plugin in memory is uniquely identified by the pair (type_id,
59838+ id).
59839+
59840+ Usage:
59841+
59842+ There exists only one instance of each plugin instance, but this
59843+ single instance can be associated with many entities (file-system
59844+ objects, items, nodes, transactions, file-descriptors etc.). Entity
59845+ to which plugin of given type is termed (due to the lack of
59846+ imagination) "subject" of this plugin type and, by abuse of
59847+ terminology, subject of particular instance of this type to which
59848+ it's attached currently. For example, inode is subject of object
59849+ plugin type. Inode representing directory is subject of directory
59850+ plugin, hash plugin type and some particular instance of hash plugin
59851+ type. Inode, representing regular file is subject of "regular file"
59852+ plugin, tail-policy plugin type etc.
59853+
59854+ With each subject the plugin possibly stores some state. For example,
59855+ the state of a directory plugin (instance of object plugin type) is pointer
59856+ to hash plugin (if directories always use hashing that is). State of
59857+ audit plugin is file descriptor (struct file) of log file or some
59858+ magic value to do logging through printk().
59859+
59860+ Interface:
59861+
59862+ In addition to a scalar identifier, each plugin type and plugin
59863+ proper has a "label": short string and a "description"---longer
59864+ descriptive string. Labels and descriptions of plugin types are
59865+ hard-coded into plugins[] array, declared and defined in
59866+ plugin.c. Label and description of plugin are stored in .label and
59867+ .desc fields of reiser4_plugin_header respectively. It's possible to
59868+ locate plugin by the pair of labels.
59869+
59870+ Features:
59871+
59872+ . user-level plugin manipulations:
59873+ + reiser4("filename/..file_plugin<='audit'");
59874+ + write(open("filename/..file_plugin"), "audit", 8);
59875+
59876+ . user level utilities lsplug and chplug to manipulate plugins.
59877+ Utilities are not of primary priority. Possibly they will be not
59878+ working on v4.0
59879+
59880+NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
59881+
59882+ . mount option "plug" to set-up plugins of root-directory.
59883+ "plug=foo:bar" will set "bar" as default plugin of type "foo".
59884+
59885+ Limitations:
59886+
59887+ . each plugin type has to provide at least one builtin
59888+ plugin. This is technical limitation and it can be lifted in the
59889+ future.
59890+
59891+ TODO:
59892+
59893+ New plugin types/plugings:
59894+ Things we should be able to separately choose to inherit:
59895+
59896+ security plugins
59897+
59898+ stat data
59899+
59900+ file bodies
59901+
59902+ file plugins
59903+
59904+ dir plugins
59905+
59906+ . perm:acl
59907+
59908+ d audi---audit plugin intercepting and possibly logging all
59909+ accesses to object. Requires to put stub functions in file_operations
59910+ in stead of generic_file_*.
59911+
59912+NIKITA-FIXME-HANS: why make overflows a plugin?
59913+ . over---handle hash overflows
59914+
59915+ . sqnt---handle different access patterns and instruments read-ahead
59916+
59917+NIKITA-FIXME-HANS: describe the line below in more detail.
59918+
59919+ . hier---handle inheritance of plugins along file-system hierarchy
59920+
59921+ Different kinds of inheritance: on creation vs. on access.
59922+ Compatible/incompatible plugins.
59923+ Inheritance for multi-linked files.
59924+ Layered plugins.
59925+ Notion of plugin context is abandoned.
59926+
59927+Each file is associated
59928+ with one plugin and dependant plugins (hash, etc.) are stored as
59929+ main plugin state. Now, if we have plugins used for regular files
59930+ but not for directories, how such plugins would be inherited?
59931+ . always store them with directories also
59932+
59933+NIKTIA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below which is also useful.
59934+
59935+ . use inheritance hierarchy, independent of file-system namespace
59936+
59937+*/
59938+
59939+#include "../debug.h"
59940+#include "../dformat.h"
59941+#include "plugin_header.h"
59942+#include "item/static_stat.h"
59943+#include "node/node.h"
59944+#include "security/perm.h"
59945+#include "space/space_allocator.h"
59946+#include "disk_format/disk_format.h"
59947+#include "plugin.h"
59948+#include "../reiser4.h"
59949+#include "../jnode.h"
59950+#include "../inode.h"
59951+
59952+#include <linux/fs.h> /* for struct super_block */
59953+
59954+/* public interface */
59955+
59956+/* initialise plugin sub-system. Just call this once on reiser4 startup. */
59957+int init_plugins(void);
59958+int setup_plugins(struct super_block *super, reiser4_plugin ** area);
59959+int locate_plugin(struct inode *inode, plugin_locator * loc);
59960+
59961+/**
59962+ * init_plugins - initialize plugins
59963+ *
59964+ * Initializes plugin sub-system. It is part of reiser4 module
59965+ * initialization. For each plugin of each type init method is called and each
59966+ * plugin is put into list of plugins.
59967+ */
59968+int init_plugins(void)
59969+{
59970+ reiser4_plugin_type type_id;
59971+
59972+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59973+ reiser4_plugin_type_data *ptype;
59974+ int i;
59975+
59976+ ptype = &plugins[type_id];
59977+ assert("nikita-3508", ptype->label != NULL);
59978+ assert("nikita-3509", ptype->type_id == type_id);
59979+
59980+ INIT_LIST_HEAD(&ptype->plugins_list);
59981+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59982+ for (i = 0; i < ptype->builtin_num; ++i) {
59983+ reiser4_plugin *plugin;
59984+
59985+ plugin = plugin_at(ptype, i);
59986+
59987+ if (plugin->h.label == NULL)
59988+ /* uninitialized slot encountered */
59989+ continue;
59990+ assert("nikita-3445", plugin->h.type_id == type_id);
59991+ plugin->h.id = i;
59992+ if (plugin->h.pops != NULL &&
59993+ plugin->h.pops->init != NULL) {
59994+ int result;
59995+
59996+ result = plugin->h.pops->init(plugin);
59997+ if (result != 0)
59998+ return result;
59999+ }
60000+ INIT_LIST_HEAD(&plugin->h.linkage);
60001+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
60002+ }
60003+ }
60004+ return 0;
60005+}
60006+
60007+/* true if plugin type id is valid */
60008+int is_plugin_type_valid(reiser4_plugin_type type)
60009+{
60010+ /* "type" is unsigned, so no comparison with 0 is
60011+ necessary */
60012+ return (type < REISER4_PLUGIN_TYPES);
60013+}
60014+
60015+/* true if plugin id is valid */
60016+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
60017+{
60018+ assert("nikita-1653", is_plugin_type_valid(type));
60019+ return id < plugins[type].builtin_num;
60020+}
60021+
60022+/* return plugin by its @type and @id.
60023+
60024+ Both arguments are checked for validness: this is supposed to be called
60025+ from user-level.
60026+
60027+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
60028+user space, and passed to the filesystem by use of method files? Your
60029+comment really confused me on the first reading....
60030+
60031+*/
60032+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
60033+ * unchecked */,
60034+ reiser4_plugin_id id /* plugin id,
60035+ * unchecked */)
60036+{
60037+ if (is_plugin_type_valid(type)) {
60038+ if (is_plugin_id_valid(type, id))
60039+ return plugin_at(&plugins[type], id);
60040+ else
60041+ /* id out of bounds */
60042+ warning("nikita-2913",
60043+ "Invalid plugin id: [%i:%i]", type, id);
60044+ } else
60045+ /* type_id out of bounds */
60046+ warning("nikita-2914", "Invalid type_id: %i", type);
60047+ return NULL;
60048+}
60049+
60050+/**
60051+ * save_plugin_id - store plugin id in disk format
60052+ * @plugin: plugin to convert
60053+ * @area: where to store result
60054+ *
60055+ * Puts id of @plugin in little endian format to address @area.
60056+ */
60057+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
60058+ d16 *area /* where to store result */ )
60059+{
60060+ assert("nikita-1261", plugin != NULL);
60061+ assert("nikita-1262", area != NULL);
60062+
60063+ put_unaligned(cpu_to_le16(plugin->h.id), area);
60064+ return 0;
60065+}
60066+
60067+/* list of all plugins of given type */
60068+struct list_head *get_plugin_list(reiser4_plugin_type type)
60069+{
60070+ assert("nikita-1056", is_plugin_type_valid(type));
60071+ return &plugins[type].plugins_list;
60072+}
60073+
60074+static void update_pset_mask(reiser4_inode * info, pset_member memb)
60075+{
60076+ struct dentry *rootdir;
60077+ reiser4_inode *root;
60078+
60079+ assert("edward-1443", memb != PSET_FILE);
60080+
60081+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
60082+ if (rootdir != NULL) {
60083+ root = reiser4_inode_data(rootdir->d_inode);
60084+ /*
60085+ * if inode is different from the default one, or we are
60086+ * changing plugin of root directory, update plugin_mask
60087+ */
60088+ if (aset_get(info->pset, memb) !=
60089+ aset_get(root->pset, memb) ||
60090+ info == root)
60091+ info->plugin_mask |= (1 << memb);
60092+ else
60093+ info->plugin_mask &= ~(1 << memb);
60094+ }
60095+}
60096+
60097+/* Get specified plugin set member from parent,
60098+ or from fs-defaults (if no parent is given) and
60099+ install the result to pset of @self */
60100+int grab_plugin_pset(struct inode *self,
60101+ struct inode *ancestor,
60102+ pset_member memb)
60103+{
60104+ reiser4_plugin *plug;
60105+ reiser4_inode *info;
60106+ int result = 0;
60107+
60108+ /* Do not grab if initialised already. */
60109+ info = reiser4_inode_data(self);
60110+ if (aset_get(info->pset, memb) != NULL)
60111+ return 0;
60112+ if (ancestor) {
60113+ reiser4_inode *parent;
60114+
60115+ parent = reiser4_inode_data(ancestor);
60116+ plug = aset_get(parent->hset, memb) ? :
60117+ aset_get(parent->pset, memb);
60118+ }
60119+ else
60120+ plug = get_default_plugin(memb);
60121+
60122+ result = set_plugin(&info->pset, memb, plug);
60123+ if (result == 0) {
60124+ if (!ancestor || self->i_sb->s_root->d_inode != self)
60125+ update_pset_mask(info, memb);
60126+ }
60127+ return result;
60128+}
60129+
60130+/* Take missing pset members from root inode */
60131+int finish_pset(struct inode *inode)
60132+{
60133+ reiser4_plugin *plug;
60134+ reiser4_inode *root;
60135+ reiser4_inode *info;
60136+ pset_member memb;
60137+ int result = 0;
60138+
60139+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
60140+ info = reiser4_inode_data(inode);
60141+
60142+ assert("edward-1455", root != NULL);
60143+ assert("edward-1456", info != NULL);
60144+
60145+ /* file and directory plugins are already initialized. */
60146+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
60147+
60148+ /* Do not grab if initialised already. */
60149+ if (aset_get(info->pset, memb) != NULL)
60150+ continue;
60151+
60152+ plug = aset_get(root->pset, memb);
60153+ result = set_plugin(&info->pset, memb, plug);
60154+ if (result != 0)
60155+ break;
60156+ }
60157+ if (result != 0) {
60158+ warning("nikita-3447",
60159+ "Cannot set up plugins for %lli",
60160+ (unsigned long long)
60161+ get_inode_oid(inode));
60162+ }
60163+ return result;
60164+}
60165+
60166+int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
60167+{
60168+ reiser4_inode *info;
60169+ int result = 0;
60170+
60171+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
60172+ /* Changing pset in the root object. */
60173+ return RETERR(-EINVAL);
60174+ }
60175+
60176+ info = reiser4_inode_data(self);
60177+ if (plug->h.pops != NULL && plug->h.pops->change != NULL)
60178+ result = plug->h.pops->change(self, plug, memb);
60179+ else
60180+ result = aset_set_unsafe(&info->pset, memb, plug);
60181+ if (result == 0) {
60182+ __u16 oldmask = info->plugin_mask;
60183+
60184+ update_pset_mask(info, memb);
60185+ if (oldmask != info->plugin_mask)
60186+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
60187+ }
60188+ return result;
60189+}
60190+
60191+reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
60192+ /* C90 initializers */
60193+ [REISER4_FILE_PLUGIN_TYPE] = {
60194+ .type_id = REISER4_FILE_PLUGIN_TYPE,
60195+ .label = "file",
60196+ .desc = "Object plugins",
60197+ .builtin_num = sizeof_array(file_plugins),
60198+ .builtin = file_plugins,
60199+ .plugins_list = {NULL, NULL},
60200+ .size = sizeof(file_plugin)
60201+ },
60202+ [REISER4_DIR_PLUGIN_TYPE] = {
60203+ .type_id = REISER4_DIR_PLUGIN_TYPE,
60204+ .label = "dir",
60205+ .desc = "Directory plugins",
60206+ .builtin_num = sizeof_array(dir_plugins),
60207+ .builtin = dir_plugins,
60208+ .plugins_list = {NULL, NULL},
60209+ .size = sizeof(dir_plugin)
60210+ },
60211+ [REISER4_HASH_PLUGIN_TYPE] = {
60212+ .type_id = REISER4_HASH_PLUGIN_TYPE,
60213+ .label = "hash",
60214+ .desc = "Directory hashes",
60215+ .builtin_num = sizeof_array(hash_plugins),
60216+ .builtin = hash_plugins,
60217+ .plugins_list = {NULL, NULL},
60218+ .size = sizeof(hash_plugin)
60219+ },
60220+ [REISER4_FIBRATION_PLUGIN_TYPE] = {
60221+ .type_id =
60222+ REISER4_FIBRATION_PLUGIN_TYPE,
60223+ .label = "fibration",
60224+ .desc = "Directory fibrations",
60225+ .builtin_num = sizeof_array(fibration_plugins),
60226+ .builtin = fibration_plugins,
60227+ .plugins_list = {NULL, NULL},
60228+ .size = sizeof(fibration_plugin)
60229+ },
60230+ [REISER4_CIPHER_PLUGIN_TYPE] = {
60231+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
60232+ .label = "cipher",
60233+ .desc = "Cipher plugins",
60234+ .builtin_num = sizeof_array(cipher_plugins),
60235+ .builtin = cipher_plugins,
60236+ .plugins_list = {NULL, NULL},
60237+ .size = sizeof(cipher_plugin)
60238+ },
60239+ [REISER4_DIGEST_PLUGIN_TYPE] = {
60240+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
60241+ .label = "digest",
60242+ .desc = "Digest plugins",
60243+ .builtin_num = sizeof_array(digest_plugins),
60244+ .builtin = digest_plugins,
60245+ .plugins_list = {NULL, NULL},
60246+ .size = sizeof(digest_plugin)
60247+ },
60248+ [REISER4_COMPRESSION_PLUGIN_TYPE] = {
60249+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
60250+ .label = "compression",
60251+ .desc = "Compression plugins",
60252+ .builtin_num = sizeof_array(compression_plugins),
60253+ .builtin = compression_plugins,
60254+ .plugins_list = {NULL, NULL},
60255+ .size = sizeof(compression_plugin)
60256+ },
60257+ [REISER4_FORMATTING_PLUGIN_TYPE] = {
60258+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60259+ .label = "formatting",
60260+ .desc = "Tail inlining policies",
60261+ .builtin_num = sizeof_array(formatting_plugins),
60262+ .builtin = formatting_plugins,
60263+ .plugins_list = {NULL, NULL},
60264+ .size = sizeof(formatting_plugin)
60265+ },
60266+ [REISER4_PERM_PLUGIN_TYPE] = {
60267+ .type_id = REISER4_PERM_PLUGIN_TYPE,
60268+ .label = "perm",
60269+ .desc = "Permission checks",
60270+ .builtin_num = sizeof_array(perm_plugins),
60271+ .builtin = perm_plugins,
60272+ .plugins_list = {NULL, NULL},
60273+ .size = sizeof(perm_plugin)
60274+ },
60275+ [REISER4_ITEM_PLUGIN_TYPE] = {
60276+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
60277+ .label = "item",
60278+ .desc = "Item handlers",
60279+ .builtin_num = sizeof_array(item_plugins),
60280+ .builtin = item_plugins,
60281+ .plugins_list = {NULL, NULL},
60282+ .size = sizeof(item_plugin)
60283+ },
60284+ [REISER4_NODE_PLUGIN_TYPE] = {
60285+ .type_id = REISER4_NODE_PLUGIN_TYPE,
60286+ .label = "node",
60287+ .desc = "node layout handlers",
60288+ .builtin_num = sizeof_array(node_plugins),
60289+ .builtin = node_plugins,
60290+ .plugins_list = {NULL, NULL},
60291+ .size = sizeof(node_plugin)
60292+ },
60293+ [REISER4_SD_EXT_PLUGIN_TYPE] = {
60294+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
60295+ .label = "sd_ext",
60296+ .desc = "Parts of stat-data",
60297+ .builtin_num = sizeof_array(sd_ext_plugins),
60298+ .builtin = sd_ext_plugins,
60299+ .plugins_list = {NULL, NULL},
60300+ .size = sizeof(sd_ext_plugin)
60301+ },
60302+ [REISER4_FORMAT_PLUGIN_TYPE] = {
60303+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
60304+ .label = "disk_layout",
60305+ .desc = "defines filesystem on disk layout",
60306+ .builtin_num = sizeof_array(format_plugins),
60307+ .builtin = format_plugins,
60308+ .plugins_list = {NULL, NULL},
60309+ .size = sizeof(disk_format_plugin)
60310+ },
60311+ [REISER4_JNODE_PLUGIN_TYPE] = {
60312+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
60313+ .label = "jnode",
60314+ .desc = "defines kind of jnode",
60315+ .builtin_num = sizeof_array(jnode_plugins),
60316+ .builtin = jnode_plugins,
60317+ .plugins_list = {NULL, NULL},
60318+ .size = sizeof(jnode_plugin)
60319+ },
60320+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
60321+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60322+ .label = "compression_mode",
60323+ .desc = "Defines compression mode",
60324+ .builtin_num = sizeof_array(compression_mode_plugins),
60325+ .builtin = compression_mode_plugins,
60326+ .plugins_list = {NULL, NULL},
60327+ .size = sizeof(compression_mode_plugin)
60328+ },
60329+ [REISER4_CLUSTER_PLUGIN_TYPE] = {
60330+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
60331+ .label = "cluster",
60332+ .desc = "Defines cluster size",
60333+ .builtin_num = sizeof_array(cluster_plugins),
60334+ .builtin = cluster_plugins,
60335+ .plugins_list = {NULL, NULL},
60336+ .size = sizeof(cluster_plugin)
60337+ }
60338+};
60339+
60340+/*
60341+ * Local variables:
60342+ * c-indentation-style: "K&R"
60343+ * mode-name: "LC"
60344+ * c-basic-offset: 8
60345+ * tab-width: 8
60346+ * fill-column: 120
60347+ * End:
60348+ */
60349diff --git a/fs/reiser4/plugin/plugin.h b/fs/reiser4/plugin/plugin.h
60350new file mode 100644
60351index 0000000..a1d1097
60352--- /dev/null
60353+++ b/fs/reiser4/plugin/plugin.h
60354@@ -0,0 +1,920 @@
60355+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60356+
60357+/* Basic plugin data-types.
60358+ see fs/reiser4/plugin/plugin.c for details */
60359+
60360+#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
60361+#define __FS_REISER4_PLUGIN_TYPES_H__
60362+
60363+#include "../forward.h"
60364+#include "../debug.h"
60365+#include "../dformat.h"
60366+#include "../key.h"
60367+#include "compress/compress.h"
60368+#include "crypto/cipher.h"
60369+#include "plugin_header.h"
60370+#include "item/static_stat.h"
60371+#include "item/internal.h"
60372+#include "item/sde.h"
60373+#include "item/cde.h"
60374+#include "item/item.h"
60375+#include "node/node.h"
60376+#include "node/node40.h"
60377+#include "security/perm.h"
60378+#include "fibration.h"
60379+
60380+#include "space/bitmap.h"
60381+#include "space/space_allocator.h"
60382+
60383+#include "disk_format/disk_format40.h"
60384+#include "disk_format/disk_format.h"
60385+
60386+#include <linux/fs.h> /* for struct super_block, address_space */
60387+#include <linux/mm.h> /* for struct page */
60388+#include <linux/buffer_head.h> /* for struct buffer_head */
60389+#include <linux/dcache.h> /* for struct dentry */
60390+#include <linux/types.h>
60391+#include <linux/crypto.h>
60392+
60393+typedef struct reiser4_object_on_wire reiser4_object_on_wire;
60394+
60395+/*
60396+ * File plugin. Defines the set of methods that file plugins implement, some
60397+ * of which are optional.
60398+ *
60399+ * A file plugin offers to the caller an interface for IO ( writing to and/or
60400+ * reading from) to what the caller sees as one sequence of bytes. An IO to it
60401+ * may affect more than one physical sequence of bytes, or no physical sequence
60402+ * of bytes, it may affect sequences of bytes offered by other file plugins to
60403+ * the semantic layer, and the file plugin may invoke other plugins and
60404+ * delegate work to them, but its interface is structured for offering the
60405+ * caller the ability to read and/or write what the caller sees as being a
60406+ * single sequence of bytes.
60407+ *
60408+ * The file plugin must present a sequence of bytes to the caller, but it does
60409+ * not necessarily have to store a sequence of bytes, it does not necessarily
60410+ * have to support efficient tree traversal to any offset in the sequence of
60411+ * bytes (tail and extent items, whose keys contain offsets, do however provide
60412+ * efficient non-sequential lookup of any offset in the sequence of bytes).
60413+ *
60414+ * Directory plugins provide methods for selecting file plugins by resolving a
60415+ * name for them.
60416+ *
60417+ * The functionality other filesystems call an attribute, and rigidly tie
60418+ * together, we decompose into orthogonal selectable features of files. Using
60419+ * the terminology we will define next, an attribute is a perhaps constrained,
60420+ * perhaps static length, file whose parent has a uni-count-intra-link to it,
60421+ * which might be grandparent-major-packed, and whose parent has a deletion
60422+ * method that deletes it.
60423+ *
60424+ * File plugins can implement constraints.
60425+ *
60426+ * Files can be of variable length (e.g. regular unix files), or of static
60427+ * length (e.g. static sized attributes).
60428+ *
60429+ * An object may have many sequences of bytes, and many file plugins, but, it
60430+ * has exactly one objectid. It is usually desirable that an object has a
60431+ * deletion method which deletes every item with that objectid. Items cannot
60432+ * in general be found by just their objectids. This means that an object must
60433+ * have either a method built into its deletion plugin method for knowing what
60434+ * items need to be deleted, or links stored with the object that provide the
60435+ * plugin with a method for finding those items. Deleting a file within an
60436+ * object may or may not have the effect of deleting the entire object,
60437+ * depending on the file plugin's deletion method.
60438+ *
60439+ * LINK TAXONOMY:
60440+ *
60441+ * Many objects have a reference count, and when the reference count reaches 0
60442+ * the object's deletion method is invoked. Some links embody a reference
60443+ * count increase ("countlinks"), and others do not ("nocountlinks").
60444+ *
60445+ * Some links are bi-directional links ("bilinks"), and some are
60446+ * uni-directional("unilinks").
60447+ *
60448+ * Some links are between parts of the same object ("intralinks"), and some are
60449+ * between different objects ("interlinks").
60450+ *
60451+ * PACKING TAXONOMY:
60452+ *
60453+ * Some items of an object are stored with a major packing locality based on
60454+ * their object's objectid (e.g. unix directory items in plan A), and these are
60455+ * called "self-major-packed".
60456+ *
60457+ * Some items of an object are stored with a major packing locality based on
60458+ * their semantic parent object's objectid (e.g. unix file bodies in plan A),
60459+ * and these are called "parent-major-packed".
60460+ *
60461+ * Some items of an object are stored with a major packing locality based on
60462+ * their semantic grandparent, and these are called "grandparent-major-packed".
60463+ * Now carefully notice that we run into trouble with key length if we have to
60464+ * store a 8 byte major+minor grandparent based packing locality, an 8 byte
60465+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
60466+ * a 24 byte key. One of these fields must be sacrificed if an item is to be
60467+ * grandparent-major-packed, and which to sacrifice is left to the item author
60468+ * choosing to make the item grandparent-major-packed. You cannot make tail
60469+ * items and extent items grandparent-major-packed, though you could make them
60470+ * self-major-packed (usually they are parent-major-packed).
60471+ *
60472+ * In the case of ACLs (which are composed of fixed length ACEs which consist
60473+ * of {subject-type, subject, and permission bitmask} triples), it makes sense
60474+ * to not have an offset field in the ACE item key, and to allow duplicate keys
60475+ * for ACEs. Thus, the set of ACES for a given file is found by looking for a
60476+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in
60477+ * a directory together), the minor packing locality of ACE, the objectid of
60478+ * the file, and 0.
60479+ *
60480+ * IO involves moving data from one location to another, which means that two
60481+ * locations must be specified, source and destination.
60482+ *
60483+ * This source and destination can be in the filesystem, or they can be a
60484+ * pointer in the user process address space plus a byte count.
60485+ *
60486+ * If both source and destination are in the filesystem, then at least one of
60487+ * them must be representable as a pure stream of bytes (which we call a flow,
60488+ * and define as a struct containing a key, a data pointer, and a length).
60489+ * This may mean converting one of them into a flow. We provide a generic
60490+ * cast_into_flow() method, which will work for any plugin supporting
60491+ * read_flow(), though it is inefficiently implemented in that it temporarily
60492+ * stores the flow in a buffer (Question: what to do with huge flows that
60493+ * cannot fit into memory? Answer: we must not convert them all at once. )
60494+ *
60495+ * Performing a write requires resolving the write request into a flow defining
60496+ * the source, and a method that performs the write, and a key that defines
60497+ * where in the tree the write is to go.
60498+ *
60499+ * Performing a read requires resolving the read request into a flow defining
60500+ * the target, and a method that performs the read, and a key that defines
60501+ * where in the tree the read is to come from.
60502+ *
60503+ * There will exist file plugins which have no pluginid stored on the disk for
60504+ * them, and which are only invoked by other plugins.
60505+ */
60506+
60507+/* This should be incremented with each new contributed
60508+ pair (plugin type, plugin id).
60509+ NOTE: Make sure there is a release of reiser4progs
60510+ with the corresponding version number */
60511+#define PLUGIN_LIBRARY_VERSION 0
60512+
60513+ /* enumeration of fields within plugin_set */
60514+typedef enum {
60515+ PSET_FILE,
60516+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
60517+ * inode.c:read_inode() depends on this. */
60518+ PSET_PERM,
60519+ PSET_FORMATTING,
60520+ PSET_HASH,
60521+ PSET_FIBRATION,
60522+ PSET_SD,
60523+ PSET_DIR_ITEM,
60524+ PSET_CIPHER,
60525+ PSET_DIGEST,
60526+ PSET_COMPRESSION,
60527+ PSET_COMPRESSION_MODE,
60528+ PSET_CLUSTER,
60529+ PSET_CREATE,
60530+ PSET_LAST
60531+} pset_member;
60532+
60533+/* builtin file-plugins */
60534+typedef enum {
60535+ /* regular file */
60536+ UNIX_FILE_PLUGIN_ID,
60537+ /* directory */
60538+ DIRECTORY_FILE_PLUGIN_ID,
60539+ /* symlink */
60540+ SYMLINK_FILE_PLUGIN_ID,
60541+ /* for objects completely handled by the VFS: fifos, devices,
60542+ sockets */
60543+ SPECIAL_FILE_PLUGIN_ID,
60544+ /* regular cryptcompress file */
60545+ CRYPTCOMPRESS_FILE_PLUGIN_ID,
60546+ /* number of file plugins. Used as size of arrays to hold
60547+ file plugins. */
60548+ LAST_FILE_PLUGIN_ID
60549+} reiser4_file_id;
60550+
60551+typedef struct file_plugin {
60552+
60553+ /* generic fields */
60554+ plugin_header h;
60555+
60556+ struct inode_operations inode_ops;
60557+ struct file_operations file_ops;
60558+ struct address_space_operations as_ops;
60559+
60560+ /* save inode cached stat-data onto disk. It was called
60561+ reiserfs_update_sd() in 3.x */
60562+ int (*write_sd_by_inode) (struct inode *);
60563+
60564+ /*
60565+ * private methods: These are optional. If used they will allow you to
60566+ * minimize the amount of code needed to implement a deviation from
60567+ * some other method that also uses them.
60568+ */
60569+
60570+ /*
60571+ * Construct flow into @flow according to user-supplied data.
60572+ *
60573+ * This is used by read/write methods to construct a flow to
60574+ * write/read. ->flow_by_inode() is plugin method, rather than single
60575+ * global implementation, because key in a flow used by plugin may
60576+ * depend on data in a @buf.
60577+ *
60578+ * NIKITA-FIXME-HANS: please create statistics on what functions are
60579+ * dereferenced how often for the mongo benchmark. You can supervise
60580+ * Elena doing this for you if that helps. Email me the list of the
60581+ * top 10, with their counts, and an estimate of the total number of
60582+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent
60583+ * processing (non-idle processing). If the total percent is, say,
60584+ * less than 1%, it will make our coding discussions much easier, and
60585+ * keep me from questioning whether functions like the below are too
60586+ * frequently called to be dereferenced. If the total percent is more
60587+ * than 1%, perhaps private methods should be listed in a "required"
60588+ * comment at the top of each plugin (with stern language about how if
60589+ * the comment is missing it will not be accepted by the maintainer),
60590+ * and implemented using macros not dereferenced functions. How about
60591+ * replacing this whole private methods part of the struct with a
60592+ * thorough documentation of what the standard helper functions are for
60593+ * use in constructing plugins? I think users have been asking for
60594+ * that, though not in so many words.
60595+ */
60596+ int (*flow_by_inode) (struct inode *, const char __user *buf,
60597+ int user, loff_t size,
60598+ loff_t off, rw_op op, flow_t *);
60599+
60600+ /*
60601+ * Return the key used to retrieve an offset of a file. It is used by
60602+ * default implementation of ->flow_by_inode() method
60603+ * (common_build_flow()) and, among other things, to get to the extent
60604+ * from jnode of unformatted node.
60605+ */
60606+ int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
60607+
60608+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
60609+ /*
60610+ * set the plugin for a file. Called during file creation in creat()
60611+ * but not reiser4() unless an inode already exists for the file.
60612+ */
60613+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
60614+ reiser4_object_create_data *);
60615+
60616+ /* NIKITA-FIXME-HANS: comment and name seem to say different things,
60617+ * are you setting up the object itself also or just adjusting the
60618+ * parent?.... */
60619+ /* set up plugins for new @object created in @parent. @root is root
60620+ directory. */
60621+ int (*adjust_to_parent) (struct inode *object, struct inode *parent,
60622+ struct inode *root);
60623+ /*
60624+ * this does whatever is necessary to do when object is created. For
60625+ * instance, for unix files stat data is inserted. It is supposed to be
60626+ * called by create of struct inode_operations.
60627+ */
60628+ int (*create_object) (struct inode *object, struct inode *parent,
60629+ reiser4_object_create_data *);
60630+
60631+ /* this does whatever is necessary to do when object is opened */
60632+ int (*open_object) (struct inode * inode, struct file * file);
60633+ /*
60634+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on
60635+ * success. Deletion of an object usually includes removal of items
60636+ * building file body (for directories this is removal of "." and "..")
60637+ * and removal of stat-data item.
60638+ */
60639+ int (*delete_object) (struct inode *);
60640+
60641+ /* add link from @parent to @object */
60642+ int (*add_link) (struct inode *object, struct inode *parent);
60643+
60644+ /* remove link from @parent to @object */
60645+ int (*rem_link) (struct inode *object, struct inode *parent);
60646+
60647+ /*
60648+ * return true if item addressed by @coord belongs to @inode. This is
60649+ * used by read/write to properly slice flow into items in presence of
60650+ * multiple key assignment policies, because items of a file are not
60651+ * necessarily contiguous in a key space, for example, in a plan-b.
60652+ */
60653+ int (*owns_item) (const struct inode *, const coord_t *);
60654+
60655+ /* checks whether yet another hard links to this object can be
60656+ added */
60657+ int (*can_add_link) (const struct inode *);
60658+
60659+ /* checks whether hard links to this object can be removed */
60660+ int (*can_rem_link) (const struct inode *);
60661+
60662+ /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
60663+ detach of directory plugin to remove ".." */
60664+ int (*detach) (struct inode * child, struct inode * parent);
60665+
60666+ /* called when @child was just looked up in the @parent. It is not
60667+ empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
60668+ directory plugin */
60669+ int (*bind) (struct inode * child, struct inode * parent);
60670+
60671+ /* process safe-link during mount */
60672+ int (*safelink) (struct inode * object, reiser4_safe_link_t link,
60673+ __u64 value);
60674+
60675+ /* The couple of estimate methods for all file operations */
60676+ struct {
60677+ reiser4_block_nr(*create) (const struct inode *);
60678+ reiser4_block_nr(*update) (const struct inode *);
60679+ reiser4_block_nr(*unlink) (const struct inode *,
60680+ const struct inode *);
60681+ } estimate;
60682+
60683+ /*
60684+ * reiser4 specific part of inode has a union of structures which are
60685+ * specific to a plugin. This method is called when inode is read
60686+ * (read_inode) and when file is created (common_create_child) so that
60687+ * file plugin could initialize its inode data
60688+ */
60689+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
60690+ int);
60691+
60692+ /*
60693+ * This method performs progressive deletion of items and whole nodes
60694+ * from right to left.
60695+ *
60696+ * @tap: the point deletion process begins from,
60697+ * @from_key: the beginning of the deleted key range,
60698+ * @to_key: the end of the deleted key range,
60699+ * @smallest_removed: the smallest removed key,
60700+ *
60701+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
60702+ * operation was interrupted for allowing atom commit .
60703+ */
60704+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
60705+ const reiser4_key * to_key,
60706+ reiser4_key * smallest_removed, struct inode *,
60707+ int, int *);
60708+
60709+ /* called from ->destroy_inode() */
60710+ void (*destroy_inode) (struct inode *);
60711+
60712+ /*
60713+ * methods to serialize object identify. This is used, for example, by
60714+ * reiser4_{en,de}code_fh().
60715+ */
60716+ struct {
60717+ /* store object's identity at @area */
60718+ char *(*write) (struct inode * inode, char *area);
60719+ /* parse object from wire to the @obj */
60720+ char *(*read) (char *area, reiser4_object_on_wire * obj);
60721+ /* given object identity in @obj, find or create its dentry */
60722+ struct dentry *(*get) (struct super_block * s,
60723+ reiser4_object_on_wire * obj);
60724+ /* how many bytes ->wire.write() consumes */
60725+ int (*size) (struct inode * inode);
60726+ /* finish with object identify */
60727+ void (*done) (reiser4_object_on_wire * obj);
60728+ } wire;
60729+} file_plugin;
60730+
60731+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60732+
60733+struct reiser4_object_on_wire {
60734+ file_plugin *plugin;
60735+ union {
60736+ struct {
60737+ obj_key_id key_id;
60738+ } std;
60739+ void *generic;
60740+ } u;
60741+};
60742+
60743+/* builtin dir-plugins */
60744+typedef enum {
60745+ HASHED_DIR_PLUGIN_ID,
60746+ SEEKABLE_HASHED_DIR_PLUGIN_ID,
60747+ LAST_DIR_ID
60748+} reiser4_dir_id;
60749+
60750+typedef struct dir_plugin {
60751+ /* generic fields */
60752+ plugin_header h;
60753+
60754+ struct inode_operations inode_ops;
60755+ struct file_operations file_ops;
60756+ struct address_space_operations as_ops;
60757+
60758+ /*
60759+ * private methods: These are optional. If used they will allow you to
60760+ * minimize the amount of code needed to implement a deviation from
60761+ * some other method that uses them. You could logically argue that
60762+ * they should be a separate type of plugin.
60763+ */
60764+
60765+ struct dentry *(*get_parent) (struct inode * childdir);
60766+
60767+ /*
60768+ * check whether "name" is acceptable name to be inserted into this
60769+ * object. Optionally implemented by directory-like objects. Can check
60770+ * for maximal length, reserved symbols etc
60771+ */
60772+ int (*is_name_acceptable) (const struct inode * inode, const char *name,
60773+ int len);
60774+
60775+ void (*build_entry_key) (const struct inode * dir /* directory where
60776+ * entry is (or will
60777+ * be) in.*/ ,
60778+ const struct qstr * name /* name of file
60779+ * referenced by this
60780+ * entry */ ,
60781+ reiser4_key * result /* resulting key of
60782+ * directory entry */ );
60783+ int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60784+ int (*add_entry) (struct inode * object, struct dentry * where,
60785+ reiser4_object_create_data * data,
60786+ reiser4_dir_entry_desc * entry);
60787+ int (*rem_entry) (struct inode * object, struct dentry * where,
60788+ reiser4_dir_entry_desc * entry);
60789+
60790+ /*
60791+ * initialize directory structure for newly created object. For normal
60792+ * unix directories, insert dot and dotdot.
60793+ */
60794+ int (*init) (struct inode * object, struct inode * parent,
60795+ reiser4_object_create_data * data);
60796+
60797+ /* destroy directory */
60798+ int (*done) (struct inode * child);
60799+
60800+ /* called when @subdir was just looked up in the @dir */
60801+ int (*attach) (struct inode * subdir, struct inode * dir);
60802+ int (*detach) (struct inode * subdir, struct inode * dir);
60803+
60804+ struct {
60805+ reiser4_block_nr(*add_entry) (const struct inode *);
60806+ reiser4_block_nr(*rem_entry) (const struct inode *);
60807+ reiser4_block_nr(*unlink) (const struct inode *,
60808+ const struct inode *);
60809+ } estimate;
60810+} dir_plugin;
60811+
60812+extern dir_plugin dir_plugins[LAST_DIR_ID];
60813+
60814+typedef struct formatting_plugin {
60815+ /* generic fields */
60816+ plugin_header h;
60817+ /* returns non-zero iff file's tail has to be stored
60818+ in a direct item. */
60819+ int (*have_tail) (const struct inode * inode, loff_t size);
60820+} formatting_plugin;
60821+
60822+typedef struct hash_plugin {
60823+ /* generic fields */
60824+ plugin_header h;
60825+ /* computes hash of the given name */
60826+ __u64(*hash) (const unsigned char *name, int len);
60827+} hash_plugin;
60828+
60829+typedef struct cipher_plugin {
60830+ /* generic fields */
60831+ plugin_header h;
60832+ struct crypto_blkcipher * (*alloc) (void);
60833+ void (*free) (struct crypto_blkcipher * tfm);
60834+ /* Offset translator. For each offset this returns (k * offset), where
60835+ k (k >= 1) is an expansion factor of the cipher algorithm.
60836+ For all symmetric algorithms k == 1. For asymmetric algorithms (which
60837+ inflate data) offset translation guarantees that all disk cluster's
60838+ units will have keys smaller then next cluster's one.
60839+ */
60840+ loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60841+ /* Cipher algorithms can accept data only by chunks of cipher block
60842+ size. This method is to align any flow up to cipher block size when
60843+ we pass it to cipher algorithm. To align means to append padding of
60844+ special format specific to the cipher algorithm */
60845+ int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60846+ /* low-level key manager (check, install, etc..) */
60847+ int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60848+ unsigned int keylen);
60849+ /* main text processing procedures */
60850+ void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60851+ void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60852+} cipher_plugin;
60853+
60854+typedef struct digest_plugin {
60855+ /* generic fields */
60856+ plugin_header h;
60857+ /* fingerprint size in bytes */
60858+ int fipsize;
60859+ struct crypto_hash * (*alloc) (void);
60860+ void (*free) (struct crypto_hash * tfm);
60861+} digest_plugin;
60862+
60863+typedef struct compression_plugin {
60864+ /* generic fields */
60865+ plugin_header h;
60866+ int (*init) (void);
60867+ /* the maximum number of bytes the size of the "compressed" data can
60868+ * exceed the uncompressed data. */
60869+ int (*overrun) (unsigned src_len);
60870+ coa_t(*alloc) (tfm_action act);
60871+ void (*free) (coa_t coa, tfm_action act);
60872+ /* minimal size of the flow we still try to compress */
60873+ int (*min_size_deflate) (void);
60874+ __u32(*checksum) (char *data, __u32 length);
60875+ /* main transform procedures */
60876+ void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60877+ __u8 * dst_first, unsigned *dst_len);
60878+ void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60879+ __u8 * dst_first, unsigned *dst_len);
60880+} compression_plugin;
60881+
60882+typedef struct compression_mode_plugin {
60883+ /* generic fields */
60884+ plugin_header h;
60885+ /* this is called when estimating compressibility
60886+ of a logical cluster by its content */
60887+ int (*should_deflate) (struct inode * inode, cloff_t index);
60888+ /* this is called when results of compression should be saved */
60889+ int (*accept_hook) (struct inode * inode, cloff_t index);
60890+ /* this is called when results of compression should be discarded */
60891+ int (*discard_hook) (struct inode * inode, cloff_t index);
60892+} compression_mode_plugin;
60893+
60894+typedef struct cluster_plugin {
60895+ /* generic fields */
60896+ plugin_header h;
60897+ int shift;
60898+} cluster_plugin;
60899+
60900+typedef struct sd_ext_plugin {
60901+ /* generic fields */
60902+ plugin_header h;
60903+ int (*present) (struct inode * inode, char **area, int *len);
60904+ int (*absent) (struct inode * inode);
60905+ int (*save_len) (struct inode * inode);
60906+ int (*save) (struct inode * inode, char **area);
60907+ /* alignment requirement for this stat-data part */
60908+ int alignment;
60909+} sd_ext_plugin;
60910+
60911+/* this plugin contains methods to allocate objectid for newly created files,
60912+ to deallocate objectid when file gets removed, to report number of used and
60913+ free objectids */
60914+typedef struct oid_allocator_plugin {
60915+ /* generic fields */
60916+ plugin_header h;
60917+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60918+ __u64 oids);
60919+ /* used to report statfs->f_files */
60920+ __u64(*oids_used) (reiser4_oid_allocator * map);
60921+ /* get next oid to use */
60922+ __u64(*next_oid) (reiser4_oid_allocator * map);
60923+ /* used to report statfs->f_ffree */
60924+ __u64(*oids_free) (reiser4_oid_allocator * map);
60925+ /* allocate new objectid */
60926+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60927+ /* release objectid */
60928+ int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60929+ /* how many pages to reserve in transaction for allocation of new
60930+ objectid */
60931+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60932+ /* how many pages to reserve in transaction for freeing of an
60933+ objectid */
60934+ int (*oid_reserve_release) (reiser4_oid_allocator * map);
60935+ void (*print_info) (const char *, reiser4_oid_allocator *);
60936+} oid_allocator_plugin;
60937+
60938+/* disk layout plugin: this specifies super block, journal, bitmap (if there
60939+ are any) locations, etc */
60940+typedef struct disk_format_plugin {
60941+ /* generic fields */
60942+ plugin_header h;
60943+ /* replay journal, initialize super_info_data, etc */
60944+ int (*init_format) (struct super_block *, void *data);
60945+
60946+ /* key of root directory stat data */
60947+ const reiser4_key *(*root_dir_key) (const struct super_block *);
60948+
60949+ int (*release) (struct super_block *);
60950+ jnode *(*log_super) (struct super_block *);
60951+ int (*check_open) (const struct inode * object);
60952+ int (*version_update) (struct super_block *);
60953+} disk_format_plugin;
60954+
60955+struct jnode_plugin {
60956+ /* generic fields */
60957+ plugin_header h;
60958+ int (*init) (jnode * node);
60959+ int (*parse) (jnode * node);
60960+ struct address_space *(*mapping) (const jnode * node);
60961+ unsigned long (*index) (const jnode * node);
60962+ jnode *(*clone) (jnode * node);
60963+};
60964+
60965+/* plugin instance. */
60966+/* */
60967+/* This is "wrapper" union for all types of plugins. Most of the code uses */
60968+/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */
60969+/* operates with pointers to reiser4_plugin. This union is only used in */
60970+/* some generic code in plugin/plugin.c that operates on all */
60971+/* plugins. Technically speaking purpose of this union is to add type */
60972+/* safety to said generic code: each plugin type (file_plugin, for */
60973+/* example), contains plugin_header as its first memeber. This first member */
60974+/* is located at the same place in memory as .h member of */
60975+/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */
60976+/* looks in the .h which is header of plugin type located in union. This */
60977+/* allows to avoid type-casts. */
60978+union reiser4_plugin {
60979+ /* generic fields */
60980+ plugin_header h;
60981+ /* file plugin */
60982+ file_plugin file;
60983+ /* directory plugin */
60984+ dir_plugin dir;
60985+ /* hash plugin, used by directory plugin */
60986+ hash_plugin hash;
60987+ /* fibration plugin used by directory plugin */
60988+ fibration_plugin fibration;
60989+ /* cipher transform plugin, used by file plugin */
60990+ cipher_plugin cipher;
60991+ /* digest transform plugin, used by file plugin */
60992+ digest_plugin digest;
60993+ /* compression transform plugin, used by file plugin */
60994+ compression_plugin compression;
60995+ /* tail plugin, used by file plugin */
60996+ formatting_plugin formatting;
60997+ /* permission plugin */
60998+ perm_plugin perm;
60999+ /* node plugin */
61000+ node_plugin node;
61001+ /* item plugin */
61002+ item_plugin item;
61003+ /* stat-data extension plugin */
61004+ sd_ext_plugin sd_ext;
61005+ /* disk layout plugin */
61006+ disk_format_plugin format;
61007+ /* object id allocator plugin */
61008+ oid_allocator_plugin oid_allocator;
61009+ /* plugin for different jnode types */
61010+ jnode_plugin jnode;
61011+ /* compression mode plugin, used by object plugin */
61012+ compression_mode_plugin compression_mode;
61013+ /* cluster plugin, used by object plugin */
61014+ cluster_plugin clust;
61015+ /* place-holder for new plugin types that can be registered
61016+ dynamically, and used by other dynamically loaded plugins. */
61017+ void *generic;
61018+};
61019+
61020+struct reiser4_plugin_ops {
61021+ /* called when plugin is initialized */
61022+ int (*init) (reiser4_plugin * plugin);
61023+ /* called when plugin is unloaded */
61024+ int (*done) (reiser4_plugin * plugin);
61025+ /* load given plugin from disk */
61026+ int (*load) (struct inode * inode,
61027+ reiser4_plugin * plugin, char **area, int *len);
61028+ /* how many space is required to store this plugin's state
61029+ in stat-data */
61030+ int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
61031+ /* save persistent plugin-data to disk */
61032+ int (*save) (struct inode * inode, reiser4_plugin * plugin,
61033+ char **area);
61034+ /* alignment requirement for on-disk state of this plugin
61035+ in number of bytes */
61036+ int alignment;
61037+ /* install itself into given inode. This can return error
61038+ (e.g., you cannot change hash of non-empty directory). */
61039+ int (*change) (struct inode * inode, reiser4_plugin * plugin,
61040+ pset_member memb);
61041+ /* install itself into given inode. This can return error
61042+ (e.g., you cannot change hash of non-empty directory). */
61043+ int (*inherit) (struct inode * inode, struct inode * parent,
61044+ reiser4_plugin * plugin);
61045+};
61046+
61047+/* functions implemented in fs/reiser4/plugin/plugin.c */
61048+
61049+/* stores plugin reference in reiser4-specific part of inode */
61050+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
61051+extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
61052+extern int init_plugins(void);
61053+
61054+/* builtin plugins */
61055+
61056+/* builtin hash-plugins */
61057+
61058+typedef enum {
61059+ RUPASOV_HASH_ID,
61060+ R5_HASH_ID,
61061+ TEA_HASH_ID,
61062+ FNV1_HASH_ID,
61063+ DEGENERATE_HASH_ID,
61064+ LAST_HASH_ID
61065+} reiser4_hash_id;
61066+
61067+/* builtin cipher plugins */
61068+
61069+typedef enum {
61070+ NONE_CIPHER_ID,
61071+ LAST_CIPHER_ID
61072+} reiser4_cipher_id;
61073+
61074+/* builtin digest plugins */
61075+
61076+typedef enum {
61077+ SHA256_32_DIGEST_ID,
61078+ LAST_DIGEST_ID
61079+} reiser4_digest_id;
61080+
61081+/* builtin compression mode plugins */
61082+typedef enum {
61083+ NONE_COMPRESSION_MODE_ID,
61084+ LATTD_COMPRESSION_MODE_ID,
61085+ ULTIM_COMPRESSION_MODE_ID,
61086+ FORCE_COMPRESSION_MODE_ID,
61087+ CONVX_COMPRESSION_MODE_ID,
61088+ LAST_COMPRESSION_MODE_ID
61089+} reiser4_compression_mode_id;
61090+
61091+/* builtin cluster plugins */
61092+typedef enum {
61093+ CLUSTER_64K_ID,
61094+ CLUSTER_32K_ID,
61095+ CLUSTER_16K_ID,
61096+ CLUSTER_8K_ID,
61097+ CLUSTER_4K_ID,
61098+ LAST_CLUSTER_ID
61099+} reiser4_cluster_id;
61100+
61101+/* builtin tail-plugins */
61102+
61103+typedef enum {
61104+ NEVER_TAILS_FORMATTING_ID,
61105+ ALWAYS_TAILS_FORMATTING_ID,
61106+ SMALL_FILE_FORMATTING_ID,
61107+ LAST_TAIL_FORMATTING_ID
61108+} reiser4_formatting_id;
61109+
61110+/* compression/clustering specific data */
61111+typedef struct compression_data {
61112+ reiser4_compression_id coa; /* id of the compression algorithm */
61113+} compression_data_t;
61114+
61115+typedef __u8 cluster_data_t; /* cluster info */
61116+
61117+/* data type used to pack parameters that we pass to vfs object creation
61118+ function create_object() */
61119+struct reiser4_object_create_data {
61120+ /* plugin to control created object */
61121+ reiser4_file_id id;
61122+ /* mode of regular file, directory or special file */
61123+/* what happens if some other sort of perm plugin is in use? */
61124+ int mode;
61125+ /* rdev of special file */
61126+ dev_t rdev;
61127+ /* symlink target */
61128+ const char *name;
61129+ /* add here something for non-standard objects you invent, like
61130+ query for interpolation file etc. */
61131+
61132+ crypto_stat_t * crypto;
61133+ compression_data_t *compression;
61134+ cluster_data_t *cluster;
61135+
61136+ struct inode *parent;
61137+ struct dentry *dentry;
61138+};
61139+
61140+/* description of directory entry being created/destroyed/sought for
61141+
61142+ It is passed down to the directory plugin and farther to the
61143+ directory item plugin methods. Creation of new directory is done in
61144+ several stages: first we search for an entry with the same name, then
61145+ create new one. reiser4_dir_entry_desc is used to store some information
61146+ collected at some stage of this process and required later: key of
61147+ item that we want to insert/delete and pointer to an object that will
61148+ be bound by the new directory entry. Probably some more fields will
61149+ be added there.
61150+
61151+*/
61152+struct reiser4_dir_entry_desc {
61153+ /* key of directory entry */
61154+ reiser4_key key;
61155+ /* object bound by this entry. */
61156+ struct inode *obj;
61157+};
61158+
61159+#define MAX_PLUGIN_TYPE_LABEL_LEN 32
61160+#define MAX_PLUGIN_PLUG_LABEL_LEN 32
61161+
61162+/* used for interface with user-land: table-driven parsing in
61163+ reiser4(). */
61164+typedef struct plugin_locator {
61165+ reiser4_plugin_type type_id;
61166+ reiser4_plugin_id id;
61167+ char type_label[MAX_PLUGIN_TYPE_LABEL_LEN];
61168+ char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN];
61169+} plugin_locator;
61170+
61171+extern int locate_plugin(struct inode *inode, plugin_locator * loc);
61172+
61173+#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
61174+static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
61175+{ \
61176+ reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
61177+ return plugin ? & plugin -> FIELD : NULL; \
61178+} \
61179+static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
61180+{ \
61181+ reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
61182+ return plugin ? & plugin -> FIELD : NULL; \
61183+} \
61184+static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
61185+{ \
61186+ reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
61187+ return plugin ? & plugin -> FIELD : NULL; \
61188+} \
61189+static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
61190+{ \
61191+ return ( reiser4_plugin * ) plugin; \
61192+} \
61193+static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
61194+{ \
61195+ return TYPE ## _to_plugin (plugin) -> h.id; \
61196+} \
61197+typedef struct { int foo; } TYPE ## _plugin_dummy
61198+
61199+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
61200+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
61201+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
61202+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
61203+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
61204+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
61205+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
61206+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
61207+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
61208+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
61209+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
61210+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
61211+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
61212+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
61213+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61214+ compression_mode);
61215+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
61216+
61217+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
61218+
61219+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
61220+
61221+#define for_all_plugins(ptype, plugin) \
61222+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
61223+ get_plugin_list(ptype) != &plugin->h.linkage; \
61224+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
61225+
61226+
61227+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
61228+extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
61229+extern int finish_pset(struct inode *inode);
61230+
61231+/* defined in fs/reiser4/plugin/object.c */
61232+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
61233+/* defined in fs/reiser4/plugin/object.c */
61234+extern dir_plugin dir_plugins[LAST_DIR_ID];
61235+/* defined in fs/reiser4/plugin/item/static_stat.c */
61236+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
61237+/* defined in fs/reiser4/plugin/hash.c */
61238+extern hash_plugin hash_plugins[LAST_HASH_ID];
61239+/* defined in fs/reiser4/plugin/fibration.c */
61240+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
61241+/* defined in fs/reiser4/plugin/crypt.c */
61242+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
61243+/* defined in fs/reiser4/plugin/digest.c */
61244+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
61245+/* defined in fs/reiser4/plugin/compress/compress.c */
61246+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
61247+/* defined in fs/reiser4/plugin/compress/compression_mode.c */
61248+extern compression_mode_plugin
61249+compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
61250+/* defined in fs/reiser4/plugin/cluster.c */
61251+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
61252+/* defined in fs/reiser4/plugin/tail.c */
61253+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
61254+/* defined in fs/reiser4/plugin/security/security.c */
61255+extern perm_plugin perm_plugins[LAST_PERM_ID];
61256+/* defined in fs/reiser4/plugin/item/item.c */
61257+extern item_plugin item_plugins[LAST_ITEM_ID];
61258+/* defined in fs/reiser4/plugin/node/node.c */
61259+extern node_plugin node_plugins[LAST_NODE_ID];
61260+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
61261+extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
61262+
61263+/* __FS_REISER4_PLUGIN_TYPES_H__ */
61264+#endif
61265+
61266+/* Make Linus happy.
61267+ Local variables:
61268+ c-indentation-style: "K&R"
61269+ mode-name: "LC"
61270+ c-basic-offset: 8
61271+ tab-width: 8
61272+ fill-column: 120
61273+ End:
61274+*/
61275diff --git a/fs/reiser4/plugin/plugin_header.h b/fs/reiser4/plugin/plugin_header.h
61276new file mode 100644
61277index 0000000..68cf5b0
61278--- /dev/null
61279+++ b/fs/reiser4/plugin/plugin_header.h
61280@@ -0,0 +1,144 @@
61281+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61282+
61283+/* plugin header. Data structures required by all plugin types. */
61284+
61285+#if !defined( __PLUGIN_HEADER_H__ )
61286+#define __PLUGIN_HEADER_H__
61287+
61288+/* plugin data-types and constants */
61289+
61290+#include "../debug.h"
61291+#include "../dformat.h"
61292+
61293+typedef enum {
61294+ REISER4_FILE_PLUGIN_TYPE,
61295+ REISER4_DIR_PLUGIN_TYPE,
61296+ REISER4_ITEM_PLUGIN_TYPE,
61297+ REISER4_NODE_PLUGIN_TYPE,
61298+ REISER4_HASH_PLUGIN_TYPE,
61299+ REISER4_FIBRATION_PLUGIN_TYPE,
61300+ REISER4_FORMATTING_PLUGIN_TYPE,
61301+ REISER4_PERM_PLUGIN_TYPE,
61302+ REISER4_SD_EXT_PLUGIN_TYPE,
61303+ REISER4_FORMAT_PLUGIN_TYPE,
61304+ REISER4_JNODE_PLUGIN_TYPE,
61305+ REISER4_CIPHER_PLUGIN_TYPE,
61306+ REISER4_DIGEST_PLUGIN_TYPE,
61307+ REISER4_COMPRESSION_PLUGIN_TYPE,
61308+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61309+ REISER4_CLUSTER_PLUGIN_TYPE,
61310+ REISER4_PLUGIN_TYPES
61311+} reiser4_plugin_type;
61312+
61313+typedef enum {
61314+ REISER4_DIRECTORY_FILE,
61315+ REISER4_REGULAR_FILE,
61316+ REISER4_SYMLINK_FILE,
61317+ REISER4_SPECIAL_FILE,
61318+} reiser4_plugin_group;
61319+
61320+struct reiser4_plugin_ops;
61321+/* generic plugin operations, supported by each
61322+ plugin type. */
61323+typedef struct reiser4_plugin_ops reiser4_plugin_ops;
61324+
61325+/* the common part of all plugin instances. */
61326+typedef struct plugin_header {
61327+ /* plugin type */
61328+ reiser4_plugin_type type_id;
61329+ /* id of this plugin */
61330+ reiser4_plugin_id id;
61331+ /* bitmask of groups the plugin belongs to. */
61332+ reiser4_plugin_groups groups;
61333+ /* plugin operations */
61334+ reiser4_plugin_ops *pops;
61335+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
61336+ /* short label of this plugin */
61337+ const char *label;
61338+ /* descriptive string.. */
61339+ const char *desc;
61340+ /* list linkage */
61341+ struct list_head linkage;
61342+} plugin_header;
61343+
61344+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
61345+
61346+/* PRIVATE INTERFACES */
61347+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
61348+/* plugin type representation. */
61349+typedef struct reiser4_plugin_type_data {
61350+ /* internal plugin type identifier. Should coincide with
61351+ index of this item in plugins[] array. */
61352+ reiser4_plugin_type type_id;
61353+ /* short symbolic label of this plugin type. Should be no longer
61354+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
61355+ const char *label;
61356+ /* plugin type description longer than .label */
61357+ const char *desc;
61358+
61359+/* NIKITA-FIXME-HANS: define built-in */
61360+ /* number of built-in plugin instances of this type */
61361+ int builtin_num;
61362+ /* array of built-in plugins */
61363+ void *builtin;
61364+ struct list_head plugins_list;
61365+ size_t size;
61366+} reiser4_plugin_type_data;
61367+
61368+extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
61369+
61370+int is_plugin_type_valid(reiser4_plugin_type type);
61371+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
61372+
61373+static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i)
61374+{
61375+ char *builtin;
61376+
61377+ builtin = ptype->builtin;
61378+ return (reiser4_plugin *) (builtin + i * ptype->size);
61379+}
61380+
61381+/* return plugin by its @type_id and @id */
61382+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
61383+ reiser4_plugin_id id)
61384+{
61385+ assert("nikita-1651", is_plugin_type_valid(type));
61386+ assert("nikita-1652", is_plugin_id_valid(type, id));
61387+ return plugin_at(&plugins[type], id);
61388+}
61389+
61390+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
61391+ reiser4_plugin_id id);
61392+
61393+/**
61394+ * plugin_by_disk_id - get reiser4_plugin
61395+ * @type_id: plugin type id
61396+ * @did: plugin id in disk format
61397+ *
61398+ * Returns reiser4_plugin by plugin type id an dplugin_id.
61399+ */
61400+static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
61401+ reiser4_plugin_type type_id,
61402+ __le16 *plugin_id)
61403+{
61404+ /*
61405+ * what we should do properly is to maintain within each file-system a
61406+ * dictionary that maps on-disk plugin ids to "universal" ids. This
61407+ * dictionary will be resolved on mount time, so that this function
61408+ * will perform just one additional array lookup.
61409+ */
61410+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
61411+}
61412+
61413+/* __PLUGIN_HEADER_H__ */
61414+#endif
61415+
61416+/*
61417+ * Local variables:
61418+ * c-indentation-style: "K&R"
61419+ * mode-name: "LC"
61420+ * c-basic-offset: 8
61421+ * tab-width: 8
61422+ * fill-column: 79
61423+ * End:
61424+ */
61425diff --git a/fs/reiser4/plugin/plugin_set.c b/fs/reiser4/plugin/plugin_set.c
61426new file mode 100644
61427index 0000000..528632d
61428--- /dev/null
61429+++ b/fs/reiser4/plugin/plugin_set.c
61430@@ -0,0 +1,379 @@
61431+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61432+ * reiser4/README */
61433+/* This file contains Reiser4 plugin set operations */
61434+
61435+/* plugin sets
61436+ *
61437+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
61438+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
61439+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This
61440+ * set of plugins (so called pset) is described by structure plugin_set (see
61441+ * plugin/plugin_set.h), which contains pointers to all required plugins.
61442+ *
61443+ * Children can inherit some pset members from their parent, however sometimes
61444+ * it is useful to specify members different from parent ones. Since object's
61445+ * pset can not be easily changed without fatal consequences, we use for this
61446+ * purpose another special plugin table (so called hset, or heir set) described
61447+ * by the same structure.
61448+ *
61449+ * Inode only stores a pointers to pset and hset. Different inodes with the
61450+ * same set of pset (hset) members point to the same pset (hset). This is
61451+ * archived by storing psets and hsets in global hash table. Races are avoided
61452+ * by simple (and efficient so far) solution of never recycling psets, even
61453+ * when last inode pointing to it is destroyed.
61454+ */
61455+
61456+#include "../debug.h"
61457+#include "../super.h"
61458+#include "plugin_set.h"
61459+
61460+#include <linux/slab.h>
61461+#include <linux/stddef.h>
61462+
61463+/* slab for plugin sets */
61464+static struct kmem_cache *plugin_set_slab;
61465+
61466+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
61467+ [0 ... 7] = SPIN_LOCK_UNLOCKED
61468+};
61469+
61470+/* hash table support */
61471+
61472+#define PS_TABLE_SIZE (32)
61473+
61474+static inline plugin_set *cast_to(const unsigned long *a)
61475+{
61476+ return container_of(a, plugin_set, hashval);
61477+}
61478+
61479+static inline int pseq(const unsigned long *a1, const unsigned long *a2)
61480+{
61481+ plugin_set *set1;
61482+ plugin_set *set2;
61483+
61484+ /* make sure fields are not missed in the code below */
61485+ cassert(sizeof *set1 ==
61486+ sizeof set1->hashval +
61487+ sizeof set1->link +
61488+ sizeof set1->file +
61489+ sizeof set1->dir +
61490+ sizeof set1->perm +
61491+ sizeof set1->formatting +
61492+ sizeof set1->hash +
61493+ sizeof set1->fibration +
61494+ sizeof set1->sd +
61495+ sizeof set1->dir_item +
61496+ sizeof set1->cipher +
61497+ sizeof set1->digest +
61498+ sizeof set1->compression +
61499+ sizeof set1->compression_mode +
61500+ sizeof set1->cluster +
61501+ sizeof set1->create);
61502+
61503+ set1 = cast_to(a1);
61504+ set2 = cast_to(a2);
61505+ return
61506+ set1->hashval == set2->hashval &&
61507+ set1->file == set2->file &&
61508+ set1->dir == set2->dir &&
61509+ set1->perm == set2->perm &&
61510+ set1->formatting == set2->formatting &&
61511+ set1->hash == set2->hash &&
61512+ set1->fibration == set2->fibration &&
61513+ set1->sd == set2->sd &&
61514+ set1->dir_item == set2->dir_item &&
61515+ set1->cipher == set2->cipher &&
61516+ set1->digest == set2->digest &&
61517+ set1->compression == set2->compression &&
61518+ set1->compression_mode == set2->compression_mode &&
61519+ set1->cluster == set2->cluster &&
61520+ set1->create == set2->create;
61521+}
61522+
61523+#define HASH_FIELD(hash, set, field) \
61524+({ \
61525+ (hash) += (unsigned long)(set)->field >> 2; \
61526+})
61527+
61528+static inline unsigned long calculate_hash(const plugin_set * set)
61529+{
61530+ unsigned long result;
61531+
61532+ result = 0;
61533+ HASH_FIELD(result, set, file);
61534+ HASH_FIELD(result, set, dir);
61535+ HASH_FIELD(result, set, perm);
61536+ HASH_FIELD(result, set, formatting);
61537+ HASH_FIELD(result, set, hash);
61538+ HASH_FIELD(result, set, fibration);
61539+ HASH_FIELD(result, set, sd);
61540+ HASH_FIELD(result, set, dir_item);
61541+ HASH_FIELD(result, set, cipher);
61542+ HASH_FIELD(result, set, digest);
61543+ HASH_FIELD(result, set, compression);
61544+ HASH_FIELD(result, set, compression_mode);
61545+ HASH_FIELD(result, set, cluster);
61546+ HASH_FIELD(result, set, create);
61547+ return result & (PS_TABLE_SIZE - 1);
61548+}
61549+
61550+static inline unsigned long
61551+pshash(ps_hash_table * table, const unsigned long *a)
61552+{
61553+ return *a;
61554+}
61555+
61556+/* The hash table definition */
61557+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
61558+#define KFREE(ptr, size) kfree(ptr)
61559+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
61560+ pseq);
61561+#undef KFREE
61562+#undef KMALLOC
61563+
61564+static ps_hash_table ps_table;
61565+static plugin_set empty_set = {
61566+ .hashval = 0,
61567+ .file = NULL,
61568+ .dir = NULL,
61569+ .perm = NULL,
61570+ .formatting = NULL,
61571+ .hash = NULL,
61572+ .fibration = NULL,
61573+ .sd = NULL,
61574+ .dir_item = NULL,
61575+ .cipher = NULL,
61576+ .digest = NULL,
61577+ .compression = NULL,
61578+ .compression_mode = NULL,
61579+ .cluster = NULL,
61580+ .create = NULL,
61581+ .link = {NULL}
61582+};
61583+
61584+plugin_set *plugin_set_get_empty(void)
61585+{
61586+ return &empty_set;
61587+}
61588+
61589+void plugin_set_put(plugin_set * set)
61590+{
61591+}
61592+
61593+static inline unsigned long *pset_field(plugin_set * set, int offset)
61594+{
61595+ return (unsigned long *)(((char *)set) + offset);
61596+}
61597+
61598+static int plugin_set_field(plugin_set ** set, const unsigned long val,
61599+ const int offset)
61600+{
61601+ unsigned long *spot;
61602+ spinlock_t *lock;
61603+ plugin_set replica;
61604+ plugin_set *twin;
61605+ plugin_set *psal;
61606+ plugin_set *orig;
61607+
61608+ assert("nikita-2902", set != NULL);
61609+ assert("nikita-2904", *set != NULL);
61610+
61611+ spot = pset_field(*set, offset);
61612+ if (unlikely(*spot == val))
61613+ return 0;
61614+
61615+ replica = *(orig = *set);
61616+ *pset_field(&replica, offset) = val;
61617+ replica.hashval = calculate_hash(&replica);
61618+ rcu_read_lock();
61619+ twin = ps_hash_find(&ps_table, &replica.hashval);
61620+ if (unlikely(twin == NULL)) {
61621+ rcu_read_unlock();
61622+ psal = kmem_cache_alloc(plugin_set_slab,
61623+ reiser4_ctx_gfp_mask_get());
61624+ if (psal == NULL)
61625+ return RETERR(-ENOMEM);
61626+ *psal = replica;
61627+ lock = &plugin_set_lock[replica.hashval & 7];
61628+ spin_lock(lock);
61629+ twin = ps_hash_find(&ps_table, &replica.hashval);
61630+ if (likely(twin == NULL)) {
61631+ *set = psal;
61632+ ps_hash_insert_rcu(&ps_table, psal);
61633+ } else {
61634+ *set = twin;
61635+ kmem_cache_free(plugin_set_slab, psal);
61636+ }
61637+ spin_unlock(lock);
61638+ } else {
61639+ rcu_read_unlock();
61640+ *set = twin;
61641+ }
61642+ return 0;
61643+}
61644+
61645+static struct {
61646+ int offset;
61647+ reiser4_plugin_groups groups;
61648+ reiser4_plugin_type type;
61649+} pset_descr[PSET_LAST] = {
61650+ [PSET_FILE] = {
61651+ .offset = offsetof(plugin_set, file),
61652+ .type = REISER4_FILE_PLUGIN_TYPE,
61653+ .groups = 0
61654+ },
61655+ [PSET_DIR] = {
61656+ .offset = offsetof(plugin_set, dir),
61657+ .type = REISER4_DIR_PLUGIN_TYPE,
61658+ .groups = 0
61659+ },
61660+ [PSET_PERM] = {
61661+ .offset = offsetof(plugin_set, perm),
61662+ .type = REISER4_PERM_PLUGIN_TYPE,
61663+ .groups = 0
61664+ },
61665+ [PSET_FORMATTING] = {
61666+ .offset = offsetof(plugin_set, formatting),
61667+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
61668+ .groups = 0
61669+ },
61670+ [PSET_HASH] = {
61671+ .offset = offsetof(plugin_set, hash),
61672+ .type = REISER4_HASH_PLUGIN_TYPE,
61673+ .groups = 0
61674+ },
61675+ [PSET_FIBRATION] = {
61676+ .offset = offsetof(plugin_set, fibration),
61677+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
61678+ .groups = 0
61679+ },
61680+ [PSET_SD] = {
61681+ .offset = offsetof(plugin_set, sd),
61682+ .type = REISER4_ITEM_PLUGIN_TYPE,
61683+ .groups = (1 << STAT_DATA_ITEM_TYPE)
61684+ },
61685+ [PSET_DIR_ITEM] = {
61686+ .offset = offsetof(plugin_set, dir_item),
61687+ .type = REISER4_ITEM_PLUGIN_TYPE,
61688+ .groups = (1 << DIR_ENTRY_ITEM_TYPE)
61689+ },
61690+ [PSET_CIPHER] = {
61691+ .offset = offsetof(plugin_set, cipher),
61692+ .type = REISER4_CIPHER_PLUGIN_TYPE,
61693+ .groups = 0
61694+ },
61695+ [PSET_DIGEST] = {
61696+ .offset = offsetof(plugin_set, digest),
61697+ .type = REISER4_DIGEST_PLUGIN_TYPE,
61698+ .groups = 0
61699+ },
61700+ [PSET_COMPRESSION] = {
61701+ .offset = offsetof(plugin_set, compression),
61702+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
61703+ .groups = 0
61704+ },
61705+ [PSET_COMPRESSION_MODE] = {
61706+ .offset = offsetof(plugin_set, compression_mode),
61707+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61708+ .groups = 0
61709+ },
61710+ [PSET_CLUSTER] = {
61711+ .offset = offsetof(plugin_set, cluster),
61712+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
61713+ .groups = 0
61714+ },
61715+ [PSET_CREATE] = {
61716+ .offset = offsetof(plugin_set, create),
61717+ .type = REISER4_FILE_PLUGIN_TYPE,
61718+ .groups = (1 << REISER4_REGULAR_FILE)
61719+ }
61720+};
61721+
61722+#define DEFINE_PSET_OPS(PREFIX) \
61723+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
61724+{ \
61725+ if (memb > PSET_LAST) \
61726+ return REISER4_PLUGIN_TYPES; \
61727+ return pset_descr[memb].type; \
61728+} \
61729+ \
61730+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
61731+ reiser4_plugin * plugin) \
61732+{ \
61733+ assert("nikita-3492", set != NULL); \
61734+ assert("nikita-3493", *set != NULL); \
61735+ assert("nikita-3494", plugin != NULL); \
61736+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
61737+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
61738+ \
61739+ if (pset_descr[memb].groups) \
61740+ if (!(pset_descr[memb].groups & plugin->h.groups)) \
61741+ return -EINVAL; \
61742+ \
61743+ return plugin_set_field(set, \
61744+ (unsigned long)plugin, pset_descr[memb].offset); \
61745+} \
61746+ \
61747+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
61748+{ \
61749+ assert("nikita-3497", set != NULL); \
61750+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
61751+ \
61752+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
61753+}
61754+
61755+DEFINE_PSET_OPS(aset);
61756+
61757+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
61758+ return plugin_set_field(set,
61759+ (unsigned long)plugin, pset_descr[memb].offset);
61760+}
61761+
61762+/**
61763+ * init_plugin_set - create plugin set cache and hash table
61764+ *
61765+ * Initializes slab cache of plugin_set-s and their hash table. It is part of
61766+ * reiser4 module initialization.
61767+ */
61768+int init_plugin_set(void)
61769+{
61770+ int result;
61771+
61772+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
61773+ if (result == 0) {
61774+ plugin_set_slab = kmem_cache_create("plugin_set",
61775+ sizeof(plugin_set), 0,
61776+ SLAB_HWCACHE_ALIGN,
61777+ NULL, NULL);
61778+ if (plugin_set_slab == NULL)
61779+ result = RETERR(-ENOMEM);
61780+ }
61781+ return result;
61782+}
61783+
61784+/**
61785+ * done_plugin_set - delete plugin_set cache and plugin_set hash table
61786+ *
61787+ * This is called on reiser4 module unloading or system shutdown.
61788+ */
61789+void done_plugin_set(void)
61790+{
61791+ plugin_set *cur, *next;
61792+
61793+ for_all_in_htable(&ps_table, ps, cur, next) {
61794+ ps_hash_remove(&ps_table, cur);
61795+ kmem_cache_free(plugin_set_slab, cur);
61796+ }
61797+ destroy_reiser4_cache(&plugin_set_slab);
61798+ ps_hash_done(&ps_table);
61799+}
61800+
61801+/*
61802+ * Local variables:
61803+ * c-indentation-style: "K&R"
61804+ * mode-name: "LC"
61805+ * c-basic-offset: 8
61806+ * tab-width: 8
61807+ * fill-column: 120
61808+ * End:
61809+ */
61810diff --git a/fs/reiser4/plugin/plugin_set.h b/fs/reiser4/plugin/plugin_set.h
61811new file mode 100644
61812index 0000000..8edcaea
61813--- /dev/null
61814+++ b/fs/reiser4/plugin/plugin_set.h
61815@@ -0,0 +1,77 @@
61816+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61817+
61818+/* Reiser4 plugin set definition.
61819+ See fs/reiser4/plugin/plugin_set.c for details */
61820+
61821+#if !defined( __PLUGIN_SET_H__ )
61822+#define __PLUGIN_SET_H__
61823+
61824+#include "../type_safe_hash.h"
61825+#include "plugin.h"
61826+
61827+#include <linux/rcupdate.h>
61828+
61829+struct plugin_set;
61830+typedef struct plugin_set plugin_set;
61831+
61832+TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61833+
61834+struct plugin_set {
61835+ unsigned long hashval;
61836+ /* plugin of file */
61837+ file_plugin *file;
61838+ /* plugin of dir */
61839+ dir_plugin *dir;
61840+ /* perm plugin for this file */
61841+ perm_plugin *perm;
61842+ /* tail policy plugin. Only meaningful for regular files */
61843+ formatting_plugin *formatting;
61844+ /* hash plugin. Only meaningful for directories. */
61845+ hash_plugin *hash;
61846+ /* fibration plugin. Only meaningful for directories. */
61847+ fibration_plugin *fibration;
61848+ /* plugin of stat-data */
61849+ item_plugin *sd;
61850+ /* plugin of items a directory is built of */
61851+ item_plugin *dir_item;
61852+ /* cipher plugin */
61853+ cipher_plugin *cipher;
61854+ /* digest plugin */
61855+ digest_plugin *digest;
61856+ /* compression plugin */
61857+ compression_plugin *compression;
61858+ /* compression mode plugin */
61859+ compression_mode_plugin *compression_mode;
61860+ /* cluster plugin */
61861+ cluster_plugin *cluster;
61862+ /* this specifies file plugin of regular children.
61863+ only meaningful for directories */
61864+ file_plugin *create;
61865+ ps_hash_link link;
61866+};
61867+
61868+extern plugin_set *plugin_set_get_empty(void);
61869+extern void plugin_set_put(plugin_set * set);
61870+
61871+extern int init_plugin_set(void);
61872+extern void done_plugin_set(void);
61873+
61874+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
61875+extern int set_plugin(plugin_set ** set, pset_member memb,
61876+ reiser4_plugin * plugin);
61877+extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
61878+ reiser4_plugin * plugin);
61879+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
61880+
61881+/* __PLUGIN_SET_H__ */
61882+#endif
61883+
61884+/* Make Linus happy.
61885+ Local variables:
61886+ c-indentation-style: "K&R"
61887+ mode-name: "LC"
61888+ c-basic-offset: 8
61889+ tab-width: 8
61890+ fill-column: 120
61891+ End:
61892+*/
61893diff --git a/fs/reiser4/plugin/security/Makefile b/fs/reiser4/plugin/security/Makefile
61894new file mode 100644
61895index 0000000..645dbb5
61896--- /dev/null
61897+++ b/fs/reiser4/plugin/security/Makefile
61898@@ -0,0 +1,4 @@
61899+obj-$(CONFIG_REISER4_FS) += security_plugins.o
61900+
61901+security_plugins-objs := \
61902+ perm.o
61903diff --git a/fs/reiser4/plugin/security/perm.c b/fs/reiser4/plugin/security/perm.c
61904new file mode 100644
61905index 0000000..ab3b4fc
61906--- /dev/null
61907+++ b/fs/reiser4/plugin/security/perm.c
61908@@ -0,0 +1,44 @@
61909+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61910+
61911+/*
61912+ * this file contains implementation of permission plugins. Currently, only
61913+ * RWX_PERM_ID is implemented
61914+ */
61915+
61916+#include "../plugin.h"
61917+#include "../plugin_header.h"
61918+#include "../../debug.h"
61919+
61920+perm_plugin perm_plugins[LAST_PERM_ID] = {
61921+ [NULL_PERM_ID] = {
61922+ .h = {
61923+ .type_id = REISER4_PERM_PLUGIN_TYPE,
61924+ .id = NULL_PERM_ID,
61925+ .pops = NULL,
61926+ .label = "null",
61927+ .desc = "stub permission plugin",
61928+ .linkage = {NULL, NULL}
61929+ },
61930+ .read_ok = NULL,
61931+ .write_ok = NULL,
61932+ .lookup_ok = NULL,
61933+ .create_ok = NULL,
61934+ .link_ok = NULL,
61935+ .unlink_ok = NULL,
61936+ .delete_ok = NULL,
61937+ .mask_ok = NULL,
61938+ .setattr_ok = NULL,
61939+ .getattr_ok = NULL,
61940+ .rename_ok = NULL,
61941+ }
61942+};
61943+
61944+/*
61945+ * Local variables:
61946+ * c-indentation-style: "K&R"
61947+ * mode-name: "LC"
61948+ * c-basic-offset: 8
61949+ * tab-width: 8
61950+ * fill-column: 79
61951+ * End:
61952+ */
61953diff --git a/fs/reiser4/plugin/security/perm.h b/fs/reiser4/plugin/security/perm.h
61954new file mode 100644
61955index 0000000..747e8f7
61956--- /dev/null
61957+++ b/fs/reiser4/plugin/security/perm.h
61958@@ -0,0 +1,82 @@
61959+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61960+
61961+/* Perm (short for "permissions") plugins common stuff. */
61962+
61963+#if !defined( __REISER4_PERM_H__ )
61964+#define __REISER4_PERM_H__
61965+
61966+#include "../../forward.h"
61967+#include "../plugin_header.h"
61968+
61969+#include <linux/types.h>
61970+#include <linux/fs.h> /* for struct file */
61971+#include <linux/dcache.h> /* for struct dentry */
61972+
61973+/* interface for perm plugin.
61974+
61975+ Perm plugin method can be implemented through:
61976+
61977+ 1. consulting ->i_mode bits in stat data
61978+
61979+ 2. obtaining acl from the tree and inspecting it
61980+
61981+ 3. asking some kernel module or user-level program to authorize access.
61982+
61983+ This allows for integration with things like capabilities, SELinux-style
61984+ secutiry contexts, etc.
61985+
61986+*/
61987+/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. */
61988+typedef struct perm_plugin {
61989+ /* generic plugin fields */
61990+ plugin_header h;
61991+
61992+ /* check permissions for read/write */
61993+ int (*read_ok) (struct file *file, const char __user *buf,
61994+ size_t size, loff_t *off);
61995+ int (*write_ok) (struct file *file, const char __user *buf,
61996+ size_t size, loff_t *off);
61997+
61998+ /* check permissions for lookup */
61999+ int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
62000+
62001+ /* check permissions for create */
62002+ int (*create_ok) (struct inode * parent, struct dentry * dentry,
62003+ reiser4_object_create_data * data);
62004+
62005+ /* check permissions for linking @where to @existing */
62006+ int (*link_ok) (struct dentry * existing, struct inode * parent,
62007+ struct dentry * where);
62008+
62009+ /* check permissions for unlinking @victim from @parent */
62010+ int (*unlink_ok) (struct inode * parent, struct dentry * victim);
62011+
62012+ /* check permissions for deletion of @object whose last reference is
62013+ by @parent */
62014+ int (*delete_ok) (struct inode * parent, struct dentry * victim);
62015+ int (*mask_ok) (struct inode * inode, int mask);
62016+ /* check whether attribute change is acceptable */
62017+ int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
62018+
62019+ /* check whether stat(2) is allowed */
62020+ int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG,
62021+ struct dentry * dentry, struct kstat * stat);
62022+ /* check whether rename(2) is allowed */
62023+ int (*rename_ok) (struct inode * old_dir, struct dentry * old,
62024+ struct inode * new_dir, struct dentry * new);
62025+} perm_plugin;
62026+
62027+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
62028+
62029+/* __REISER4_PERM_H__ */
62030+#endif
62031+
62032+/* Make Linus happy.
62033+ Local variables:
62034+ c-indentation-style: "K&R"
62035+ mode-name: "LC"
62036+ c-basic-offset: 8
62037+ tab-width: 8
62038+ fill-column: 120
62039+ End:
62040+*/
62041diff --git a/fs/reiser4/plugin/space/Makefile b/fs/reiser4/plugin/space/Makefile
62042new file mode 100644
62043index 0000000..5a0c94f
62044--- /dev/null
62045+++ b/fs/reiser4/plugin/space/Makefile
62046@@ -0,0 +1,4 @@
62047+obj-$(CONFIG_REISER4_FS) += space_plugins.o
62048+
62049+space_plugins-objs := \
62050+ bitmap.o
62051diff --git a/fs/reiser4/plugin/space/bitmap.c b/fs/reiser4/plugin/space/bitmap.c
62052new file mode 100644
62053index 0000000..a0ff17a
62054--- /dev/null
62055+++ b/fs/reiser4/plugin/space/bitmap.c
62056@@ -0,0 +1,1585 @@
62057+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62058+
62059+#include "../../debug.h"
62060+#include "../../dformat.h"
62061+#include "../../txnmgr.h"
62062+#include "../../jnode.h"
62063+#include "../../block_alloc.h"
62064+#include "../../tree.h"
62065+#include "../../super.h"
62066+#include "../plugin.h"
62067+#include "space_allocator.h"
62068+#include "bitmap.h"
62069+
62070+#include <linux/types.h>
62071+#include <linux/fs.h> /* for struct super_block */
62072+#include <linux/mutex.h>
62073+#include <asm/div64.h>
62074+
62075+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
62076+ * blocks
62077+
62078+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap
62079+ blocks loading/unloading which is different from v3.x where all bitmap
62080+ blocks are loaded at mount time.
62081+
62082+ To implement bitmap blocks unloading we need to count bitmap block usage
62083+ and detect currently unused blocks allowing them to be unloaded. It is not
62084+ a simple task since we allow several threads to modify one bitmap block
62085+ simultaneously.
62086+
62087+ Briefly speaking, the following schema is proposed: we count in special
62088+ variable associated with each bitmap block. That is for counting of block
62089+ alloc/dealloc operations on that bitmap block. With a deferred block
62090+ deallocation feature of reiser4 all those operation will be represented in
62091+ atom dirty/deleted lists as jnodes for freshly allocated or deleted
62092+ nodes.
62093+
62094+ So, we increment usage counter for each new node allocated or deleted, and
62095+ decrement it at atom commit one time for each node from the dirty/deleted
62096+ atom's list. Of course, freshly allocated node deletion and node reusing
62097+ from atom deleted (if we do so) list should decrement bitmap usage counter
62098+ also.
62099+
62100+ This schema seems to be working but that reference counting is
62101+ not easy to debug. I think we should agree with Hans and do not implement
62102+ it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
62103+
62104+ For simplicity all bitmap nodes (both commit and working bitmap blocks) are
62105+ loaded into memory on fs mount time or each bitmap nodes are loaded at the
62106+ first access to it, the "dont_load_bitmap" mount option controls whether
62107+ bimtap nodes should be loaded at mount time. Dynamic unloading of bitmap
62108+ nodes currently is not supported. */
62109+
62110+#define CHECKSUM_SIZE 4
62111+
62112+#define BYTES_PER_LONG (sizeof(long))
62113+
62114+#if BITS_PER_LONG == 64
62115+# define LONG_INT_SHIFT (6)
62116+#else
62117+# define LONG_INT_SHIFT (5)
62118+#endif
62119+
62120+#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
62121+
62122+typedef unsigned long ulong_t;
62123+
62124+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
62125+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
62126+
62127+/* Block allocation/deallocation are done through special bitmap objects which
62128+ are allocated in an array at fs mount. */
62129+struct bitmap_node {
62130+ struct mutex mutex; /* long term lock object */
62131+
62132+ jnode *wjnode; /* j-nodes for WORKING ... */
62133+ jnode *cjnode; /* ... and COMMIT bitmap blocks */
62134+
62135+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */
62136+
62137+ atomic_t loaded; /* a flag which shows that bnode is loaded
62138+ * already */
62139+};
62140+
62141+static inline char *bnode_working_data(struct bitmap_node *bnode)
62142+{
62143+ char *data;
62144+
62145+ data = jdata(bnode->wjnode);
62146+ assert("zam-429", data != NULL);
62147+
62148+ return data + CHECKSUM_SIZE;
62149+}
62150+
62151+static inline char *bnode_commit_data(const struct bitmap_node *bnode)
62152+{
62153+ char *data;
62154+
62155+ data = jdata(bnode->cjnode);
62156+ assert("zam-430", data != NULL);
62157+
62158+ return data + CHECKSUM_SIZE;
62159+}
62160+
62161+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
62162+{
62163+ char *data;
62164+
62165+ data = jdata(bnode->cjnode);
62166+ assert("vpf-261", data != NULL);
62167+
62168+ return le32_to_cpu(get_unaligned((d32 *)data));
62169+}
62170+
62171+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
62172+{
62173+ char *data;
62174+
62175+ data = jdata(bnode->cjnode);
62176+ assert("vpf-261", data != NULL);
62177+
62178+ put_unaligned(cpu_to_le32(crc), (d32 *)data);
62179+}
62180+
62181+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
62182+ * written the code, does this added abstraction still have */
62183+/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
62184+ * reiser4_space_allocator structure) */
62185+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
62186+/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
62187+ * someday?". What they about? If there is a reason to have a union, it should
62188+ * be a union, if not, it should not be a union. "..might be someday" means no
62189+ * reason. */
62190+struct bitmap_allocator_data {
62191+ /* an array for bitmap blocks direct access */
62192+ struct bitmap_node *bitmap;
62193+};
62194+
62195+#define get_barray(super) \
62196+(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
62197+
62198+#define get_bnode(super, i) (get_barray(super) + i)
62199+
62200+/* allocate and initialize jnode with JNODE_BITMAP type */
62201+static jnode *bnew(void)
62202+{
62203+ jnode *jal = jalloc();
62204+
62205+ if (jal)
62206+ jnode_init(jal, current_tree, JNODE_BITMAP);
62207+
62208+ return jal;
62209+}
62210+
62211+/* this file contains:
62212+ - bitmap based implementation of space allocation plugin
62213+ - all the helper functions like set bit, find_first_zero_bit, etc */
62214+
62215+/* Audited by: green(2002.06.12) */
62216+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
62217+{
62218+ ulong_t mask = 1UL << start_bit;
62219+ int i = start_bit;
62220+
62221+ while ((word & mask) != 0) {
62222+ mask <<= 1;
62223+ if (++i >= BITS_PER_LONG)
62224+ break;
62225+ }
62226+
62227+ return i;
62228+}
62229+
62230+#include <asm/bitops.h>
62231+
62232+#if BITS_PER_LONG == 64
62233+
62234+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
62235+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
62236+
62237+static inline void reiser4_set_bit(int nr, void *addr)
62238+{
62239+ ext2_set_bit(nr + OFF(addr), BASE(addr));
62240+}
62241+
62242+static inline void reiser4_clear_bit(int nr, void *addr)
62243+{
62244+ ext2_clear_bit(nr + OFF(addr), BASE(addr));
62245+}
62246+
62247+static inline int reiser4_test_bit(int nr, void *addr)
62248+{
62249+ return ext2_test_bit(nr + OFF(addr), BASE(addr));
62250+}
62251+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
62252+ int offset)
62253+{
62254+ int off = OFF(addr);
62255+
62256+ return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
62257+ offset + off) - off;
62258+}
62259+
62260+#else
62261+
62262+#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
62263+#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
62264+#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
62265+
62266+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
62267+ext2_find_next_zero_bit(addr, maxoffset, offset)
62268+#endif
62269+
62270+/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
62271+ * are counted from @addr, return the offset of the first bit if it is found,
62272+ * @maxoffset otherwise. */
62273+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
62274+ bmap_off_t start_offset)
62275+{
62276+ ulong_t *base = addr;
62277+ /* start_offset is in bits, convert it to byte offset within bitmap. */
62278+ int word_nr = start_offset >> LONG_INT_SHIFT;
62279+ /* bit number within the byte. */
62280+ int bit_nr = start_offset & LONG_INT_MASK;
62281+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
62282+
62283+ assert("zam-387", max_offset != 0);
62284+
62285+ /* Unaligned @start_offset case. */
62286+ if (bit_nr != 0) {
62287+ bmap_nr_t nr;
62288+
62289+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
62290+
62291+ if (nr < BITS_PER_LONG)
62292+ return (word_nr << LONG_INT_SHIFT) + nr;
62293+
62294+ ++word_nr;
62295+ }
62296+
62297+ /* Fast scan trough aligned words. */
62298+ while (word_nr <= max_word_nr) {
62299+ if (base[word_nr] != 0) {
62300+ return (word_nr << LONG_INT_SHIFT)
62301+ + find_next_zero_bit_in_word(~(base[word_nr]), 0);
62302+ }
62303+
62304+ ++word_nr;
62305+ }
62306+
62307+ return max_offset;
62308+}
62309+
62310+#if BITS_PER_LONG == 64
62311+
62312+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
62313+ bmap_off_t start_offset)
62314+{
62315+ bmap_off_t off = OFF(addr);
62316+
62317+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
62318+ start_offset + off) - off;
62319+}
62320+
62321+#else
62322+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
62323+ __reiser4_find_next_set_bit(addr, max_offset, start_offset)
62324+#endif
62325+
62326+/* search for the first set bit in single word. */
62327+static int find_last_set_bit_in_word(ulong_t word, int start_bit)
62328+{
62329+ ulong_t bit_mask;
62330+ int nr = start_bit;
62331+
62332+ assert("zam-965", start_bit < BITS_PER_LONG);
62333+ assert("zam-966", start_bit >= 0);
62334+
62335+ bit_mask = (1UL << nr);
62336+
62337+ while (bit_mask != 0) {
62338+ if (bit_mask & word)
62339+ return nr;
62340+ bit_mask >>= 1;
62341+ nr--;
62342+ }
62343+ return BITS_PER_LONG;
62344+}
62345+
62346+/* Search bitmap for a set bit in backward direction from the end to the
62347+ * beginning of given region
62348+ *
62349+ * @result: result offset of the last set bit
62350+ * @addr: base memory address,
62351+ * @low_off: low end of the search region, edge bit included into the region,
62352+ * @high_off: high end of the search region, edge bit included into the region,
62353+ *
62354+ * @return: 0 - set bit was found, -1 otherwise.
62355+ */
62356+static int
62357+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
62358+ bmap_off_t high_off)
62359+{
62360+ ulong_t *base = addr;
62361+ int last_word;
62362+ int first_word;
62363+ int last_bit;
62364+ int nr;
62365+
62366+ assert("zam-962", high_off >= low_off);
62367+
62368+ last_word = high_off >> LONG_INT_SHIFT;
62369+ last_bit = high_off & LONG_INT_MASK;
62370+ first_word = low_off >> LONG_INT_SHIFT;
62371+
62372+ if (last_bit < BITS_PER_LONG) {
62373+ nr = find_last_set_bit_in_word(base[last_word], last_bit);
62374+ if (nr < BITS_PER_LONG) {
62375+ *result = (last_word << LONG_INT_SHIFT) + nr;
62376+ return 0;
62377+ }
62378+ --last_word;
62379+ }
62380+ while (last_word >= first_word) {
62381+ if (base[last_word] != 0x0) {
62382+ last_bit =
62383+ find_last_set_bit_in_word(base[last_word],
62384+ BITS_PER_LONG - 1);
62385+ assert("zam-972", last_bit < BITS_PER_LONG);
62386+ *result = (last_word << LONG_INT_SHIFT) + last_bit;
62387+ return 0;
62388+ }
62389+ --last_word;
62390+ }
62391+
62392+ return -1; /* set bit not found */
62393+}
62394+
62395+/* Search bitmap for a clear bit in backward direction from the end to the
62396+ * beginning of given region */
62397+static int
62398+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
62399+ bmap_off_t high_off)
62400+{
62401+ ulong_t *base = addr;
62402+ int last_word;
62403+ int first_word;
62404+ int last_bit;
62405+ int nr;
62406+
62407+ last_word = high_off >> LONG_INT_SHIFT;
62408+ last_bit = high_off & LONG_INT_MASK;
62409+ first_word = low_off >> LONG_INT_SHIFT;
62410+
62411+ if (last_bit < BITS_PER_LONG) {
62412+ nr = find_last_set_bit_in_word(~base[last_word], last_bit);
62413+ if (nr < BITS_PER_LONG) {
62414+ *result = (last_word << LONG_INT_SHIFT) + nr;
62415+ return 0;
62416+ }
62417+ --last_word;
62418+ }
62419+ while (last_word >= first_word) {
62420+ if (base[last_word] != (ulong_t) (-1)) {
62421+ *result = (last_word << LONG_INT_SHIFT) +
62422+ find_last_set_bit_in_word(~base[last_word],
62423+ BITS_PER_LONG - 1);
62424+ return 0;
62425+ }
62426+ --last_word;
62427+ }
62428+
62429+ return -1; /* zero bit not found */
62430+}
62431+
62432+/* Audited by: green(2002.06.12) */
62433+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
62434+{
62435+ int first_byte;
62436+ int last_byte;
62437+
62438+ unsigned char first_byte_mask = 0xFF;
62439+ unsigned char last_byte_mask = 0xFF;
62440+
62441+ assert("zam-410", start < end);
62442+
62443+ first_byte = start >> 3;
62444+ last_byte = (end - 1) >> 3;
62445+
62446+ if (last_byte > first_byte + 1)
62447+ memset(addr + first_byte + 1, 0,
62448+ (size_t) (last_byte - first_byte - 1));
62449+
62450+ first_byte_mask >>= 8 - (start & 0x7);
62451+ last_byte_mask <<= ((end - 1) & 0x7) + 1;
62452+
62453+ if (first_byte == last_byte) {
62454+ addr[first_byte] &= (first_byte_mask | last_byte_mask);
62455+ } else {
62456+ addr[first_byte] &= first_byte_mask;
62457+ addr[last_byte] &= last_byte_mask;
62458+ }
62459+}
62460+
62461+/* Audited by: green(2002.06.12) */
62462+/* ZAM-FIXME-HANS: comment this */
62463+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
62464+{
62465+ int first_byte;
62466+ int last_byte;
62467+
62468+ unsigned char first_byte_mask = 0xFF;
62469+ unsigned char last_byte_mask = 0xFF;
62470+
62471+ assert("zam-386", start < end);
62472+
62473+ first_byte = start >> 3;
62474+ last_byte = (end - 1) >> 3;
62475+
62476+ if (last_byte > first_byte + 1)
62477+ memset(addr + first_byte + 1, 0xFF,
62478+ (size_t) (last_byte - first_byte - 1));
62479+
62480+ first_byte_mask <<= start & 0x7;
62481+ last_byte_mask >>= 7 - ((end - 1) & 0x7);
62482+
62483+ if (first_byte == last_byte) {
62484+ addr[first_byte] |= (first_byte_mask & last_byte_mask);
62485+ } else {
62486+ addr[first_byte] |= first_byte_mask;
62487+ addr[last_byte] |= last_byte_mask;
62488+ }
62489+}
62490+
62491+#define ADLER_BASE 65521
62492+#define ADLER_NMAX 5552
62493+
62494+/* Calculates the adler32 checksum for the data pointed by `data` of the
62495+ length `len`. This function was originally taken from zlib, version 1.1.3,
62496+ July 9th, 1998.
62497+
62498+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
62499+
62500+ This software is provided 'as-is', without any express or implied
62501+ warranty. In no event will the authors be held liable for any damages
62502+ arising from the use of this software.
62503+
62504+ Permission is granted to anyone to use this software for any purpose,
62505+ including commercial applications, and to alter it and redistribute it
62506+ freely, subject to the following restrictions:
62507+
62508+ 1. The origin of this software must not be misrepresented; you must not
62509+ claim that you wrote the original software. If you use this software
62510+ in a product, an acknowledgment in the product documentation would be
62511+ appreciated but is not required.
62512+ 2. Altered source versions must be plainly marked as such, and must not be
62513+ misrepresented as being the original software.
62514+ 3. This notice may not be removed or altered from any source distribution.
62515+
62516+ Jean-loup Gailly Mark Adler
62517+ jloup@gzip.org madler@alumni.caltech.edu
62518+
62519+ The above comment applies only to the reiser4_adler32 function.
62520+*/
62521+
62522+__u32 reiser4_adler32(char *data, __u32 len)
62523+{
62524+ unsigned char *t = data;
62525+ __u32 s1 = 1;
62526+ __u32 s2 = 0;
62527+ int k;
62528+
62529+ while (len > 0) {
62530+ k = len < ADLER_NMAX ? len : ADLER_NMAX;
62531+ len -= k;
62532+
62533+ while (k--) {
62534+ s1 += *t++;
62535+ s2 += s1;
62536+ }
62537+
62538+ s1 %= ADLER_BASE;
62539+ s2 %= ADLER_BASE;
62540+ }
62541+ return (s2 << 16) | s1;
62542+}
62543+
62544+#define sb_by_bnode(bnode) \
62545+ ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
62546+
62547+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
62548+{
62549+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
62550+}
62551+
62552+static int
62553+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
62554+{
62555+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
62556+ bmap_nr_t bmap;
62557+
62558+ bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
62559+
62560+ warning("vpf-263",
62561+ "Checksum for the bitmap block %llu is incorrect",
62562+ bmap);
62563+
62564+ return RETERR(-EIO);
62565+ }
62566+
62567+ return 0;
62568+}
62569+
62570+#define REISER4_CHECK_BMAP_CRC (0)
62571+
62572+#if REISER4_CHECK_BMAP_CRC
62573+static int bnode_check_crc(const struct bitmap_node *bnode)
62574+{
62575+ return bnode_check_adler32(bnode,
62576+ bmap_size(sb_by_bnode(bnode)->s_blocksize));
62577+}
62578+
62579+/* REISER4_CHECK_BMAP_CRC */
62580+#else
62581+
62582+#define bnode_check_crc(bnode) (0)
62583+
62584+/* REISER4_CHECK_BMAP_CRC */
62585+#endif
62586+
62587+/* Recalculates the adler32 checksum for only 1 byte change.
62588+ adler - previous adler checksum
62589+ old_data, data - old, new byte values.
62590+ tail == (chunk - offset) : length, checksum was calculated for, - offset of
62591+ the changed byte within this chunk.
62592+ This function can be used for checksum calculation optimisation.
62593+*/
62594+
62595+static __u32
62596+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
62597+ __u32 tail)
62598+{
62599+ __u32 delta = data - old_data + 2 * ADLER_BASE;
62600+ __u32 s1 = adler & 0xffff;
62601+ __u32 s2 = (adler >> 16) & 0xffff;
62602+
62603+ s1 = (delta + s1) % ADLER_BASE;
62604+ s2 = (delta * tail + s2) % ADLER_BASE;
62605+
62606+ return (s2 << 16) | s1;
62607+}
62608+
62609+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
62610+
62611+/**
62612+ * get_nr_bitmap - calculate number of bitmap blocks
62613+ * @super: super block with initialized blocksize and block count
62614+ *
62615+ * Calculates number of bitmap blocks of a filesystem which uses bitmaps to
62616+ * maintain free disk space. It assumes that each bitmap addresses the same
62617+ * number of blocks which is calculated by bmap_block_count macro defined in
62618+ * above. Number of blocks in the filesystem has to be initialized in reiser4
62619+ * private data of super block already so that it can be obtained via
62620+ * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap
62621+ * is not power of 2 because 4 bytes are used for checksum. Therefore, we have
62622+ * to use special function to divide and modulo 64bits filesystem block
62623+ * counters.
62624+ *
62625+ * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap
62626+ * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
62627+ * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
62628+ */
62629+static bmap_nr_t get_nr_bmap(const struct super_block *super)
62630+{
62631+ u64 quotient;
62632+
62633+ assert("zam-393", reiser4_block_count(super) != 0);
62634+
62635+ quotient = reiser4_block_count(super) - 1;
62636+ do_div(quotient, bmap_bit_count(super->s_blocksize));
62637+ return quotient + 1;
62638+}
62639+
62640+/**
62641+ * parse_blocknr - calculate bitmap number and offset in it by block number
62642+ * @block: pointer to block number to calculate location in bitmap of
62643+ * @bmap: pointer where to store bitmap block number
62644+ * @offset: pointer where to store offset within bitmap block
62645+ *
62646+ * Calculates location of bit which is responsible for allocation/freeing of
62647+ * block @*block. That location is represented by bitmap block number and offset
62648+ * within that bitmap block.
62649+ */
62650+static void
62651+parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
62652+ bmap_off_t *offset)
62653+{
62654+ struct super_block *super = get_current_context()->super;
62655+ u64 quotient = *block;
62656+
62657+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
62658+ *bmap = quotient;
62659+
62660+ assert("zam-433", *bmap < get_nr_bmap(super));
62661+ assert("", *offset < bmap_bit_count(super->s_blocksize));
62662+}
62663+
62664+#if REISER4_DEBUG
62665+/* Audited by: green(2002.06.12) */
62666+static void
62667+check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
62668+{
62669+ struct super_block *sb = reiser4_get_current_sb();
62670+
62671+ assert("zam-436", sb != NULL);
62672+
62673+ assert("zam-455", start != NULL);
62674+ assert("zam-437", *start != 0);
62675+ assert("zam-541", !reiser4_blocknr_is_fake(start));
62676+ assert("zam-441", *start < reiser4_block_count(sb));
62677+
62678+ if (len != NULL) {
62679+ assert("zam-438", *len != 0);
62680+ assert("zam-442", *start + *len <= reiser4_block_count(sb));
62681+ }
62682+}
62683+
62684+static void check_bnode_loaded(const struct bitmap_node *bnode)
62685+{
62686+ assert("zam-485", bnode != NULL);
62687+ assert("zam-483", jnode_page(bnode->wjnode) != NULL);
62688+ assert("zam-484", jnode_page(bnode->cjnode) != NULL);
62689+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
62690+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
62691+}
62692+
62693+#else
62694+
62695+# define check_block_range(start, len) do { /* nothing */} while(0)
62696+# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
62697+
62698+#endif
62699+
62700+/* modify bnode->first_zero_bit (if we free bits before); bnode should be
62701+ spin-locked */
62702+static inline void
62703+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
62704+{
62705+ if (offset < bnode->first_zero_bit)
62706+ bnode->first_zero_bit = offset;
62707+}
62708+
62709+/* return a physical disk address for logical bitmap number @bmap */
62710+/* FIXME-VS: this is somehow related to disk layout? */
62711+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
62712+ * per block allocation so that performance is not affected. Probably this
62713+ * whole file should be considered part of the disk layout plugin, and other
62714+ * disk layouts can use other defines and efficiency will not be significantly
62715+ * affected. */
62716+
62717+#define REISER4_FIRST_BITMAP_BLOCK \
62718+ ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
62719+
62720+/* Audited by: green(2002.06.12) */
62721+static void
62722+get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
62723+ reiser4_block_nr * bnr)
62724+{
62725+
62726+ assert("zam-390", bmap < get_nr_bmap(super));
62727+
62728+#ifdef CONFIG_REISER4_BADBLOCKS
62729+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
62730+ /* Check if the diskmap have this already, first. */
62731+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
62732+ return; /* Found it in diskmap */
62733+#endif
62734+ /* FIXME_ZAM: before discussing of disk layouts and disk format
62735+ plugins I implement bitmap location scheme which is close to scheme
62736+ used in reiser 3.6 */
62737+ if (bmap == 0) {
62738+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
62739+ } else {
62740+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
62741+ }
62742+}
62743+
62744+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
62745+/* Audited by: green(2002.06.12) */
62746+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
62747+{
62748+ *bnr =
62749+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
62750+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
62751+}
62752+
62753+/* bnode structure initialization */
62754+static void
62755+init_bnode(struct bitmap_node *bnode,
62756+ struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
62757+{
62758+ memset(bnode, 0, sizeof(struct bitmap_node));
62759+
62760+ mutex_init(&bnode->mutex);
62761+ atomic_set(&bnode->loaded, 0);
62762+}
62763+
62764+static void release(jnode * node)
62765+{
62766+ jrelse(node);
62767+ JF_SET(node, JNODE_HEARD_BANSHEE);
62768+ jput(node);
62769+}
62770+
62771+/* This function is for internal bitmap.c use because it assumes that jnode is
62772+ in under full control of this thread */
62773+static void done_bnode(struct bitmap_node *bnode)
62774+{
62775+ if (bnode) {
62776+ atomic_set(&bnode->loaded, 0);
62777+ if (bnode->wjnode != NULL)
62778+ release(bnode->wjnode);
62779+ if (bnode->cjnode != NULL)
62780+ release(bnode->cjnode);
62781+ bnode->wjnode = bnode->cjnode = NULL;
62782+ }
62783+}
62784+
62785+/* ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()*/
62786+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
62787+ jnode **wjnode_ret)
62788+{
62789+ struct super_block *super;
62790+ jnode *cjnode;
62791+ jnode *wjnode;
62792+ bmap_nr_t bmap;
62793+ int ret;
62794+
62795+ super = reiser4_get_current_sb();
62796+
62797+ *wjnode_ret = wjnode = bnew();
62798+ if (wjnode == NULL) {
62799+ *cjnode_ret = NULL;
62800+ return RETERR(-ENOMEM);
62801+ }
62802+
62803+ *cjnode_ret = cjnode = bnew();
62804+ if (cjnode == NULL)
62805+ return RETERR(-ENOMEM);
62806+
62807+ bmap = bnode - get_bnode(super, 0);
62808+
62809+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
62810+ get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
62811+
62812+ jref(cjnode);
62813+ jref(wjnode);
62814+
62815+ /* load commit bitmap */
62816+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
62817+
62818+ if (ret)
62819+ goto error;
62820+
62821+ /* allocate memory for working bitmap block. Note that for
62822+ * bitmaps jinit_new() doesn't actually modifies node content,
62823+ * so parallel calls to this are ok. */
62824+ ret = jinit_new(wjnode, GFP_NOFS);
62825+
62826+ if (ret != 0) {
62827+ jrelse(cjnode);
62828+ goto error;
62829+ }
62830+
62831+ return 0;
62832+
62833+ error:
62834+ jput(cjnode);
62835+ jput(wjnode);
62836+ *wjnode_ret = *cjnode_ret = NULL;
62837+ return ret;
62838+
62839+}
62840+
62841+/* Check the bnode data on read. */
62842+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
62843+{
62844+ void *data;
62845+ int ret;
62846+
62847+ /* Check CRC */
62848+ ret = bnode_check_adler32(bnode, blksize);
62849+
62850+ if (ret) {
62851+ return ret;
62852+ }
62853+
62854+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
62855+
62856+ /* Check the very first bit -- it must be busy. */
62857+ if (!reiser4_test_bit(0, data)) {
62858+ warning("vpf-1362", "The allocator block %llu is not marked "
62859+ "as used.", (unsigned long long)bnode->cjnode->blocknr);
62860+
62861+ return -EINVAL;
62862+ }
62863+
62864+ return 0;
62865+}
62866+
62867+/* load bitmap blocks "on-demand" */
62868+static int load_and_lock_bnode(struct bitmap_node *bnode)
62869+{
62870+ int ret;
62871+
62872+ jnode *cjnode;
62873+ jnode *wjnode;
62874+
62875+ assert("nikita-3040", reiser4_schedulable());
62876+
62877+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62878+ * need to be atomic, right? Just leave a comment that if bitmaps were
62879+ * unloadable, this would need to be atomic. */
62880+ if (atomic_read(&bnode->loaded)) {
62881+ /* bitmap is already loaded, nothing to do */
62882+ check_bnode_loaded(bnode);
62883+ mutex_lock(&bnode->mutex);
62884+ assert("nikita-2827", atomic_read(&bnode->loaded));
62885+ return 0;
62886+ }
62887+
62888+ ret = prepare_bnode(bnode, &cjnode, &wjnode);
62889+ if (ret == 0) {
62890+ mutex_lock(&bnode->mutex);
62891+
62892+ if (!atomic_read(&bnode->loaded)) {
62893+ assert("nikita-2822", cjnode != NULL);
62894+ assert("nikita-2823", wjnode != NULL);
62895+ assert("nikita-2824", jnode_is_loaded(cjnode));
62896+ assert("nikita-2825", jnode_is_loaded(wjnode));
62897+
62898+ bnode->wjnode = wjnode;
62899+ bnode->cjnode = cjnode;
62900+
62901+ ret = check_struct_bnode(bnode, current_blocksize);
62902+ if (!ret) {
62903+ cjnode = wjnode = NULL;
62904+ atomic_set(&bnode->loaded, 1);
62905+ /* working bitmap is initialized by on-disk
62906+ * commit bitmap. This should be performed
62907+ * under mutex. */
62908+ memcpy(bnode_working_data(bnode),
62909+ bnode_commit_data(bnode),
62910+ bmap_size(current_blocksize));
62911+ } else
62912+ mutex_unlock(&bnode->mutex);
62913+ } else
62914+ /* race: someone already loaded bitmap while we were
62915+ * busy initializing data. */
62916+ check_bnode_loaded(bnode);
62917+ }
62918+
62919+ if (wjnode != NULL) {
62920+ release(wjnode);
62921+ bnode->wjnode = NULL;
62922+ }
62923+ if (cjnode != NULL) {
62924+ release(cjnode);
62925+ bnode->cjnode = NULL;
62926+ }
62927+
62928+ return ret;
62929+}
62930+
62931+static void release_and_unlock_bnode(struct bitmap_node *bnode)
62932+{
62933+ check_bnode_loaded(bnode);
62934+ mutex_unlock(&bnode->mutex);
62935+}
62936+
62937+/* This function does all block allocation work but only for one bitmap
62938+ block.*/
62939+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62940+ block responsibility zone boundaries. This had no sense in v3.6 but may
62941+ have it in v4.x */
62942+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
62943+static int
62944+search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62945+ bmap_off_t max_offset, int min_len, int max_len)
62946+{
62947+ struct super_block *super = get_current_context()->super;
62948+ struct bitmap_node *bnode = get_bnode(super, bmap);
62949+
62950+ char *data;
62951+
62952+ bmap_off_t search_end;
62953+ bmap_off_t start;
62954+ bmap_off_t end;
62955+
62956+ int set_first_zero_bit = 0;
62957+
62958+ int ret;
62959+
62960+ assert("zam-364", min_len > 0);
62961+ assert("zam-365", max_len >= min_len);
62962+ assert("zam-366", *offset <= max_offset);
62963+
62964+ ret = load_and_lock_bnode(bnode);
62965+
62966+ if (ret)
62967+ return ret;
62968+
62969+ data = bnode_working_data(bnode);
62970+
62971+ start = *offset;
62972+
62973+ if (bnode->first_zero_bit >= start) {
62974+ start = bnode->first_zero_bit;
62975+ set_first_zero_bit = 1;
62976+ }
62977+
62978+ while (start + min_len < max_offset) {
62979+
62980+ start =
62981+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
62982+ if (set_first_zero_bit) {
62983+ bnode->first_zero_bit = start;
62984+ set_first_zero_bit = 0;
62985+ }
62986+ if (start >= max_offset)
62987+ break;
62988+
62989+ search_end = LIMIT(start + max_len, max_offset);
62990+ end =
62991+ reiser4_find_next_set_bit((long *)data, search_end, start);
62992+ if (end >= start + min_len) {
62993+ /* we can't trust find_next_set_bit result if set bit
62994+ was not fount, result may be bigger than
62995+ max_offset */
62996+ if (end > search_end)
62997+ end = search_end;
62998+
62999+ ret = end - start;
63000+ *offset = start;
63001+
63002+ reiser4_set_bits(data, start, end);
63003+
63004+ /* FIXME: we may advance first_zero_bit if [start,
63005+ end] region overlaps the first_zero_bit point */
63006+
63007+ break;
63008+ }
63009+
63010+ start = end + 1;
63011+ }
63012+
63013+ release_and_unlock_bnode(bnode);
63014+
63015+ return ret;
63016+}
63017+
63018+static int
63019+search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
63020+ bmap_off_t end_offset, int min_len, int max_len)
63021+{
63022+ struct super_block *super = get_current_context()->super;
63023+ struct bitmap_node *bnode = get_bnode(super, bmap);
63024+ char *data;
63025+ bmap_off_t start;
63026+ int ret;
63027+
63028+ assert("zam-958", min_len > 0);
63029+ assert("zam-959", max_len >= min_len);
63030+ assert("zam-960", *start_offset >= end_offset);
63031+
63032+ ret = load_and_lock_bnode(bnode);
63033+ if (ret)
63034+ return ret;
63035+
63036+ data = bnode_working_data(bnode);
63037+ start = *start_offset;
63038+
63039+ while (1) {
63040+ bmap_off_t end, search_end;
63041+
63042+ /* Find the beginning of the zero filled region */
63043+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
63044+ break;
63045+ /* Is there more than `min_len' bits from `start' to
63046+ * `end_offset'? */
63047+ if (start < end_offset + min_len - 1)
63048+ break;
63049+
63050+ /* Do not search to `end_offset' if we need to find less than
63051+ * `max_len' zero bits. */
63052+ if (end_offset + max_len - 1 < start)
63053+ search_end = start - max_len + 1;
63054+ else
63055+ search_end = end_offset;
63056+
63057+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
63058+ end = search_end;
63059+ else
63060+ end++;
63061+
63062+ if (end + min_len <= start + 1) {
63063+ if (end < search_end)
63064+ end = search_end;
63065+ ret = start - end + 1;
63066+ *start_offset = end; /* `end' is lowest offset */
63067+ assert("zam-987",
63068+ reiser4_find_next_set_bit(data, start + 1,
63069+ end) >= start + 1);
63070+ reiser4_set_bits(data, end, start + 1);
63071+ break;
63072+ }
63073+
63074+ if (end <= end_offset)
63075+ /* left search boundary reached. */
63076+ break;
63077+ start = end - 1;
63078+ }
63079+
63080+ release_and_unlock_bnode(bnode);
63081+ return ret;
63082+}
63083+
63084+/* allocate contiguous range of blocks in bitmap */
63085+static int bitmap_alloc_forward(reiser4_block_nr * start,
63086+ const reiser4_block_nr * end, int min_len,
63087+ int max_len)
63088+{
63089+ bmap_nr_t bmap, end_bmap;
63090+ bmap_off_t offset, end_offset;
63091+ int len;
63092+
63093+ reiser4_block_nr tmp;
63094+
63095+ struct super_block *super = get_current_context()->super;
63096+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
63097+
63098+ parse_blocknr(start, &bmap, &offset);
63099+
63100+ tmp = *end - 1;
63101+ parse_blocknr(&tmp, &end_bmap, &end_offset);
63102+ ++end_offset;
63103+
63104+ assert("zam-358", end_bmap >= bmap);
63105+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
63106+
63107+ for (; bmap < end_bmap; bmap++, offset = 0) {
63108+ len =
63109+ search_one_bitmap_forward(bmap, &offset, max_offset,
63110+ min_len, max_len);
63111+ if (len != 0)
63112+ goto out;
63113+ }
63114+
63115+ len =
63116+ search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
63117+ max_len);
63118+ out:
63119+ *start = bmap * max_offset + offset;
63120+ return len;
63121+}
63122+
63123+/* allocate contiguous range of blocks in bitmap (from @start to @end in
63124+ * backward direction) */
63125+static int bitmap_alloc_backward(reiser4_block_nr * start,
63126+ const reiser4_block_nr * end, int min_len,
63127+ int max_len)
63128+{
63129+ bmap_nr_t bmap, end_bmap;
63130+ bmap_off_t offset, end_offset;
63131+ int len;
63132+ struct super_block *super = get_current_context()->super;
63133+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
63134+
63135+ parse_blocknr(start, &bmap, &offset);
63136+ parse_blocknr(end, &end_bmap, &end_offset);
63137+
63138+ assert("zam-961", end_bmap <= bmap);
63139+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
63140+
63141+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
63142+ len =
63143+ search_one_bitmap_backward(bmap, &offset, 0, min_len,
63144+ max_len);
63145+ if (len != 0)
63146+ goto out;
63147+ }
63148+
63149+ len =
63150+ search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
63151+ max_len);
63152+ out:
63153+ *start = bmap * max_offset + offset;
63154+ return len;
63155+}
63156+
63157+/* plugin->u.space_allocator.alloc_blocks() */
63158+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
63159+ reiser4_block_nr *start, reiser4_block_nr *len)
63160+{
63161+ struct super_block *super = get_current_context()->super;
63162+ int actual_len;
63163+
63164+ reiser4_block_nr search_start;
63165+ reiser4_block_nr search_end;
63166+
63167+ assert("zam-398", super != NULL);
63168+ assert("zam-412", hint != NULL);
63169+ assert("zam-397", hint->blk <= reiser4_block_count(super));
63170+
63171+ if (hint->max_dist == 0)
63172+ search_end = reiser4_block_count(super);
63173+ else
63174+ search_end =
63175+ LIMIT(hint->blk + hint->max_dist,
63176+ reiser4_block_count(super));
63177+
63178+ /* We use @hint -> blk as a search start and search from it to the end
63179+ of the disk or in given region if @hint -> max_dist is not zero */
63180+ search_start = hint->blk;
63181+
63182+ actual_len =
63183+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
63184+
63185+ /* There is only one bitmap search if max_dist was specified or first
63186+ pass was from the beginning of the bitmap. We also do one pass for
63187+ scanning bitmap in backward direction. */
63188+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
63189+ /* next step is a scanning from 0 to search_start */
63190+ search_end = search_start;
63191+ search_start = 0;
63192+ actual_len =
63193+ bitmap_alloc_forward(&search_start, &search_end, 1, needed);
63194+ }
63195+ if (actual_len == 0)
63196+ return RETERR(-ENOSPC);
63197+ if (actual_len < 0)
63198+ return RETERR(actual_len);
63199+ *len = actual_len;
63200+ *start = search_start;
63201+ return 0;
63202+}
63203+
63204+static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
63205+ reiser4_block_nr * start,
63206+ reiser4_block_nr * len)
63207+{
63208+ reiser4_block_nr search_start;
63209+ reiser4_block_nr search_end;
63210+ int actual_len;
63211+
63212+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
63213+
63214+ assert("zam-969", super != NULL);
63215+ assert("zam-970", hint != NULL);
63216+ assert("zam-971", hint->blk <= reiser4_block_count(super));
63217+
63218+ search_start = hint->blk;
63219+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
63220+ search_end = 0;
63221+ else
63222+ search_end = search_start - hint->max_dist;
63223+
63224+ actual_len =
63225+ bitmap_alloc_backward(&search_start, &search_end, 1, needed);
63226+ if (actual_len == 0)
63227+ return RETERR(-ENOSPC);
63228+ if (actual_len < 0)
63229+ return RETERR(actual_len);
63230+ *len = actual_len;
63231+ *start = search_start;
63232+ return 0;
63233+}
63234+
63235+/* plugin->u.space_allocator.alloc_blocks() */
63236+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
63237+ reiser4_blocknr_hint * hint, int needed,
63238+ reiser4_block_nr * start, reiser4_block_nr * len)
63239+{
63240+ if (hint->backward)
63241+ return alloc_blocks_backward(hint, needed, start, len);
63242+ return alloc_blocks_forward(hint, needed, start, len);
63243+}
63244+
63245+/* plugin->u.space_allocator.dealloc_blocks(). */
63246+/* It just frees blocks in WORKING BITMAP. Usually formatted an unformatted
63247+ nodes deletion is deferred until transaction commit. However, deallocation
63248+ of temporary objects like wandered blocks and transaction commit records
63249+ requires immediate node deletion from WORKING BITMAP.*/
63250+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
63251+ reiser4_block_nr start, reiser4_block_nr len)
63252+{
63253+ struct super_block *super = reiser4_get_current_sb();
63254+
63255+ bmap_nr_t bmap;
63256+ bmap_off_t offset;
63257+
63258+ struct bitmap_node *bnode;
63259+ int ret;
63260+
63261+ assert("zam-468", len != 0);
63262+ check_block_range(&start, &len);
63263+
63264+ parse_blocknr(&start, &bmap, &offset);
63265+
63266+ assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
63267+
63268+ bnode = get_bnode(super, bmap);
63269+
63270+ assert("zam-470", bnode != NULL);
63271+
63272+ ret = load_and_lock_bnode(bnode);
63273+ assert("zam-481", ret == 0);
63274+
63275+ reiser4_clear_bits(bnode_working_data(bnode), offset,
63276+ (bmap_off_t) (offset + len));
63277+
63278+ adjust_first_zero_bit(bnode, offset);
63279+
63280+ release_and_unlock_bnode(bnode);
63281+}
63282+
63283+/* plugin->u.space_allocator.check_blocks(). */
63284+void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
63285+ const reiser4_block_nr * len, int desired)
63286+{
63287+#if REISER4_DEBUG
63288+ struct super_block *super = reiser4_get_current_sb();
63289+
63290+ bmap_nr_t bmap;
63291+ bmap_off_t start_offset;
63292+ bmap_off_t end_offset;
63293+
63294+ struct bitmap_node *bnode;
63295+ int ret;
63296+
63297+ assert("zam-622", len != NULL);
63298+ check_block_range(start, len);
63299+ parse_blocknr(start, &bmap, &start_offset);
63300+
63301+ end_offset = start_offset + *len;
63302+ assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
63303+
63304+ bnode = get_bnode(super, bmap);
63305+
63306+ assert("nikita-2215", bnode != NULL);
63307+
63308+ ret = load_and_lock_bnode(bnode);
63309+ assert("zam-626", ret == 0);
63310+
63311+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
63312+
63313+ if (desired) {
63314+ assert("zam-623",
63315+ reiser4_find_next_zero_bit(bnode_working_data(bnode),
63316+ end_offset, start_offset)
63317+ >= end_offset);
63318+ } else {
63319+ assert("zam-624",
63320+ reiser4_find_next_set_bit(bnode_working_data(bnode),
63321+ end_offset, start_offset)
63322+ >= end_offset);
63323+ }
63324+
63325+ release_and_unlock_bnode(bnode);
63326+#endif
63327+}
63328+
63329+/* conditional insertion of @node into atom's overwrite set if it was not there */
63330+static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
63331+{
63332+ assert("zam-546", atom != NULL);
63333+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
63334+ assert("zam-548", node != NULL);
63335+
63336+ spin_lock_atom(atom);
63337+ spin_lock_jnode(node);
63338+
63339+ if (node->atom == NULL) {
63340+ JF_SET(node, JNODE_OVRWR);
63341+ insert_into_atom_ovrwr_list(atom, node);
63342+ } else {
63343+ assert("zam-549", node->atom == atom);
63344+ }
63345+
63346+ spin_unlock_jnode(node);
63347+ spin_unlock_atom(atom);
63348+}
63349+
63350+/* an actor which applies delete set to COMMIT bitmap pages and link modified
63351+ pages in a single-linked list */
63352+static int
63353+apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
63354+ const reiser4_block_nr * len, void *data)
63355+{
63356+
63357+ bmap_nr_t bmap;
63358+ bmap_off_t offset;
63359+ int ret;
63360+
63361+ long long *blocks_freed_p = data;
63362+
63363+ struct bitmap_node *bnode;
63364+
63365+ struct super_block *sb = reiser4_get_current_sb();
63366+
63367+ check_block_range(start, len);
63368+
63369+ parse_blocknr(start, &bmap, &offset);
63370+
63371+ /* FIXME-ZAM: we assume that all block ranges are allocated by this
63372+ bitmap-based allocator and each block range can't go over a zone of
63373+ responsibility of one bitmap block; same assumption is used in
63374+ other journal hooks in bitmap code. */
63375+ bnode = get_bnode(sb, bmap);
63376+ assert("zam-448", bnode != NULL);
63377+
63378+ /* it is safe to unlock atom with is in ASTAGE_PRE_COMMIT */
63379+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
63380+ ret = load_and_lock_bnode(bnode);
63381+ if (ret)
63382+ return ret;
63383+
63384+ /* put bnode into atom's overwrite set */
63385+ cond_add_to_overwrite_set(atom, bnode->cjnode);
63386+
63387+ data = bnode_commit_data(bnode);
63388+
63389+ ret = bnode_check_crc(bnode);
63390+ if (ret != 0)
63391+ return ret;
63392+
63393+ if (len != NULL) {
63394+ /* FIXME-ZAM: a check that all bits are set should be there */
63395+ assert("zam-443",
63396+ offset + *len <= bmap_bit_count(sb->s_blocksize));
63397+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
63398+
63399+ (*blocks_freed_p) += *len;
63400+ } else {
63401+ reiser4_clear_bit(offset, data);
63402+ (*blocks_freed_p)++;
63403+ }
63404+
63405+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
63406+
63407+ release_and_unlock_bnode(bnode);
63408+
63409+ return 0;
63410+}
63411+
63412+/* plugin->u.space_allocator.pre_commit_hook(). */
63413+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
63414+ rest is done by transaction manager (allocate wandered locations for COMMIT
63415+ BITMAP blocks, copy COMMIT BITMAP blocks data). */
63416+/* Only one instance of this function can be running at one given time, because
63417+ only one transaction can be committed a time, therefore it is safe to access
63418+ some global variables without any locking */
63419+
63420+int reiser4_pre_commit_hook_bitmap(void)
63421+{
63422+ struct super_block *super = reiser4_get_current_sb();
63423+ txn_atom *atom;
63424+
63425+ long long blocks_freed = 0;
63426+
63427+ atom = get_current_atom_locked();
63428+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
63429+ spin_unlock_atom(atom);
63430+
63431+ { /* scan atom's captured list and find all freshly allocated nodes,
63432+ * mark corresponded bits in COMMIT BITMAP as used */
63433+ struct list_head *head = ATOM_CLEAN_LIST(atom);
63434+ jnode *node = list_entry(head->next, jnode, capture_link);
63435+
63436+ while (head != &node->capture_link) {
63437+ /* we detect freshly allocated jnodes */
63438+ if (JF_ISSET(node, JNODE_RELOC)) {
63439+ int ret;
63440+ bmap_nr_t bmap;
63441+
63442+ bmap_off_t offset;
63443+ bmap_off_t index;
63444+ struct bitmap_node *bn;
63445+ __u32 size = bmap_size(super->s_blocksize);
63446+ __u32 crc;
63447+ char byte;
63448+
63449+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
63450+ assert("zam-460",
63451+ !reiser4_blocknr_is_fake(&node->blocknr));
63452+
63453+ parse_blocknr(&node->blocknr, &bmap, &offset);
63454+ bn = get_bnode(super, bmap);
63455+
63456+ index = offset >> 3;
63457+ assert("vpf-276", index < size);
63458+
63459+ ret = bnode_check_crc(bnode);
63460+ if (ret != 0)
63461+ return ret;
63462+
63463+ check_bnode_loaded(bn);
63464+ load_and_lock_bnode(bn);
63465+
63466+ byte = *(bnode_commit_data(bn) + index);
63467+ reiser4_set_bit(offset, bnode_commit_data(bn));
63468+
63469+ crc = adler32_recalc(bnode_commit_crc(bn), byte,
63470+ *(bnode_commit_data(bn) +
63471+ index),
63472+ size - index),
63473+ bnode_set_commit_crc(bn, crc);
63474+
63475+ release_and_unlock_bnode(bn);
63476+
63477+ ret = bnode_check_crc(bn);
63478+ if (ret != 0)
63479+ return ret;
63480+
63481+ /* working of this depends on how it inserts
63482+ new j-node into clean list, because we are
63483+ scanning the same list now. It is OK, if
63484+ insertion is done to the list front */
63485+ cond_add_to_overwrite_set(atom, bn->cjnode);
63486+ }
63487+
63488+ node = list_entry(node->capture_link.next, jnode, capture_link);
63489+ }
63490+ }
63491+
63492+ blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
63493+ &blocks_freed, 0);
63494+
63495+ blocks_freed -= atom->nr_blocks_allocated;
63496+
63497+ {
63498+ reiser4_super_info_data *sbinfo;
63499+
63500+ sbinfo = get_super_private(super);
63501+
63502+ spin_lock_reiser4_super(sbinfo);
63503+ sbinfo->blocks_free_committed += blocks_freed;
63504+ spin_unlock_reiser4_super(sbinfo);
63505+ }
63506+
63507+ return 0;
63508+}
63509+
63510+/* plugin->u.space_allocator.init_allocator
63511+ constructor of reiser4_space_allocator object. It is called on fs mount */
63512+int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
63513+ struct super_block *super, void *arg)
63514+{
63515+ struct bitmap_allocator_data *data = NULL;
63516+ bmap_nr_t bitmap_blocks_nr;
63517+ bmap_nr_t i;
63518+
63519+ assert("nikita-3039", reiser4_schedulable());
63520+
63521+ /* getting memory for bitmap allocator private data holder */
63522+ data =
63523+ kmalloc(sizeof(struct bitmap_allocator_data),
63524+ reiser4_ctx_gfp_mask_get());
63525+
63526+ if (data == NULL)
63527+ return RETERR(-ENOMEM);
63528+
63529+ /* allocation and initialization for the array of bnodes */
63530+ bitmap_blocks_nr = get_nr_bmap(super);
63531+
63532+ /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
63533+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
63534+ may I never meet someone who still uses the ia32 architecture when
63535+ storage devices of that size enter the market, and wants to use ia32
63536+ with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
63537+ probably, another dynamic data structure should replace a static
63538+ array of bnodes. */
63539+ /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
63540+ data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
63541+ if (data->bitmap == NULL) {
63542+ kfree(data);
63543+ return RETERR(-ENOMEM);
63544+ }
63545+
63546+ for (i = 0; i < bitmap_blocks_nr; i++)
63547+ init_bnode(data->bitmap + i, super, i);
63548+
63549+ allocator->u.generic = data;
63550+
63551+#if REISER4_DEBUG
63552+ get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
63553+#endif
63554+
63555+ /* Load all bitmap blocks at mount time. */
63556+ if (!test_bit
63557+ (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
63558+ __u64 start_time, elapsed_time;
63559+ struct bitmap_node *bnode;
63560+ int ret;
63561+
63562+ if (REISER4_DEBUG)
63563+ printk(KERN_INFO "loading reiser4 bitmap...");
63564+ start_time = jiffies;
63565+
63566+ for (i = 0; i < bitmap_blocks_nr; i++) {
63567+ bnode = data->bitmap + i;
63568+ ret = load_and_lock_bnode(bnode);
63569+ if (ret) {
63570+ reiser4_destroy_allocator_bitmap(allocator,
63571+ super);
63572+ return ret;
63573+ }
63574+ release_and_unlock_bnode(bnode);
63575+ }
63576+
63577+ elapsed_time = jiffies - start_time;
63578+ if (REISER4_DEBUG)
63579+ printk("...done (%llu jiffies)\n",
63580+ (unsigned long long)elapsed_time);
63581+ }
63582+
63583+ return 0;
63584+}
63585+
63586+/* plugin->u.space_allocator.destroy_allocator
63587+ destructor. It is called on fs unmount */
63588+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
63589+ struct super_block *super)
63590+{
63591+ bmap_nr_t bitmap_blocks_nr;
63592+ bmap_nr_t i;
63593+
63594+ struct bitmap_allocator_data *data = allocator->u.generic;
63595+
63596+ assert("zam-414", data != NULL);
63597+ assert("zam-376", data->bitmap != NULL);
63598+
63599+ bitmap_blocks_nr = get_nr_bmap(super);
63600+
63601+ for (i = 0; i < bitmap_blocks_nr; i++) {
63602+ struct bitmap_node *bnode = data->bitmap + i;
63603+
63604+ mutex_lock(&bnode->mutex);
63605+
63606+#if REISER4_DEBUG
63607+ if (atomic_read(&bnode->loaded)) {
63608+ jnode *wj = bnode->wjnode;
63609+ jnode *cj = bnode->cjnode;
63610+
63611+ assert("zam-480", jnode_page(cj) != NULL);
63612+ assert("zam-633", jnode_page(wj) != NULL);
63613+
63614+ assert("zam-634",
63615+ memcmp(jdata(wj), jdata(wj),
63616+ bmap_size(super->s_blocksize)) == 0);
63617+
63618+ }
63619+#endif
63620+ done_bnode(bnode);
63621+ mutex_unlock(&bnode->mutex);
63622+ }
63623+
63624+ vfree(data->bitmap);
63625+ kfree(data);
63626+
63627+ allocator->u.generic = NULL;
63628+
63629+ return 0;
63630+}
63631+
63632+/*
63633+ * Local variables:
63634+ * c-indentation-style: "K&R"
63635+ * mode-name: "LC"
63636+ * c-basic-offset: 8
63637+ * tab-width: 8
63638+ * fill-column: 79
63639+ * scroll-step: 1
63640+ * End:
63641+ */
63642diff --git a/fs/reiser4/plugin/space/bitmap.h b/fs/reiser4/plugin/space/bitmap.h
63643new file mode 100644
63644index 0000000..be867f1
63645--- /dev/null
63646+++ b/fs/reiser4/plugin/space/bitmap.h
63647@@ -0,0 +1,47 @@
63648+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63649+
63650+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
63651+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
63652+
63653+#include "../../dformat.h"
63654+#include "../../block_alloc.h"
63655+
63656+#include <linux/types.h> /* for __u?? */
63657+#include <linux/fs.h> /* for struct super_block */
63658+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
63659+/* declarations of functions implementing methods of space allocator plugin for
63660+ bitmap based allocator. The functions themselves are in bitmap.c */
63661+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
63662+ struct super_block *, void *);
63663+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
63664+ struct super_block *);
63665+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
63666+ reiser4_blocknr_hint *, int needed,
63667+ reiser4_block_nr * start,
63668+ reiser4_block_nr * len);
63669+extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
63670+ const reiser4_block_nr *, int);
63671+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
63672+ reiser4_block_nr,
63673+ reiser4_block_nr);
63674+extern int reiser4_pre_commit_hook_bitmap(void);
63675+
63676+#define reiser4_post_commit_hook_bitmap() do{}while(0)
63677+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
63678+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
63679+
63680+typedef __u64 bmap_nr_t;
63681+typedef __u32 bmap_off_t;
63682+
63683+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
63684+
63685+/* Make Linus happy.
63686+ Local variables:
63687+ c-indentation-style: "K&R"
63688+ mode-name: "LC"
63689+ c-basic-offset: 8
63690+ tab-width: 8
63691+ fill-column: 120
63692+ scroll-step: 1
63693+ End:
63694+*/
63695diff --git a/fs/reiser4/plugin/space/space_allocator.h b/fs/reiser4/plugin/space/space_allocator.h
63696new file mode 100644
63697index 0000000..5bfa9a3
63698--- /dev/null
63699+++ b/fs/reiser4/plugin/space/space_allocator.h
63700@@ -0,0 +1,80 @@
63701+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63702+
63703+#ifndef __SPACE_ALLOCATOR_H__
63704+#define __SPACE_ALLOCATOR_H__
63705+
63706+#include "../../forward.h"
63707+#include "bitmap.h"
63708+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
63709+ * but... */
63710+#define DEF_SPACE_ALLOCATOR(allocator) \
63711+ \
63712+static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
63713+{ \
63714+ return reiser4_init_allocator_##allocator (al, s, opaque); \
63715+} \
63716+ \
63717+static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
63718+{ \
63719+ reiser4_destroy_allocator_##allocator (al, s); \
63720+} \
63721+ \
63722+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
63723+ int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
63724+{ \
63725+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
63726+} \
63727+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
63728+{ \
63729+ reiser4_dealloc_blocks_##allocator (al, start, len); \
63730+} \
63731+ \
63732+static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
63733+{ \
63734+ reiser4_check_blocks_##allocator (start, end, desired); \
63735+} \
63736+ \
63737+static inline void sa_pre_commit_hook (void) \
63738+{ \
63739+ reiser4_pre_commit_hook_##allocator (); \
63740+} \
63741+ \
63742+static inline void sa_post_commit_hook (void) \
63743+{ \
63744+ reiser4_post_commit_hook_##allocator (); \
63745+} \
63746+ \
63747+static inline void sa_post_write_back_hook (void) \
63748+{ \
63749+ reiser4_post_write_back_hook_##allocator(); \
63750+} \
63751+ \
63752+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
63753+{ \
63754+ reiser4_print_info_##allocator (prefix, al); \
63755+}
63756+
63757+DEF_SPACE_ALLOCATOR(bitmap)
63758+
63759+/* this object is part of reiser4 private in-core super block */
63760+struct reiser4_space_allocator {
63761+ union {
63762+ /* space allocators might use this pointer to reference their
63763+ * data. */
63764+ void *generic;
63765+ } u;
63766+};
63767+
63768+/* __SPACE_ALLOCATOR_H__ */
63769+#endif
63770+
63771+/* Make Linus happy.
63772+ Local variables:
63773+ c-indentation-style: "K&R"
63774+ mode-name: "LC"
63775+ c-basic-offset: 8
63776+ tab-width: 8
63777+ fill-column: 120
63778+ scroll-step: 1
63779+ End:
63780+*/
63781diff --git a/fs/reiser4/plugin/tail_policy.c b/fs/reiser4/plugin/tail_policy.c
63782new file mode 100644
63783index 0000000..43f4ae7
63784--- /dev/null
63785+++ b/fs/reiser4/plugin/tail_policy.c
63786@@ -0,0 +1,113 @@
63787+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63788+ * reiser4/README */
63789+
63790+/* Formatting policy plugins */
63791+
63792+/*
63793+ * Formatting policy plugin is used by object plugin (of regular file) to
63794+ * convert file between two representations.
63795+ *
63796+ * Currently following policies are implemented:
63797+ * never store file in formatted nodes
63798+ * always store file in formatted nodes
63799+ * store file in formatted nodes if file is smaller than 4 blocks (default)
63800+ */
63801+
63802+#include "../tree.h"
63803+#include "../inode.h"
63804+#include "../super.h"
63805+#include "object.h"
63806+#include "plugin.h"
63807+#include "node/node.h"
63808+#include "plugin_header.h"
63809+
63810+#include <linux/pagemap.h>
63811+#include <linux/fs.h> /* For struct inode */
63812+
63813+/**
63814+ * have_formatting_never -
63815+ * @inode:
63816+ * @size:
63817+ *
63818+ *
63819+ */
63820+/* Never store file's tail as direct item */
63821+/* Audited by: green(2002.06.12) */
63822+static int have_formatting_never(const struct inode *inode UNUSED_ARG
63823+ /* inode to operate on */ ,
63824+ loff_t size UNUSED_ARG /* new object size */ )
63825+{
63826+ return 0;
63827+}
63828+
63829+/* Always store file's tail as direct item */
63830+/* Audited by: green(2002.06.12) */
63831+static int
63832+have_formatting_always(const struct inode *inode UNUSED_ARG
63833+ /* inode to operate on */ ,
63834+ loff_t size UNUSED_ARG /* new object size */ )
63835+{
63836+ return 1;
63837+}
63838+
63839+/* This function makes test if we should store file denoted @inode as tails only or
63840+ as extents only. */
63841+static int
63842+have_formatting_default(const struct inode *inode UNUSED_ARG
63843+ /* inode to operate on */ ,
63844+ loff_t size /* new object size */ )
63845+{
63846+ assert("umka-1253", inode != NULL);
63847+
63848+ if (size > inode->i_sb->s_blocksize * 4)
63849+ return 0;
63850+
63851+ return 1;
63852+}
63853+
63854+/* tail plugins */
63855+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
63856+ [NEVER_TAILS_FORMATTING_ID] = {
63857+ .h = {
63858+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63859+ .id = NEVER_TAILS_FORMATTING_ID,
63860+ .pops = NULL,
63861+ .label = "never",
63862+ .desc = "Never store file's tail",
63863+ .linkage = {NULL, NULL}
63864+ },
63865+ .have_tail = have_formatting_never
63866+ },
63867+ [ALWAYS_TAILS_FORMATTING_ID] = {
63868+ .h = {
63869+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63870+ .id = ALWAYS_TAILS_FORMATTING_ID,
63871+ .pops = NULL,
63872+ .label = "always",
63873+ .desc = "Always store file's tail",
63874+ .linkage = {NULL, NULL}
63875+ },
63876+ .have_tail = have_formatting_always
63877+ },
63878+ [SMALL_FILE_FORMATTING_ID] = {
63879+ .h = {
63880+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63881+ .id = SMALL_FILE_FORMATTING_ID,
63882+ .pops = NULL,
63883+ .label = "4blocks",
63884+ .desc = "store files shorter than 4 blocks in tail items",
63885+ .linkage = {NULL, NULL}
63886+ },
63887+ .have_tail = have_formatting_default
63888+ }
63889+};
63890+
63891+/*
63892+ * Local variables:
63893+ * c-indentation-style: "K&R"
63894+ * mode-name: "LC"
63895+ * c-basic-offset: 8
63896+ * tab-width: 8
63897+ * fill-column: 79
63898+ * End:
63899+ */
63900diff --git a/fs/reiser4/pool.c b/fs/reiser4/pool.c
63901new file mode 100644
63902index 0000000..f4303da
63903--- /dev/null
63904+++ b/fs/reiser4/pool.c
63905@@ -0,0 +1,234 @@
63906+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63907+ * reiser4/README */
63908+
63909+/* Fast pool allocation.
63910+
63911+ There are situations when some sub-system normally asks memory allocator
63912+ for only few objects, but under some circumstances could require much
63913+ more. Typical and actually motivating example is tree balancing. It needs
63914+ to keep track of nodes that were involved into it, and it is well-known
63915+ that in reasonable packed balanced tree most (92.938121%) percent of all
63916+ balancings end up after working with only few nodes (3.141592 on
63917+ average). But in rare cases balancing can involve much more nodes
63918+ (3*tree_height+1 in extremal situation).
63919+
63920+ On the one hand, we don't want to resort to dynamic allocation (slab,
63921+ malloc(), etc.) to allocate data structures required to keep track of
63922+ nodes during balancing. On the other hand, we cannot statically allocate
63923+ required amount of space on the stack, because first: it is useless wastage
63924+ of precious resource, and second: this amount is unknown in advance (tree
63925+ height can change).
63926+
63927+ Pools, implemented in this file are solution for this problem:
63928+
63929+ - some configurable amount of objects is statically preallocated on the
63930+ stack
63931+
63932+ - if this preallocated pool is exhausted and more objects is requested
63933+ they are allocated dynamically.
63934+
63935+ Pools encapsulate distinction between statically and dynamically allocated
63936+ objects. Both allocation and recycling look exactly the same.
63937+
63938+ To keep track of dynamically allocated objects, pool adds its own linkage
63939+ to each object.
63940+
63941+ NOTE-NIKITA This linkage also contains some balancing-specific data. This
63942+ is not perfect. On the other hand, balancing is currently the only client
63943+ of pool code.
63944+
63945+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63946+ functions in the style of tslist/tshash, i.e., make them unreadable, but
63947+ type-safe.
63948+
63949+*/
63950+
63951+#include "debug.h"
63952+#include "pool.h"
63953+#include "super.h"
63954+
63955+#include <linux/types.h>
63956+#include <linux/err.h>
63957+
63958+/* initialize new pool object */
63959+static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to
63960+ * initialize */ )
63961+{
63962+ INIT_LIST_HEAD(&h->usage_linkage);
63963+ INIT_LIST_HEAD(&h->level_linkage);
63964+ INIT_LIST_HEAD(&h->extra_linkage);
63965+}
63966+
63967+/* initialize new pool */
63968+void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ ,
63969+ size_t obj_size /* size of objects in @pool */ ,
63970+ int num_of_objs /* number of preallocated objects */ ,
63971+ char *data /* area for preallocated objects */ )
63972+{
63973+ reiser4_pool_header *h;
63974+ int i;
63975+
63976+ assert("nikita-955", pool != NULL);
63977+ assert("nikita-1044", obj_size > 0);
63978+ assert("nikita-956", num_of_objs >= 0);
63979+ assert("nikita-957", data != NULL);
63980+
63981+ memset(pool, 0, sizeof *pool);
63982+ pool->obj_size = obj_size;
63983+ pool->data = data;
63984+ INIT_LIST_HEAD(&pool->free);
63985+ INIT_LIST_HEAD(&pool->used);
63986+ INIT_LIST_HEAD(&pool->extra);
63987+ memset(data, 0, obj_size * num_of_objs);
63988+ for (i = 0; i < num_of_objs; ++i) {
63989+ h = (reiser4_pool_header *) (data + i * obj_size);
63990+ reiser4_init_pool_obj(h);
63991+ /* add pool header to the end of pool's free list */
63992+ list_add_tail(&h->usage_linkage, &pool->free);
63993+ }
63994+}
63995+
63996+/* release pool resources
63997+
63998+ Release all resources acquired by this pool, specifically, dynamically
63999+ allocated objects.
64000+
64001+*/
64002+void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
64003+{
64004+}
64005+
64006+/* allocate carry object from pool
64007+
64008+ First, try to get preallocated object. If this fails, resort to dynamic
64009+ allocation.
64010+
64011+*/
64012+static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object
64013+ * from */ )
64014+{
64015+ reiser4_pool_header *result;
64016+
64017+ assert("nikita-959", pool != NULL);
64018+
64019+ if (!list_empty(&pool->free)) {
64020+ struct list_head *linkage;
64021+
64022+ linkage = pool->free.next;
64023+ list_del(linkage);
64024+ INIT_LIST_HEAD(linkage);
64025+ result = list_entry(linkage, reiser4_pool_header, usage_linkage);
64026+ BUG_ON(!list_empty(&result->level_linkage) ||
64027+ !list_empty(&result->extra_linkage));
64028+ } else {
64029+ /* pool is empty. Extra allocations don't deserve dedicated
64030+ slab to be served from, as they are expected to be rare. */
64031+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
64032+ if (result != 0) {
64033+ reiser4_init_pool_obj(result);
64034+ list_add(&result->extra_linkage, &pool->extra);
64035+ } else
64036+ return ERR_PTR(RETERR(-ENOMEM));
64037+ BUG_ON(!list_empty(&result->usage_linkage) ||
64038+ !list_empty(&result->level_linkage));
64039+ }
64040+ ++pool->objs;
64041+ list_add(&result->usage_linkage, &pool->used);
64042+ memset(result + 1, 0, pool->obj_size - sizeof *result);
64043+ return result;
64044+}
64045+
64046+/* return object back to the pool */
64047+void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h /* pool to return object back
64048+ * into */ )
64049+{
64050+ assert("nikita-961", h != NULL);
64051+ assert("nikita-962", pool != NULL);
64052+
64053+ --pool->objs;
64054+ assert("nikita-963", pool->objs >= 0);
64055+
64056+ list_del_init(&h->usage_linkage);
64057+ list_del_init(&h->level_linkage);
64058+
64059+ if (list_empty(&h->extra_linkage))
64060+ /*
64061+ * pool header is not an extra one. Push it onto free list
64062+ * using usage_linkage
64063+ */
64064+ list_add(&h->usage_linkage, &pool->free);
64065+ else {
64066+ /* remove pool header from pool's extra list and kfree it */
64067+ list_del(&h->extra_linkage);
64068+ kfree(h);
64069+ }
64070+}
64071+
64072+/* add new object to the carry level list
64073+
64074+ Carry level is FIFO most of the time, but not always. Complications arise
64075+ when make_space() function tries to go to the left neighbor and thus adds
64076+ carry node before existing nodes, and also, when updating delimiting keys
64077+ after moving data between two nodes, we want left node to be locked before
64078+ right node.
64079+
64080+ Latter case is confusing at the first glance. Problem is that COP_UPDATE
64081+ opration that updates delimiting keys is sometimes called with two nodes
64082+ (when data are moved between two nodes) and sometimes with only one node
64083+ (when leftmost item is deleted in a node). In any case operation is
64084+ supplied with at least node whose left delimiting key is to be updated
64085+ (that is "right" node).
64086+
64087+*/
64088+reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool /* pool from which to
64089+ * allocate new object
64090+ */,
64091+ struct list_head *list /* list where to add
64092+ * object */,
64093+ pool_ordering order /* where to add */,
64094+ reiser4_pool_header * reference
64095+ /* after (or before) which existing object
64096+ to add */)
64097+{
64098+ reiser4_pool_header *result;
64099+
64100+ assert("nikita-972", pool != NULL);
64101+
64102+ result = reiser4_pool_alloc(pool);
64103+ if (IS_ERR(result))
64104+ return result;
64105+
64106+ assert("nikita-973", result != NULL);
64107+
64108+ switch (order) {
64109+ case POOLO_BEFORE:
64110+ __list_add(&result->level_linkage,
64111+ reference->level_linkage.prev,
64112+ &reference->level_linkage);
64113+ break;
64114+ case POOLO_AFTER:
64115+ __list_add(&result->level_linkage,
64116+ &reference->level_linkage,
64117+ reference->level_linkage.next);
64118+ break;
64119+ case POOLO_LAST:
64120+ list_add_tail(&result->level_linkage, list);
64121+ break;
64122+ case POOLO_FIRST:
64123+ list_add(&result->level_linkage, list);
64124+ break;
64125+ default:
64126+ wrong_return_value("nikita-927", "order");
64127+ }
64128+ return result;
64129+}
64130+
64131+/* Make Linus happy.
64132+ Local variables:
64133+ c-indentation-style: "K&R"
64134+ mode-name: "LC"
64135+ c-basic-offset: 8
64136+ tab-width: 8
64137+ fill-column: 120
64138+ End:
64139+*/
64140diff --git a/fs/reiser4/pool.h b/fs/reiser4/pool.h
64141new file mode 100644
64142index 0000000..174d3c6
64143--- /dev/null
64144+++ b/fs/reiser4/pool.h
64145@@ -0,0 +1,55 @@
64146+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64147+
64148+/* Fast pool allocation */
64149+
64150+#ifndef __REISER4_POOL_H__
64151+#define __REISER4_POOL_H__
64152+
64153+#include <linux/types.h>
64154+
64155+typedef struct reiser4_pool {
64156+ size_t obj_size;
64157+ int objs;
64158+ char *data;
64159+ struct list_head free;
64160+ struct list_head used;
64161+ struct list_head extra;
64162+} reiser4_pool;
64163+
64164+typedef struct reiser4_pool_header {
64165+ /* object is either on free or "used" lists */
64166+ struct list_head usage_linkage;
64167+ struct list_head level_linkage;
64168+ struct list_head extra_linkage;
64169+} reiser4_pool_header;
64170+
64171+typedef enum {
64172+ POOLO_BEFORE,
64173+ POOLO_AFTER,
64174+ POOLO_LAST,
64175+ POOLO_FIRST
64176+} pool_ordering;
64177+
64178+/* pool manipulation functions */
64179+
64180+extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size,
64181+ int num_of_objs, char *data);
64182+extern void reiser4_done_pool(reiser4_pool * pool);
64183+extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
64184+reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool,
64185+ struct list_head * list,
64186+ pool_ordering order,
64187+ reiser4_pool_header * reference);
64188+
64189+/* __REISER4_POOL_H__ */
64190+#endif
64191+
64192+/* Make Linus happy.
64193+ Local variables:
64194+ c-indentation-style: "K&R"
64195+ mode-name: "LC"
64196+ c-basic-offset: 8
64197+ tab-width: 8
64198+ fill-column: 120
64199+ End:
64200+*/
64201diff --git a/fs/reiser4/readahead.c b/fs/reiser4/readahead.c
64202new file mode 100644
64203index 0000000..8e5a9f1
64204--- /dev/null
64205+++ b/fs/reiser4/readahead.c
64206@@ -0,0 +1,138 @@
64207+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64208+ * reiser4/README */
64209+
64210+#include "forward.h"
64211+#include "tree.h"
64212+#include "tree_walk.h"
64213+#include "super.h"
64214+#include "inode.h"
64215+#include "key.h"
64216+#include "znode.h"
64217+
64218+#include <linux/swap.h> /* for totalram_pages */
64219+
64220+void reiser4_init_ra_info(ra_info_t * rai)
64221+{
64222+ rai->key_to_stop = *reiser4_min_key();
64223+}
64224+
64225+/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
64226+static inline int ra_adjacent_only(int flags)
64227+{
64228+ return flags & RA_ADJACENT_ONLY;
64229+}
64230+
64231+/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns 1
64232+ if right neighbor's first key is less or equal to readahead's stop key */
64233+static int should_readahead_neighbor(znode * node, ra_info_t * info)
64234+{
64235+ int result;
64236+
64237+ read_lock_dk(znode_get_tree(node));
64238+ result = keyle(znode_get_rd_key(node), &info->key_to_stop);
64239+ read_unlock_dk(znode_get_tree(node));
64240+ return result;
64241+}
64242+
64243+#define LOW_MEM_PERCENTAGE (5)
64244+
64245+static int low_on_memory(void)
64246+{
64247+ unsigned int freepages;
64248+
64249+ freepages = nr_free_pages();
64250+ return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
64251+}
64252+
64253+/* start read for @node and for a few of its right neighbors */
64254+void formatted_readahead(znode * node, ra_info_t * info)
64255+{
64256+ ra_params_t *ra_params;
64257+ znode *cur;
64258+ int i;
64259+ int grn_flags;
64260+ lock_handle next_lh;
64261+
64262+ /* do nothing if node block number has not been assigned to node (which means it is still in cache). */
64263+ if (reiser4_blocknr_is_fake(znode_get_block(node)))
64264+ return;
64265+
64266+ ra_params = get_current_super_ra_params();
64267+
64268+ if (znode_page(node) == NULL)
64269+ jstartio(ZJNODE(node));
64270+
64271+ if (znode_get_level(node) != LEAF_LEVEL)
64272+ return;
64273+
64274+ /* don't waste memory for read-ahead when low on memory */
64275+ if (low_on_memory())
64276+ return;
64277+
64278+ /* We can have locked nodes on upper tree levels, in this situation lock
64279+ priorities do not help to resolve deadlocks, we have to use TRY_LOCK
64280+ here. */
64281+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
64282+
64283+ i = 0;
64284+ cur = zref(node);
64285+ init_lh(&next_lh);
64286+ while (i < ra_params->max) {
64287+ const reiser4_block_nr *nextblk;
64288+
64289+ if (!should_readahead_neighbor(cur, info))
64290+ break;
64291+
64292+ if (reiser4_get_right_neighbor
64293+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
64294+ break;
64295+
64296+ nextblk = znode_get_block(next_lh.node);
64297+ if (reiser4_blocknr_is_fake(nextblk) ||
64298+ (ra_adjacent_only(ra_params->flags)
64299+ && *nextblk != *znode_get_block(cur) + 1)) {
64300+ break;
64301+ }
64302+
64303+ zput(cur);
64304+ cur = zref(next_lh.node);
64305+ done_lh(&next_lh);
64306+ if (znode_page(cur) == NULL)
64307+ jstartio(ZJNODE(cur));
64308+ else
64309+ /* Do not scan read-ahead window if pages already
64310+ * allocated (and i/o already started). */
64311+ break;
64312+
64313+ i++;
64314+ }
64315+ zput(cur);
64316+ done_lh(&next_lh);
64317+}
64318+
64319+void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
64320+{
64321+ reiser4_key *stop_key;
64322+
64323+ assert("nikita-3542", dir != NULL);
64324+ assert("nikita-3543", tap != NULL);
64325+
64326+ stop_key = &tap->ra_info.key_to_stop;
64327+ /* initialize readdir readahead information: include into readahead
64328+ * stat data of all files of the directory */
64329+ set_key_locality(stop_key, get_inode_oid(dir));
64330+ set_key_type(stop_key, KEY_SD_MINOR);
64331+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
64332+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
64333+ set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
64334+}
64335+
64336+/*
64337+ Local variables:
64338+ c-indentation-style: "K&R"
64339+ mode-name: "LC"
64340+ c-basic-offset: 8
64341+ tab-width: 8
64342+ fill-column: 80
64343+ End:
64344+*/
64345diff --git a/fs/reiser4/readahead.h b/fs/reiser4/readahead.h
64346new file mode 100644
64347index 0000000..524c574
64348--- /dev/null
64349+++ b/fs/reiser4/readahead.h
64350@@ -0,0 +1,48 @@
64351+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64352+
64353+#ifndef __READAHEAD_H__
64354+#define __READAHEAD_H__
64355+
64356+#include "key.h"
64357+
64358+typedef enum {
64359+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
64360+} ra_global_flags;
64361+
64362+/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
64363+typedef struct formatted_read_ahead_params {
64364+ unsigned long max; /* request not more than this amount of nodes. Default is totalram_pages / 4 */
64365+ int flags;
64366+} ra_params_t;
64367+
64368+typedef struct {
64369+ reiser4_key key_to_stop;
64370+} ra_info_t;
64371+
64372+void formatted_readahead(znode *, ra_info_t *);
64373+void reiser4_init_ra_info(ra_info_t * rai);
64374+
64375+struct reiser4_file_ra_state {
64376+ loff_t start; /* Current window */
64377+ loff_t size;
64378+ loff_t next_size; /* Next window size */
64379+ loff_t ahead_start; /* Ahead window */
64380+ loff_t ahead_size;
64381+ loff_t max_window_size; /* Maximum readahead window */
64382+ loff_t slow_start; /* enlarging r/a size algorithm. */
64383+};
64384+
64385+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
64386+
64387+/* __READAHEAD_H__ */
64388+#endif
64389+
64390+/*
64391+ Local variables:
64392+ c-indentation-style: "K&R"
64393+ mode-name: "LC"
64394+ c-basic-offset: 8
64395+ tab-width: 8
64396+ fill-column: 120
64397+ End:
64398+*/
64399diff --git a/fs/reiser4/reiser4.h b/fs/reiser4/reiser4.h
64400new file mode 100644
64401index 0000000..77d720e
64402--- /dev/null
64403+++ b/fs/reiser4/reiser4.h
64404@@ -0,0 +1,269 @@
64405+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64406+ * reiser4/README */
64407+
64408+/* definitions of common constants used by reiser4 */
64409+
64410+#if !defined( __REISER4_H__ )
64411+#define __REISER4_H__
64412+
64413+#include <asm/param.h> /* for HZ */
64414+#include <linux/errno.h>
64415+#include <linux/types.h>
64416+#include <linux/fs.h>
64417+#include <linux/hardirq.h>
64418+#include <linux/sched.h>
64419+
64420+/*
64421+ * reiser4 compilation options.
64422+ */
64423+
64424+#if defined(CONFIG_REISER4_DEBUG)
64425+/* turn on assertion checks */
64426+#define REISER4_DEBUG (1)
64427+#else
64428+#define REISER4_DEBUG (0)
64429+#endif
64430+
64431+#if defined(CONFIG_ZLIB_INFLATE)
64432+/* turn on zlib */
64433+#define REISER4_ZLIB (1)
64434+#else
64435+#define REISER4_ZLIB (0)
64436+#endif
64437+
64438+#if defined(CONFIG_CRYPTO_SHA256)
64439+#define REISER4_SHA256 (1)
64440+#else
64441+#define REISER4_SHA256 (0)
64442+#endif
64443+
64444+/*
64445+ * Turn on large keys mode. In his mode (which is default), reiser4 key has 4
64446+ * 8-byte components. In the old "small key" mode, it's 3 8-byte
64447+ * components. Additional component, referred to as "ordering" is used to
64448+ * order items from which given object is composed of. As such, ordering is
64449+ * placed between locality and objectid. For directory item ordering contains
64450+ * initial prefix of the file name this item is for. This sorts all directory
64451+ * items within given directory lexicographically (but see
64452+ * fibration.[ch]). For file body and stat-data, ordering contains initial
64453+ * prefix of the name file was initially created with. In the common case
64454+ * (files with single name) this allows to order file bodies and stat-datas in
64455+ * the same order as their respective directory entries, thus speeding up
64456+ * readdir.
64457+ *
64458+ * Note, that kernel can only mount file system with the same key size as one
64459+ * it is compiled for, so flipping this option may render your data
64460+ * inaccessible.
64461+ */
64462+#define REISER4_LARGE_KEY (1)
64463+/*#define REISER4_LARGE_KEY (0)*/
64464+
64465+/*#define GUESS_EXISTS 1*/
64466+
64467+/*
64468+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
64469+ * option
64470+ */
64471+
64472+extern const char *REISER4_SUPER_MAGIC_STRING;
64473+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
64474+ * beginning of device */
64475+
64476+/* here go tunable parameters that are not worth special entry in kernel
64477+ configuration */
64478+
64479+/* default number of slots in coord-by-key caches */
64480+#define CBK_CACHE_SLOTS (16)
64481+/* how many elementary tree operation to carry on the next level */
64482+#define CARRIES_POOL_SIZE (5)
64483+/* size of pool of preallocated nodes for carry process. */
64484+#define NODES_LOCKED_POOL_SIZE (5)
64485+
64486+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64487+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64488+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
64489+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
64490+
64491+/* we are supporting reservation of disk space on uid basis */
64492+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
64493+/* we are supporting reservation of disk space for groups */
64494+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
64495+/* we are supporting reservation of disk space for root */
64496+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
64497+/* we use rapid flush mode, see flush.c for comments. */
64498+#define REISER4_USE_RAPID_FLUSH (1)
64499+
64500+/*
64501+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
64502+ */
64503+#define REISER4_USE_ENTD (1)
64504+
64505+/* key allocation is Plan-A */
64506+#define REISER4_PLANA_KEY_ALLOCATION (1)
64507+/* key allocation follows good old 3.x scheme */
64508+#define REISER4_3_5_KEY_ALLOCATION (0)
64509+
64510+/* size of hash-table for znodes */
64511+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
64512+
64513+/* number of buckets in lnode hash-table */
64514+#define LNODE_HTABLE_BUCKETS (1024)
64515+
64516+/* some ridiculously high maximal limit on height of znode tree. This
64517+ is used in declaration of various per level arrays and
64518+ to allocate stattistics gathering array for per-level stats. */
64519+#define REISER4_MAX_ZTREE_HEIGHT (8)
64520+
64521+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
64522+
64523+/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
64524+ sequential search is on average faster than binary. This is because
64525+ of better optimization and because sequential search is more CPU
64526+ cache friendly. This number (25) was found by experiments on dual AMD
64527+ Athlon(tm), 1400MHz.
64528+
64529+ NOTE: testing in kernel has shown that binary search is more effective than
64530+ implied by results of the user level benchmarking. Probably because in the
64531+ node keys are separated by other data. So value was adjusted after few
64532+ tests. More thorough tuning is needed.
64533+*/
64534+#define REISER4_SEQ_SEARCH_BREAK (3)
64535+
64536+/* don't allow tree to be lower than this */
64537+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
64538+
64539+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
64540+ * available memory. */
64541+/* Default value of maximal atom size. Can be ovewritten by
64542+ tmgr.atom_max_size mount option. By default infinity. */
64543+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
64544+
64545+/* Default value of maximal atom age (in jiffies). After reaching this age
64546+ atom will be forced to commit, either synchronously or asynchronously. Can
64547+ be overwritten by tmgr.atom_max_age mount option. */
64548+#define REISER4_ATOM_MAX_AGE (600 * HZ)
64549+
64550+/* sleeping period for ktxnmrgd */
64551+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
64552+
64553+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
64554+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
64555+
64556+/* start complaining after that many restarts in coord_by_key().
64557+
64558+ This either means incredibly heavy contention for this part of a tree, or
64559+ some corruption or bug.
64560+*/
64561+#define REISER4_CBK_ITERATIONS_LIMIT (100)
64562+
64563+/* return -EIO after that many iterations in coord_by_key().
64564+
64565+ I have witnessed more than 800 iterations (in 30 thread test) before cbk
64566+ finished. --nikita
64567+*/
64568+#define REISER4_MAX_CBK_ITERATIONS 500000
64569+
64570+/* put a per-inode limit on maximal number of directory entries with identical
64571+ keys in hashed directory.
64572+
64573+ Disable this until inheritance interfaces stabilize: we need some way to
64574+ set per directory limit.
64575+*/
64576+#define REISER4_USE_COLLISION_LIMIT (0)
64577+
64578+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
64579+ will force them to be relocated. */
64580+#define FLUSH_RELOCATE_THRESHOLD 64
64581+/* If flush finds can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
64582+ from the preceder it will relocate to that position. */
64583+#define FLUSH_RELOCATE_DISTANCE 64
64584+
64585+/* If we have written this much or more blocks before encountering busy jnode
64586+ in flush list - abort flushing hoping that next time we get called
64587+ this jnode will be clean already, and we will save some seeks. */
64588+#define FLUSH_WRITTEN_THRESHOLD 50
64589+
64590+/* The maximum number of nodes to scan left on a level during flush. */
64591+#define FLUSH_SCAN_MAXNODES 10000
64592+
64593+/* per-atom limit of flushers */
64594+#define ATOM_MAX_FLUSHERS (1)
64595+
64596+/* default tracing buffer size */
64597+#define REISER4_TRACE_BUF_SIZE (1 << 15)
64598+
64599+/* what size units of IO we would like cp, etc., to use, in writing to
64600+ reiser4. In bytes.
64601+
64602+ Can be overwritten by optimal_io_size mount option.
64603+*/
64604+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
64605+
64606+/* see comments in inode.c:oid_to_uino() */
64607+#define REISER4_UINO_SHIFT (1 << 30)
64608+
64609+/* Mark function argument as unused to avoid compiler warnings. */
64610+#define UNUSED_ARG __attribute__((unused))
64611+
64612+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
64613+#define NONNULL __attribute__((nonnull))
64614+#else
64615+#define NONNULL
64616+#endif
64617+
64618+/* master super block offset in bytes.*/
64619+#define REISER4_MASTER_OFFSET 65536
64620+
64621+/* size of VFS block */
64622+#define VFS_BLKSIZE 512
64623+/* number of bits in size of VFS block (512==2^9) */
64624+#define VFS_BLKSIZE_BITS 9
64625+
64626+#define REISER4_I reiser4_inode_data
64627+
64628+/* implication */
64629+#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
64630+/* logical equivalence */
64631+#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
64632+
64633+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
64634+
64635+#define NOT_YET (0)
64636+
64637+/** Reiser4 specific error codes **/
64638+
64639+#define REISER4_ERROR_CODE_BASE 500
64640+
64641+/* Neighbor is not available (side neighbor or parent) */
64642+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
64643+
64644+/* Node was not found in cache */
64645+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
64646+
64647+/* node has no free space enough for completion of balancing operation */
64648+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
64649+
64650+/* repeat operation */
64651+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
64652+
64653+/* deadlock happens */
64654+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
64655+
64656+/* operation cannot be performed, because it would block and non-blocking mode
64657+ * was requested. */
64658+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
64659+
64660+/* wait some event (depends on context), then repeat */
64661+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
64662+
64663+#endif /* __REISER4_H__ */
64664+
64665+/* Make Linus happy.
64666+ Local variables:
64667+ c-indentation-style: "K&R"
64668+ mode-name: "LC"
64669+ c-basic-offset: 8
64670+ tab-width: 8
64671+ fill-column: 120
64672+ End:
64673+*/
64674diff --git a/fs/reiser4/safe_link.c b/fs/reiser4/safe_link.c
64675new file mode 100644
64676index 0000000..1253bdb
64677--- /dev/null
64678+++ b/fs/reiser4/safe_link.c
64679@@ -0,0 +1,351 @@
64680+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
64681+ * reiser4/README */
64682+
64683+/* Safe-links. */
64684+
64685+/*
64686+ * Safe-links are used to maintain file system consistency during operations
64687+ * that spawns multiple transactions. For example:
64688+ *
64689+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is files
64690+ * without user-visible names in the file system, but still opened by some
64691+ * active process. What happens here is that unlink proper (i.e., removal
64692+ * of the last file name) and file deletion (truncate of file body to zero
64693+ * and deletion of stat-data, that happens when last file descriptor is
64694+ * closed), may belong to different transactions T1 and T2. If a crash
64695+ * happens after T1 commit, but before T2 commit, on-disk file system has
64696+ * a file without name, that is, disk space leak.
64697+ *
64698+ * 2. Truncate. Truncate of large file may spawn multiple transactions. If
64699+ * system crashes while truncate was in-progress, file is left partially
64700+ * truncated, which violates "atomicity guarantees" of reiser4, viz. that
64701+ * every system is atomic.
64702+ *
64703+ * Safe-links address both above cases. Basically, safe-link is a way post
64704+ * some operation to be executed during commit of some other transaction than
64705+ * current one. (Another way to look at the safe-link is to interpret it as a
64706+ * logical logging.)
64707+ *
64708+ * Specifically, at the beginning of unlink safe-link in inserted in the
64709+ * tree. This safe-link is normally removed by file deletion code (during
64710+ * transaction T2 in the above terms). Truncate also inserts safe-link that is
64711+ * normally removed when truncate operation is finished.
64712+ *
64713+ * This means, that in the case of "clean umount" there are no safe-links in
64714+ * the tree. If safe-links are observed during mount, it means that (a) system
64715+ * was terminated abnormally, and (b) safe-link correspond to the "pending"
64716+ * (i.e., not finished) operations that were in-progress during system
64717+ * termination. Each safe-link record enough information to complete
64718+ * corresponding operation, and mount simply "replays" them (hence, the
64719+ * analogy with the logical logging).
64720+ *
64721+ * Safe-links are implemented as blackbox items (see
64722+ * plugin/item/blackbox.[ch]).
64723+ *
64724+ * For the reference: ext3 also has similar mechanism, it's called "an orphan
64725+ * list" there.
64726+ */
64727+
64728+#include "safe_link.h"
64729+#include "debug.h"
64730+#include "inode.h"
64731+
64732+#include "plugin/item/blackbox.h"
64733+
64734+#include <linux/fs.h>
64735+
64736+/*
64737+ * On-disk format of safe-link.
64738+ */
64739+typedef struct safelink {
64740+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
64741+ * for */
64742+ d64 size; /* size to which file should be truncated */
64743+} safelink_t;
64744+
64745+/*
64746+ * locality where safe-link items are stored. Next to the objectid of root
64747+ * directory.
64748+ */
64749+static oid_t safe_link_locality(reiser4_tree * tree)
64750+{
64751+ return get_key_objectid(get_super_private(tree->super)->df_plug->
64752+ root_dir_key(tree->super)) + 1;
64753+}
64754+
64755+/*
64756+ Construct a key for the safe-link. Key has the following format:
64757+
64758+| 60 | 4 | 64 | 4 | 60 | 64 |
64759++---------------+---+------------------+---+---------------+------------------+
64760+| locality | 0 | 0 | 0 | objectid | link type |
64761++---------------+---+------------------+---+---------------+------------------+
64762+| | | | |
64763+| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
64764+
64765+ This is in large keys format. In small keys format second 8 byte chunk is
64766+ out. Locality is a constant returned by safe_link_locality(). objectid is
64767+ an oid of a file on which operation protected by this safe-link is
64768+ performed. link-type is used to distinguish safe-links for different
64769+ operations.
64770+
64771+ */
64772+static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64773+ reiser4_safe_link_t link, reiser4_key * key)
64774+{
64775+ reiser4_key_init(key);
64776+ set_key_locality(key, safe_link_locality(tree));
64777+ set_key_objectid(key, oid);
64778+ set_key_offset(key, link);
64779+ return key;
64780+}
64781+
64782+/*
64783+ * how much disk space is necessary to insert and remove (in the
64784+ * error-handling path) safe-link.
64785+ */
64786+static __u64 safe_link_tograb(reiser4_tree * tree)
64787+{
64788+ return
64789+ /* insert safe link */
64790+ estimate_one_insert_item(tree) +
64791+ /* remove safe link */
64792+ estimate_one_item_removal(tree) +
64793+ /* drill to the leaf level during insertion */
64794+ 1 + estimate_one_insert_item(tree) +
64795+ /*
64796+ * possible update of existing safe-link. Actually, if
64797+ * safe-link existed already (we failed to remove it), then no
64798+ * insertion is necessary, so this term is already "covered",
64799+ * but for simplicity let's left it.
64800+ */
64801+ 1;
64802+}
64803+
64804+/*
64805+ * grab enough disk space to insert and remove (in the error-handling path)
64806+ * safe-link.
64807+ */
64808+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64809+{
64810+ int result;
64811+
64812+ grab_space_enable();
64813+ /* The sbinfo->delete_mutex can be taken here.
64814+ * safe_link_release() should be called before leaving reiser4
64815+ * context. */
64816+ result =
64817+ reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64818+ grab_space_enable();
64819+ return result;
64820+}
64821+
64822+/*
64823+ * release unused disk space reserved by safe_link_grab().
64824+ */
64825+void safe_link_release(reiser4_tree * tree)
64826+{
64827+ reiser4_release_reserved(tree->super);
64828+}
64829+
64830+/*
64831+ * insert into tree safe-link for operation @link on inode @inode.
64832+ */
64833+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64834+{
64835+ reiser4_key key;
64836+ safelink_t sl;
64837+ int length;
64838+ int result;
64839+ reiser4_tree *tree;
64840+
64841+ build_sd_key(inode, &sl.sdkey);
64842+ length = sizeof sl.sdkey;
64843+
64844+ if (link == SAFE_TRUNCATE) {
64845+ /*
64846+ * for truncate we have to store final file length also,
64847+ * expand item.
64848+ */
64849+ length += sizeof(sl.size);
64850+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64851+ }
64852+ tree = reiser4_tree_by_inode(inode);
64853+ build_link_key(tree, get_inode_oid(inode), link, &key);
64854+
64855+ result = store_black_box(tree, &key, &sl, length);
64856+ if (result == -EEXIST)
64857+ result = update_black_box(tree, &key, &sl, length);
64858+ return result;
64859+}
64860+
64861+/*
64862+ * remove safe-link corresponding to the operation @link on inode @inode from
64863+ * the tree.
64864+ */
64865+int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64866+{
64867+ reiser4_key key;
64868+
64869+ return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64870+}
64871+
64872+/*
64873+ * in-memory structure to keep information extracted from safe-link. This is
64874+ * used to iterate over all safe-links.
64875+ */
64876+typedef struct {
64877+ reiser4_tree *tree; /* internal tree */
64878+ reiser4_key key; /* safe-link key */
64879+ reiser4_key sdkey; /* key of object stat-data */
64880+ reiser4_safe_link_t link; /* safe-link type */
64881+ oid_t oid; /* object oid */
64882+ __u64 size; /* final size for truncate */
64883+} safe_link_context;
64884+
64885+/*
64886+ * start iterating over all safe-links.
64887+ */
64888+static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx)
64889+{
64890+ ctx->tree = tree;
64891+ reiser4_key_init(&ctx->key);
64892+ set_key_locality(&ctx->key, safe_link_locality(tree));
64893+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
64894+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
64895+}
64896+
64897+/*
64898+ * return next safe-link.
64899+ */
64900+static int safe_link_iter_next(safe_link_context * ctx)
64901+{
64902+ int result;
64903+ safelink_t sl;
64904+
64905+ result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64906+ if (result == 0) {
64907+ ctx->oid = get_key_objectid(&ctx->key);
64908+ ctx->link = get_key_offset(&ctx->key);
64909+ ctx->sdkey = sl.sdkey;
64910+ if (ctx->link == SAFE_TRUNCATE)
64911+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64912+ }
64913+ return result;
64914+}
64915+
64916+/*
64917+ * check are there any more safe-links left in the tree.
64918+ */
64919+static int safe_link_iter_finished(safe_link_context * ctx)
64920+{
64921+ return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64922+}
64923+
64924+/*
64925+ * finish safe-link iteration.
64926+ */
64927+static void safe_link_iter_end(safe_link_context * ctx)
64928+{
64929+ /* nothing special */
64930+}
64931+
64932+/*
64933+ * process single safe-link.
64934+ */
64935+static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64936+ reiser4_key * sdkey, oid_t oid, __u64 size)
64937+{
64938+ struct inode *inode;
64939+ int result;
64940+
64941+ /*
64942+ * obtain object inode by reiser4_iget(), then call object plugin
64943+ * ->safelink() method to do actual work, then delete safe-link on
64944+ * success.
64945+ */
64946+ inode = reiser4_iget(super, sdkey, 1);
64947+ if (!IS_ERR(inode)) {
64948+ file_plugin *fplug;
64949+
64950+ fplug = inode_file_plugin(inode);
64951+ assert("nikita-3428", fplug != NULL);
64952+ assert("", oid == get_inode_oid(inode));
64953+ if (fplug->safelink != NULL) {
64954+ /* reiser4_txn_restart_current is not necessary because
64955+ * mounting is signle thread. However, without it
64956+ * deadlock detection code will complain (see
64957+ * nikita-3361). */
64958+ reiser4_txn_restart_current();
64959+ result = fplug->safelink(inode, link, size);
64960+ } else {
64961+ warning("nikita-3430",
64962+ "Cannot handle safelink for %lli",
64963+ (unsigned long long)oid);
64964+ reiser4_print_key("key", sdkey);
64965+ result = 0;
64966+ }
64967+ if (result != 0) {
64968+ warning("nikita-3431",
64969+ "Error processing safelink for %lli: %i",
64970+ (unsigned long long)oid, result);
64971+ }
64972+ reiser4_iget_complete(inode);
64973+ iput(inode);
64974+ if (result == 0) {
64975+ result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
64976+ if (result == 0)
64977+ result =
64978+ safe_link_del(reiser4_get_tree(super), oid, link);
64979+ safe_link_release(reiser4_get_tree(super));
64980+ /*
64981+ * restart transaction: if there was large number of
64982+ * safe-links, their processing may fail to fit into
64983+ * single transaction.
64984+ */
64985+ if (result == 0)
64986+ reiser4_txn_restart_current();
64987+ }
64988+ } else
64989+ result = PTR_ERR(inode);
64990+ return result;
64991+}
64992+
64993+/*
64994+ * iterate over all safe-links in the file-system processing them one by one.
64995+ */
64996+int process_safelinks(struct super_block *super)
64997+{
64998+ safe_link_context ctx;
64999+ int result;
65000+
65001+ if (rofs_super(super))
65002+ /* do nothing on the read-only file system */
65003+ return 0;
65004+ safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
65005+ result = 0;
65006+ do {
65007+ result = safe_link_iter_next(&ctx);
65008+ if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
65009+ result = 0;
65010+ break;
65011+ }
65012+ if (result == 0)
65013+ result = process_safelink(super, ctx.link,
65014+ &ctx.sdkey, ctx.oid,
65015+ ctx.size);
65016+ } while (result == 0);
65017+ safe_link_iter_end(&ctx);
65018+ return result;
65019+}
65020+
65021+/* Make Linus happy.
65022+ Local variables:
65023+ c-indentation-style: "K&R"
65024+ mode-name: "LC"
65025+ c-basic-offset: 8
65026+ tab-width: 8
65027+ fill-column: 120
65028+ scroll-step: 1
65029+ End:
65030+*/
65031diff --git a/fs/reiser4/safe_link.h b/fs/reiser4/safe_link.h
65032new file mode 100644
65033index 0000000..7ae4458
65034--- /dev/null
65035+++ b/fs/reiser4/safe_link.h
65036@@ -0,0 +1,29 @@
65037+/* Copyright 2003 by Hans Reiser, licensing governed by
65038+ * reiser4/README */
65039+
65040+/* Safe-links. See safe_link.c for details. */
65041+
65042+#if !defined( __FS_SAFE_LINK_H__ )
65043+#define __FS_SAFE_LINK_H__
65044+
65045+#include "tree.h"
65046+
65047+int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
65048+void safe_link_release(reiser4_tree * tree);
65049+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
65050+int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
65051+
65052+int process_safelinks(struct super_block *super);
65053+
65054+/* __FS_SAFE_LINK_H__ */
65055+#endif
65056+
65057+/* Make Linus happy.
65058+ Local variables:
65059+ c-indentation-style: "K&R"
65060+ mode-name: "LC"
65061+ c-basic-offset: 8
65062+ tab-width: 8
65063+ fill-column: 120
65064+ End:
65065+*/
65066diff --git a/fs/reiser4/seal.c b/fs/reiser4/seal.c
65067new file mode 100644
65068index 0000000..c91cf52
65069--- /dev/null
65070+++ b/fs/reiser4/seal.c
65071@@ -0,0 +1,218 @@
65072+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
65073+/* Seals implementation. */
65074+/* Seals are "weak" tree pointers. They are analogous to tree coords in
65075+ allowing to bypass tree traversal. But normal usage of coords implies that
65076+ node pointed to by coord is locked, whereas seals don't keep a lock (or
65077+ even a reference) to znode. In stead, each znode contains a version number,
65078+ increased on each znode modification. This version number is copied into a
65079+ seal when seal is created. Later, one can "validate" seal by calling
65080+ reiser4_seal_validate(). If znode is in cache and its version number is
65081+ still the same, seal is "pristine" and coord associated with it can be
65082+ re-used immediately.
65083+
65084+ If, on the other hand, znode is out of cache, or it is obviously different
65085+ one from the znode seal was initially attached to (for example, it is on
65086+ the different level, or is being removed from the tree), seal is
65087+ irreparably invalid ("burned") and tree traversal has to be repeated.
65088+
65089+ Otherwise, there is some hope, that while znode was modified (and seal was
65090+ "broken" as a result), key attached to the seal is still in the node. This
65091+ is checked by first comparing this key with delimiting keys of node and, if
65092+ key is ok, doing intra-node lookup.
65093+
65094+ Znode version is maintained in the following way:
65095+
65096+ there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
65097+ znode_epoch is incremented and its new value is stored in ->version field
65098+ of new znode. Whenever znode is dirtied (which means it was probably
65099+ modified), znode_epoch is also incremented and its new value is stored in
65100+ znode->version. This is done so, because just incrementing znode->version
65101+ on each update is not enough: it may so happen, that znode get deleted, new
65102+ znode is allocated for the same disk block and gets the same version
65103+ counter, tricking seal code into false positive.
65104+*/
65105+
65106+#include "forward.h"
65107+#include "debug.h"
65108+#include "key.h"
65109+#include "coord.h"
65110+#include "seal.h"
65111+#include "plugin/item/item.h"
65112+#include "plugin/node/node.h"
65113+#include "jnode.h"
65114+#include "znode.h"
65115+#include "super.h"
65116+
65117+static znode *seal_node(const seal_t * seal);
65118+static int seal_matches(const seal_t * seal, znode * node);
65119+
65120+/* initialise seal. This can be called several times on the same seal. @coord
65121+ and @key can be NULL. */
65122+void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
65123+ const coord_t * coord /* coord @seal will be
65124+ * attached to */ ,
65125+ const reiser4_key * key UNUSED_ARG /* key @seal will be
65126+ * attached to */ )
65127+{
65128+ assert("nikita-1886", seal != NULL);
65129+ memset(seal, 0, sizeof *seal);
65130+ if (coord != NULL) {
65131+ znode *node;
65132+
65133+ node = coord->node;
65134+ assert("nikita-1987", node != NULL);
65135+ spin_lock_znode(node);
65136+ seal->version = node->version;
65137+ assert("nikita-1988", seal->version != 0);
65138+ seal->block = *znode_get_block(node);
65139+#if REISER4_DEBUG
65140+ seal->coord1 = *coord;
65141+ if (key != NULL)
65142+ seal->key = *key;
65143+#endif
65144+ spin_unlock_znode(node);
65145+ }
65146+}
65147+
65148+/* finish with seal */
65149+void reiser4_seal_done(seal_t * seal /* seal to clear */ )
65150+{
65151+ assert("nikita-1887", seal != NULL);
65152+ seal->version = 0;
65153+}
65154+
65155+/* true if seal was initialised */
65156+int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
65157+{
65158+ assert("nikita-1890", seal != NULL);
65159+ return seal->version != 0;
65160+}
65161+
65162+#if REISER4_DEBUG
65163+/* helper function for reiser4_seal_validate(). It checks that item at @coord
65164+ * has expected key. This is to detect cases where node was modified but wasn't
65165+ * marked dirty. */
65166+static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
65167+ const reiser4_key * k /* expected key */ )
65168+{
65169+ reiser4_key ukey;
65170+
65171+ return (coord->between != AT_UNIT) ||
65172+ /* FIXME-VS: we only can compare keys for items whose units
65173+ represent exactly one key */
65174+ ((coord_is_existing_unit(coord))
65175+ && (item_is_extent(coord)
65176+ || keyeq(k, unit_key_by_coord(coord, &ukey))))
65177+ || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
65178+ && keyge(k, unit_key_by_coord(coord, &ukey)));
65179+}
65180+#endif
65181+
65182+/* this is used by reiser4_seal_validate. It accepts return value of
65183+ * longterm_lock_znode and returns 1 if it can be interpreted as seal
65184+ * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
65185+ * reiser4_seal_validate returns -E_REPEAT and caller will call tre search.
65186+ * We cannot do this in longterm_lock_znode(), because sometimes we want to
65187+ * distinguish between -EINVAL and -E_REPEAT. */
65188+static int should_repeat(int return_code)
65189+{
65190+ return return_code == -EINVAL;
65191+}
65192+
65193+/* (re-)validate seal.
65194+
65195+ Checks whether seal is pristine, and try to revalidate it if possible.
65196+
65197+ If seal was burned, or broken irreparably, return -E_REPEAT.
65198+
65199+ NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are
65200+ looking for is in range of keys covered by the sealed node, but item wasn't
65201+ found by node ->lookup() method. Alternative is to return -ENOENT in this
65202+ case, but this would complicate callers logic.
65203+
65204+*/
65205+int reiser4_seal_validate(seal_t * seal /* seal to validate */,
65206+ coord_t * coord /* coord to validate against */,
65207+ const reiser4_key * key /* key to validate against */,
65208+ lock_handle * lh /* resulting lock handle */,
65209+ znode_lock_mode mode /* lock node */,
65210+ znode_lock_request request /* locking priority */)
65211+{
65212+ znode *node;
65213+ int result;
65214+
65215+ assert("nikita-1889", seal != NULL);
65216+ assert("nikita-1881", reiser4_seal_is_set(seal));
65217+ assert("nikita-1882", key != NULL);
65218+ assert("nikita-1883", coord != NULL);
65219+ assert("nikita-1884", lh != NULL);
65220+ assert("nikita-1885", keyeq(&seal->key, key));
65221+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
65222+
65223+ /* obtain znode by block number */
65224+ node = seal_node(seal);
65225+ if (node != NULL) {
65226+ /* znode was in cache, lock it */
65227+ result = longterm_lock_znode(lh, node, mode, request);
65228+ zput(node);
65229+ if (result == 0) {
65230+ if (seal_matches(seal, node)) {
65231+ /* if seal version and znode version
65232+ coincide */
65233+ ON_DEBUG(coord_update_v(coord));
65234+ assert("nikita-1990",
65235+ node == seal->coord1.node);
65236+ assert("nikita-1898",
65237+ WITH_DATA_RET(coord->node, 1,
65238+ check_seal_match(coord,
65239+ key)));
65240+ } else
65241+ result = RETERR(-E_REPEAT);
65242+ }
65243+ if (result != 0) {
65244+ if (should_repeat(result))
65245+ result = RETERR(-E_REPEAT);
65246+ /* unlock node on failure */
65247+ done_lh(lh);
65248+ }
65249+ } else {
65250+ /* znode wasn't in cache */
65251+ result = RETERR(-E_REPEAT);
65252+ }
65253+ return result;
65254+}
65255+
65256+/* helpers functions */
65257+
65258+/* obtain reference to znode seal points to, if in cache */
65259+static znode *seal_node(const seal_t * seal /* seal to query */ )
65260+{
65261+ assert("nikita-1891", seal != NULL);
65262+ return zlook(current_tree, &seal->block);
65263+}
65264+
65265+/* true if @seal version and @node version coincide */
65266+static int seal_matches(const seal_t * seal /* seal to check */ ,
65267+ znode * node /* node to check */ )
65268+{
65269+ int result;
65270+
65271+ assert("nikita-1991", seal != NULL);
65272+ assert("nikita-1993", node != NULL);
65273+
65274+ spin_lock_znode(node);
65275+ result = (seal->version == node->version);
65276+ spin_unlock_znode(node);
65277+ return result;
65278+}
65279+
65280+/* Make Linus happy.
65281+ Local variables:
65282+ c-indentation-style: "K&R"
65283+ mode-name: "LC"
65284+ c-basic-offset: 8
65285+ tab-width: 8
65286+ fill-column: 120
65287+ scroll-step: 1
65288+ End:
65289+*/
65290diff --git a/fs/reiser4/seal.h b/fs/reiser4/seal.h
65291new file mode 100644
65292index 0000000..5c3c5e0
65293--- /dev/null
65294+++ b/fs/reiser4/seal.h
65295@@ -0,0 +1,49 @@
65296+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
65297+
65298+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
65299+
65300+#ifndef __SEAL_H__
65301+#define __SEAL_H__
65302+
65303+#include "forward.h"
65304+#include "debug.h"
65305+#include "dformat.h"
65306+#include "key.h"
65307+#include "coord.h"
65308+
65309+/* for __u?? types */
65310+/*#include <linux/types.h>*/
65311+
65312+/* seal. See comment at the top of seal.c */
65313+typedef struct seal_s {
65314+ /* version of znode recorder at the time of seal creation */
65315+ __u64 version;
65316+ /* block number of znode attached to this seal */
65317+ reiser4_block_nr block;
65318+#if REISER4_DEBUG
65319+ /* coord this seal is attached to. For debugging. */
65320+ coord_t coord1;
65321+ /* key this seal is attached to. For debugging. */
65322+ reiser4_key key;
65323+#endif
65324+} seal_t;
65325+
65326+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
65327+extern void reiser4_seal_done(seal_t *);
65328+extern int reiser4_seal_is_set(const seal_t *);
65329+extern int reiser4_seal_validate(seal_t *, coord_t *,
65330+ const reiser4_key *, lock_handle *,
65331+ znode_lock_mode mode, znode_lock_request request);
65332+
65333+/* __SEAL_H__ */
65334+#endif
65335+
65336+/* Make Linus happy.
65337+ Local variables:
65338+ c-indentation-style: "K&R"
65339+ mode-name: "LC"
65340+ c-basic-offset: 8
65341+ tab-width: 8
65342+ fill-column: 120
65343+ End:
65344+*/
65345diff --git a/fs/reiser4/search.c b/fs/reiser4/search.c
65346new file mode 100644
65347index 0000000..9d35e11
65348--- /dev/null
65349+++ b/fs/reiser4/search.c
65350@@ -0,0 +1,1611 @@
65351+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65352+ * reiser4/README */
65353+
65354+#include "forward.h"
65355+#include "debug.h"
65356+#include "dformat.h"
65357+#include "key.h"
65358+#include "coord.h"
65359+#include "seal.h"
65360+#include "plugin/item/item.h"
65361+#include "plugin/node/node.h"
65362+#include "plugin/plugin.h"
65363+#include "jnode.h"
65364+#include "znode.h"
65365+#include "block_alloc.h"
65366+#include "tree_walk.h"
65367+#include "tree.h"
65368+#include "reiser4.h"
65369+#include "super.h"
65370+#include "inode.h"
65371+
65372+#include <linux/slab.h>
65373+
65374+static const char *bias_name(lookup_bias bias);
65375+
65376+/* tree searching algorithm, intranode searching algorithms are in
65377+ plugin/node/ */
65378+
65379+/* tree lookup cache
65380+ *
65381+ * The coord by key cache consists of small list of recently accessed nodes
65382+ * maintained according to the LRU discipline. Before doing real top-to-down
65383+ * tree traversal this cache is scanned for nodes that can contain key
65384+ * requested.
65385+ *
65386+ * The efficiency of coord cache depends heavily on locality of reference for
65387+ * tree accesses. Our user level simulations show reasonably good hit ratios
65388+ * for coord cache under most loads so far.
65389+ */
65390+
65391+/* Initialise coord cache slot */
65392+static void cbk_cache_init_slot(cbk_cache_slot *slot)
65393+{
65394+ assert("nikita-345", slot != NULL);
65395+
65396+ INIT_LIST_HEAD(&slot->lru);
65397+ slot->node = NULL;
65398+}
65399+
65400+/* Initialize coord cache */
65401+int cbk_cache_init(cbk_cache *cache /* cache to init */ )
65402+{
65403+ int i;
65404+
65405+ assert("nikita-346", cache != NULL);
65406+
65407+ cache->slot =
65408+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
65409+ reiser4_ctx_gfp_mask_get());
65410+ if (cache->slot == NULL)
65411+ return RETERR(-ENOMEM);
65412+
65413+ INIT_LIST_HEAD(&cache->lru);
65414+ for (i = 0; i < cache->nr_slots; ++i) {
65415+ cbk_cache_init_slot(cache->slot + i);
65416+ list_add_tail(&((cache->slot + i)->lru), &cache->lru);
65417+ }
65418+ rwlock_init(&cache->guard);
65419+ return 0;
65420+}
65421+
65422+/* free cbk cache data */
65423+void cbk_cache_done(cbk_cache * cache /* cache to release */ )
65424+{
65425+ assert("nikita-2493", cache != NULL);
65426+ if (cache->slot != NULL) {
65427+ kfree(cache->slot);
65428+ cache->slot = NULL;
65429+ }
65430+}
65431+
65432+/* macro to iterate over all cbk cache slots */
65433+#define for_all_slots(cache, slot) \
65434+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
65435+ &(cache)->lru != &(slot)->lru; \
65436+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
65437+
65438+#if REISER4_DEBUG
65439+/* this function assures that [cbk-cache-invariant] invariant holds */
65440+static int cbk_cache_invariant(const cbk_cache *cache)
65441+{
65442+ cbk_cache_slot *slot;
65443+ int result;
65444+ int unused;
65445+
65446+ if (cache->nr_slots == 0)
65447+ return 1;
65448+
65449+ assert("nikita-2469", cache != NULL);
65450+ unused = 0;
65451+ result = 1;
65452+ read_lock(&((cbk_cache *)cache)->guard);
65453+ for_all_slots(cache, slot) {
65454+ /* in LRU first go all `used' slots followed by `unused' */
65455+ if (unused && (slot->node != NULL))
65456+ result = 0;
65457+ if (slot->node == NULL)
65458+ unused = 1;
65459+ else {
65460+ cbk_cache_slot *scan;
65461+
65462+ /* all cached nodes are different */
65463+ scan = slot;
65464+ while (result) {
65465+ scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
65466+ if (&cache->lru == &scan->lru)
65467+ break;
65468+ if (slot->node == scan->node)
65469+ result = 0;
65470+ }
65471+ }
65472+ if (!result)
65473+ break;
65474+ }
65475+ read_unlock(&((cbk_cache *)cache)->guard);
65476+ return result;
65477+}
65478+
65479+#endif
65480+
65481+/* Remove references, if any, to @node from coord cache */
65482+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
65483+ reiser4_tree * tree /* tree to remove node from */ )
65484+{
65485+ cbk_cache_slot *slot;
65486+ cbk_cache *cache;
65487+ int i;
65488+
65489+ assert("nikita-350", node != NULL);
65490+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
65491+
65492+ cache = &tree->cbk_cache;
65493+ assert("nikita-2470", cbk_cache_invariant(cache));
65494+
65495+ write_lock(&(cache->guard));
65496+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65497+ if (slot->node == node) {
65498+ list_move_tail(&slot->lru, &cache->lru);
65499+ slot->node = NULL;
65500+ break;
65501+ }
65502+ }
65503+ write_unlock(&(cache->guard));
65504+ assert("nikita-2471", cbk_cache_invariant(cache));
65505+}
65506+
65507+/* add to the cbk-cache in the "tree" information about "node". This
65508+ can actually be update of existing slot in a cache. */
65509+static void cbk_cache_add(const znode *node /* node to add to the cache */ )
65510+{
65511+ cbk_cache *cache;
65512+ cbk_cache_slot *slot;
65513+ int i;
65514+
65515+ assert("nikita-352", node != NULL);
65516+
65517+ cache = &znode_get_tree(node)->cbk_cache;
65518+ assert("nikita-2472", cbk_cache_invariant(cache));
65519+
65520+ if (cache->nr_slots == 0)
65521+ return;
65522+
65523+ write_lock(&(cache->guard));
65524+ /* find slot to update/add */
65525+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65526+ /* oops, this node is already in a cache */
65527+ if (slot->node == node)
65528+ break;
65529+ }
65530+ /* if all slots are used, reuse least recently used one */
65531+ if (i == cache->nr_slots) {
65532+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
65533+ slot->node = (znode *) node;
65534+ }
65535+ list_move(&slot->lru, &cache->lru);
65536+ write_unlock(&(cache->guard));
65537+ assert("nikita-2473", cbk_cache_invariant(cache));
65538+}
65539+
65540+static int setup_delimiting_keys(cbk_handle * h);
65541+static lookup_result coord_by_handle(cbk_handle * handle);
65542+static lookup_result traverse_tree(cbk_handle * h);
65543+static int cbk_cache_search(cbk_handle * h);
65544+
65545+static level_lookup_result cbk_level_lookup(cbk_handle * h);
65546+static level_lookup_result cbk_node_lookup(cbk_handle * h);
65547+
65548+/* helper functions */
65549+
65550+static void update_stale_dk(reiser4_tree * tree, znode * node);
65551+
65552+/* release parent node during traversal */
65553+static void put_parent(cbk_handle * h);
65554+/* check consistency of fields */
65555+static int sanity_check(cbk_handle * h);
65556+/* release resources in handle */
65557+static void hput(cbk_handle * h);
65558+
65559+static level_lookup_result search_to_left(cbk_handle * h);
65560+
65561+/* pack numerous (numberous I should say) arguments of coord_by_key() into
65562+ * cbk_handle */
65563+static cbk_handle *cbk_pack(cbk_handle * handle,
65564+ reiser4_tree * tree,
65565+ const reiser4_key * key,
65566+ coord_t * coord,
65567+ lock_handle * active_lh,
65568+ lock_handle * parent_lh,
65569+ znode_lock_mode lock_mode,
65570+ lookup_bias bias,
65571+ tree_level lock_level,
65572+ tree_level stop_level,
65573+ __u32 flags, ra_info_t * info)
65574+{
65575+ memset(handle, 0, sizeof *handle);
65576+
65577+ handle->tree = tree;
65578+ handle->key = key;
65579+ handle->lock_mode = lock_mode;
65580+ handle->bias = bias;
65581+ handle->lock_level = lock_level;
65582+ handle->stop_level = stop_level;
65583+ handle->coord = coord;
65584+ /* set flags. See comment in tree.h:cbk_flags */
65585+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
65586+
65587+ handle->active_lh = active_lh;
65588+ handle->parent_lh = parent_lh;
65589+ handle->ra_info = info;
65590+ return handle;
65591+}
65592+
65593+/* main tree lookup procedure
65594+
65595+ Check coord cache. If key we are looking for is not found there, call cbk()
65596+ to do real tree traversal.
65597+
65598+ As we have extents on the twig level, @lock_level and @stop_level can
65599+ be different from LEAF_LEVEL and each other.
65600+
65601+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
65602+ long term locks) while calling this.
65603+*/
65604+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
65605+ * in. Usually this tree is
65606+ * part of file-system
65607+ * super-block */ ,
65608+ const reiser4_key * key /* key to look for */ ,
65609+ coord_t * coord /* where to store found
65610+ * position in a tree. Fields
65611+ * in "coord" are only valid if
65612+ * coord_by_key() returned
65613+ * "CBK_COORD_FOUND" */ ,
65614+ lock_handle * lh, /* resulting lock handle */
65615+ znode_lock_mode lock_mode /* type of lookup we
65616+ * want on node. Pass
65617+ * ZNODE_READ_LOCK here
65618+ * if you only want to
65619+ * read item found and
65620+ * ZNODE_WRITE_LOCK if
65621+ * you want to modify
65622+ * it */ ,
65623+ lookup_bias bias /* what to return if coord
65624+ * with exactly the @key is
65625+ * not in the tree */ ,
65626+ tree_level lock_level /* tree level where to start
65627+ * taking @lock type of
65628+ * locks */ ,
65629+ tree_level stop_level /* tree level to stop. Pass
65630+ * LEAF_LEVEL or TWIG_LEVEL
65631+ * here Item being looked
65632+ * for has to be between
65633+ * @lock_level and
65634+ * @stop_level, inclusive */ ,
65635+ __u32 flags /* search flags */ ,
65636+ ra_info_t *
65637+ info
65638+ /* information about desired tree traversal readahead */
65639+ )
65640+{
65641+ cbk_handle handle;
65642+ lock_handle parent_lh;
65643+ lookup_result result;
65644+
65645+ init_lh(lh);
65646+ init_lh(&parent_lh);
65647+
65648+ assert("nikita-3023", reiser4_schedulable());
65649+
65650+ assert("nikita-353", tree != NULL);
65651+ assert("nikita-354", key != NULL);
65652+ assert("nikita-355", coord != NULL);
65653+ assert("nikita-356", (bias == FIND_EXACT)
65654+ || (bias == FIND_MAX_NOT_MORE_THAN));
65655+ assert("nikita-357", stop_level >= LEAF_LEVEL);
65656+ /* no locks can be held during tree traversal */
65657+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65658+
65659+ cbk_pack(&handle,
65660+ tree,
65661+ key,
65662+ coord,
65663+ lh,
65664+ &parent_lh,
65665+ lock_mode, bias, lock_level, stop_level, flags, info);
65666+
65667+ result = coord_by_handle(&handle);
65668+ assert("nikita-3247",
65669+ ergo(!IS_CBKERR(result), coord->node == lh->node));
65670+ return result;
65671+}
65672+
65673+/* like coord_by_key(), but starts traversal from vroot of @object rather than
65674+ * from tree root. */
65675+lookup_result reiser4_object_lookup(struct inode * object,
65676+ const reiser4_key * key,
65677+ coord_t * coord,
65678+ lock_handle * lh,
65679+ znode_lock_mode lock_mode,
65680+ lookup_bias bias,
65681+ tree_level lock_level,
65682+ tree_level stop_level, __u32 flags,
65683+ ra_info_t * info)
65684+{
65685+ cbk_handle handle;
65686+ lock_handle parent_lh;
65687+ lookup_result result;
65688+
65689+ init_lh(lh);
65690+ init_lh(&parent_lh);
65691+
65692+ assert("nikita-3023", reiser4_schedulable());
65693+
65694+ assert("nikita-354", key != NULL);
65695+ assert("nikita-355", coord != NULL);
65696+ assert("nikita-356", (bias == FIND_EXACT)
65697+ || (bias == FIND_MAX_NOT_MORE_THAN));
65698+ assert("nikita-357", stop_level >= LEAF_LEVEL);
65699+ /* no locks can be held during tree search by key */
65700+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65701+
65702+ cbk_pack(&handle,
65703+ object != NULL ? reiser4_tree_by_inode(object) : current_tree,
65704+ key,
65705+ coord,
65706+ lh,
65707+ &parent_lh,
65708+ lock_mode, bias, lock_level, stop_level, flags, info);
65709+ handle.object = object;
65710+
65711+ result = coord_by_handle(&handle);
65712+ assert("nikita-3247",
65713+ ergo(!IS_CBKERR(result), coord->node == lh->node));
65714+ return result;
65715+}
65716+
65717+/* lookup by cbk_handle. Common part of coord_by_key() and
65718+ reiser4_object_lookup(). */
65719+static lookup_result coord_by_handle(cbk_handle * handle)
65720+{
65721+ /*
65722+ * first check cbk_cache (which is look-aside cache for our tree) and
65723+ * of this fails, start traversal.
65724+ */
65725+ /* first check whether "key" is in cache of recent lookups. */
65726+ if (cbk_cache_search(handle) == 0)
65727+ return handle->result;
65728+ else
65729+ return traverse_tree(handle);
65730+}
65731+
65732+/* Execute actor for each item (or unit, depending on @through_units_p),
65733+ starting from @coord, right-ward, until either:
65734+
65735+ - end of the tree is reached
65736+ - unformatted node is met
65737+ - error occurred
65738+ - @actor returns 0 or less
65739+
65740+ Error code, or last actor return value is returned.
65741+
65742+ This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through
65743+ sequence of entries with identical keys and alikes.
65744+*/
65745+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
65746+ coord_t * coord /* coord to start from */ ,
65747+ lock_handle * lh /* lock handle to start with and to
65748+ * update along the way */ ,
65749+ tree_iterate_actor_t actor /* function to call on each
65750+ * item/unit */ ,
65751+ void *arg /* argument to pass to @actor */ ,
65752+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
65753+ int through_units_p /* call @actor on each item or on
65754+ * each unit */ )
65755+{
65756+ int result;
65757+
65758+ assert("nikita-1143", tree != NULL);
65759+ assert("nikita-1145", coord != NULL);
65760+ assert("nikita-1146", lh != NULL);
65761+ assert("nikita-1147", actor != NULL);
65762+
65763+ result = zload(coord->node);
65764+ coord_clear_iplug(coord);
65765+ if (result != 0)
65766+ return result;
65767+ if (!coord_is_existing_unit(coord)) {
65768+ zrelse(coord->node);
65769+ return -ENOENT;
65770+ }
65771+ while ((result = actor(tree, coord, lh, arg)) > 0) {
65772+ /* move further */
65773+ if ((through_units_p && coord_next_unit(coord)) ||
65774+ (!through_units_p && coord_next_item(coord))) {
65775+ do {
65776+ lock_handle couple;
65777+
65778+ /* move to the next node */
65779+ init_lh(&couple);
65780+ result =
65781+ reiser4_get_right_neighbor(&couple,
65782+ coord->node,
65783+ (int)mode,
65784+ GN_CAN_USE_UPPER_LEVELS);
65785+ zrelse(coord->node);
65786+ if (result == 0) {
65787+
65788+ result = zload(couple.node);
65789+ if (result != 0) {
65790+ done_lh(&couple);
65791+ return result;
65792+ }
65793+
65794+ coord_init_first_unit(coord,
65795+ couple.node);
65796+ done_lh(lh);
65797+ move_lh(lh, &couple);
65798+ } else
65799+ return result;
65800+ } while (node_is_empty(coord->node));
65801+ }
65802+
65803+ assert("nikita-1149", coord_is_existing_unit(coord));
65804+ }
65805+ zrelse(coord->node);
65806+ return result;
65807+}
65808+
65809+/* return locked uber znode for @tree */
65810+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65811+ znode_lock_request pri, lock_handle * lh)
65812+{
65813+ int result;
65814+
65815+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
65816+ return result;
65817+}
65818+
65819+/* true if @key is strictly within @node
65820+
65821+ we are looking for possibly non-unique key and it is item is at the edge of
65822+ @node. May be it is in the neighbor.
65823+*/
65824+static int znode_contains_key_strict(znode * node /* node to check key
65825+ * against */ ,
65826+ const reiser4_key *
65827+ key /* key to check */ ,
65828+ int isunique)
65829+{
65830+ int answer;
65831+
65832+ assert("nikita-1760", node != NULL);
65833+ assert("nikita-1722", key != NULL);
65834+
65835+ if (keyge(key, &node->rd_key))
65836+ return 0;
65837+
65838+ answer = keycmp(&node->ld_key, key);
65839+
65840+ if (isunique)
65841+ return answer != GREATER_THAN;
65842+ else
65843+ return answer == LESS_THAN;
65844+}
65845+
65846+/*
65847+ * Virtual Root (vroot) code.
65848+ *
65849+ * For given file system object (e.g., regular file or directory) let's
65850+ * define its "virtual root" as lowest in the tree (that is, furtherest
65851+ * from the tree root) node such that all body items of said object are
65852+ * located in a tree rooted at this node.
65853+ *
65854+ * Once vroot of object is found all tree lookups for items within body of
65855+ * this object ("object lookups") can be started from its vroot rather
65856+ * than from real root. This has following advantages:
65857+ *
65858+ * 1. amount of nodes traversed during lookup (and, hence, amount of
65859+ * key comparisons made) decreases, and
65860+ *
65861+ * 2. contention on tree root is decreased. This latter was actually
65862+ * motivating reason behind vroot, because spin lock of root node,
65863+ * which is taken when acquiring long-term lock on root node is the
65864+ * hottest lock in the reiser4.
65865+ *
65866+ * How to find vroot.
65867+ *
65868+ * When vroot of object F is not yet determined, all object lookups start
65869+ * from the root of the tree. At each tree level during traversal we have
65870+ * a node N such that a key we are looking for (which is the key inside
65871+ * object's body) is located within N. In function handle_vroot() called
65872+ * from cbk_level_lookup() we check whether N is possible vroot for
65873+ * F. Check is trivial---if neither leftmost nor rightmost item of N
65874+ * belongs to F (and we already have helpful ->owns_item() method of
65875+ * object plugin for this), then N is possible vroot of F. This, of
65876+ * course, relies on the assumption that each object occupies contiguous
65877+ * range of keys in the tree.
65878+ *
65879+ * Thus, traversing tree downward and checking each node as we go, we can
65880+ * find lowest such node, which, by definition, is vroot.
65881+ *
65882+ * How to track vroot.
65883+ *
65884+ * Nohow. If actual vroot changes, next object lookup will just restart
65885+ * from the actual tree root, refreshing object's vroot along the way.
65886+ *
65887+ */
65888+
65889+/*
65890+ * Check whether @node is possible vroot of @object.
65891+ */
65892+static void handle_vroot(struct inode *object, znode * node)
65893+{
65894+ file_plugin *fplug;
65895+ coord_t coord;
65896+
65897+ fplug = inode_file_plugin(object);
65898+ assert("nikita-3353", fplug != NULL);
65899+ assert("nikita-3354", fplug->owns_item != NULL);
65900+
65901+ if (unlikely(node_is_empty(node)))
65902+ return;
65903+
65904+ coord_init_first_unit(&coord, node);
65905+ /*
65906+ * if leftmost item of @node belongs to @object, we cannot be sure
65907+ * that @node is vroot of @object, because, some items of @object are
65908+ * probably in the sub-tree rooted at the left neighbor of @node.
65909+ */
65910+ if (fplug->owns_item(object, &coord))
65911+ return;
65912+ coord_init_last_unit(&coord, node);
65913+ /* mutatis mutandis for the rightmost item */
65914+ if (fplug->owns_item(object, &coord))
65915+ return;
65916+ /* otherwise, @node is possible vroot of @object */
65917+ inode_set_vroot(object, node);
65918+}
65919+
65920+/*
65921+ * helper function used by traverse tree to start tree traversal not from the
65922+ * tree root, but from @h->object's vroot, if possible.
65923+ */
65924+static int prepare_object_lookup(cbk_handle * h)
65925+{
65926+ znode *vroot;
65927+ int result;
65928+
65929+ vroot = inode_get_vroot(h->object);
65930+ if (vroot == NULL) {
65931+ /*
65932+ * object doesn't have known vroot, start from real tree root.
65933+ */
65934+ return LOOKUP_CONT;
65935+ }
65936+
65937+ h->level = znode_get_level(vroot);
65938+ /* take a long-term lock on vroot */
65939+ h->result = longterm_lock_znode(h->active_lh, vroot,
65940+ cbk_lock_mode(h->level, h),
65941+ ZNODE_LOCK_LOPRI);
65942+ result = LOOKUP_REST;
65943+ if (h->result == 0) {
65944+ int isunique;
65945+ int inside;
65946+
65947+ isunique = h->flags & CBK_UNIQUE;
65948+ /* check that key is inside vroot */
65949+ read_lock_dk(h->tree);
65950+ inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65951+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65952+ read_unlock_dk(h->tree);
65953+ if (inside) {
65954+ h->result = zload(vroot);
65955+ if (h->result == 0) {
65956+ /* search for key in vroot. */
65957+ result = cbk_node_lookup(h);
65958+ zrelse(vroot); /*h->active_lh->node); */
65959+ if (h->active_lh->node != vroot) {
65960+ result = LOOKUP_REST;
65961+ } else if (result == LOOKUP_CONT) {
65962+ move_lh(h->parent_lh, h->active_lh);
65963+ h->flags &= ~CBK_DKSET;
65964+ }
65965+ }
65966+ }
65967+ }
65968+
65969+ zput(vroot);
65970+
65971+ if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65972+ hput(h);
65973+ return result;
65974+}
65975+
65976+/* main function that handles common parts of tree traversal: starting
65977+ (fake znode handling), restarts, error handling, completion */
65978+static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65979+{
65980+ int done;
65981+ int iterations;
65982+ int vroot_used;
65983+
65984+ assert("nikita-365", h != NULL);
65985+ assert("nikita-366", h->tree != NULL);
65986+ assert("nikita-367", h->key != NULL);
65987+ assert("nikita-368", h->coord != NULL);
65988+ assert("nikita-369", (h->bias == FIND_EXACT)
65989+ || (h->bias == FIND_MAX_NOT_MORE_THAN));
65990+ assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65991+ assert("nikita-2949", !(h->flags & CBK_DKSET));
65992+ assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65993+
65994+ done = 0;
65995+ iterations = 0;
65996+ vroot_used = 0;
65997+
65998+ /* loop for restarts */
65999+ restart:
66000+
66001+ assert("nikita-3024", reiser4_schedulable());
66002+
66003+ h->result = CBK_COORD_FOUND;
66004+ /* connect_znode() needs it */
66005+ h->ld_key = *reiser4_min_key();
66006+ h->rd_key = *reiser4_max_key();
66007+ h->flags |= CBK_DKSET;
66008+ h->error = NULL;
66009+
66010+ if (!vroot_used && h->object != NULL) {
66011+ vroot_used = 1;
66012+ done = prepare_object_lookup(h);
66013+ if (done == LOOKUP_REST) {
66014+ goto restart;
66015+ } else if (done == LOOKUP_DONE)
66016+ return h->result;
66017+ }
66018+ if (h->parent_lh->node == NULL) {
66019+ done =
66020+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
66021+ h->parent_lh);
66022+
66023+ assert("nikita-1637", done != -E_DEADLOCK);
66024+
66025+ h->block = h->tree->root_block;
66026+ h->level = h->tree->height;
66027+ h->coord->node = h->parent_lh->node;
66028+
66029+ if (done != 0)
66030+ return done;
66031+ }
66032+
66033+ /* loop descending a tree */
66034+ while (!done) {
66035+
66036+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
66037+ IS_POW(iterations))) {
66038+ warning("nikita-1481", "Too many iterations: %i",
66039+ iterations);
66040+ reiser4_print_key("key", h->key);
66041+ ++iterations;
66042+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
66043+ h->error =
66044+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
66045+ h->result = RETERR(-EIO);
66046+ break;
66047+ }
66048+ switch (cbk_level_lookup(h)) {
66049+ case LOOKUP_CONT:
66050+ move_lh(h->parent_lh, h->active_lh);
66051+ continue;
66052+ default:
66053+ wrong_return_value("nikita-372", "cbk_level");
66054+ case LOOKUP_DONE:
66055+ done = 1;
66056+ break;
66057+ case LOOKUP_REST:
66058+ hput(h);
66059+ /* deadlock avoidance is normal case. */
66060+ if (h->result != -E_DEADLOCK)
66061+ ++iterations;
66062+ reiser4_preempt_point();
66063+ goto restart;
66064+ }
66065+ }
66066+ /* that's all. The rest is error handling */
66067+ if (unlikely(h->error != NULL)) {
66068+ warning("nikita-373", "%s: level: %i, "
66069+ "lock_level: %i, stop_level: %i "
66070+ "lock_mode: %s, bias: %s",
66071+ h->error, h->level, h->lock_level, h->stop_level,
66072+ lock_mode_name(h->lock_mode), bias_name(h->bias));
66073+ reiser4_print_address("block", &h->block);
66074+ reiser4_print_key("key", h->key);
66075+ print_coord_content("coord", h->coord);
66076+ }
66077+ /* `unlikely' error case */
66078+ if (unlikely(IS_CBKERR(h->result))) {
66079+ /* failure. do cleanup */
66080+ hput(h);
66081+ } else {
66082+ assert("nikita-1605", WITH_DATA_RET
66083+ (h->coord->node, 1,
66084+ ergo((h->result == CBK_COORD_FOUND) &&
66085+ (h->bias == FIND_EXACT) &&
66086+ (!node_is_empty(h->coord->node)),
66087+ coord_is_existing_item(h->coord))));
66088+ }
66089+ return h->result;
66090+}
66091+
66092+/* find delimiting keys of child
66093+
66094+ Determine left and right delimiting keys for child pointed to by
66095+ @parent_coord.
66096+
66097+*/
66098+static void find_child_delimiting_keys(znode * parent /* parent znode, passed
66099+ * locked */ ,
66100+ const coord_t * parent_coord /* coord where
66101+ * pointer to
66102+ * child is
66103+ * stored */ ,
66104+ reiser4_key * ld /* where to store left
66105+ * delimiting key */ ,
66106+ reiser4_key * rd /* where to store right
66107+ * delimiting key */ )
66108+{
66109+ coord_t neighbor;
66110+
66111+ assert("nikita-1484", parent != NULL);
66112+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
66113+
66114+ coord_dup(&neighbor, parent_coord);
66115+
66116+ if (neighbor.between == AT_UNIT)
66117+ /* imitate item ->lookup() behavior. */
66118+ neighbor.between = AFTER_UNIT;
66119+
66120+ if (coord_set_to_left(&neighbor) == 0)
66121+ unit_key_by_coord(&neighbor, ld);
66122+ else {
66123+ assert("nikita-14851", 0);
66124+ *ld = *znode_get_ld_key(parent);
66125+ }
66126+
66127+ coord_dup(&neighbor, parent_coord);
66128+ if (neighbor.between == AT_UNIT)
66129+ neighbor.between = AFTER_UNIT;
66130+ if (coord_set_to_right(&neighbor) == 0)
66131+ unit_key_by_coord(&neighbor, rd);
66132+ else
66133+ *rd = *znode_get_rd_key(parent);
66134+}
66135+
66136+/*
66137+ * setup delimiting keys for a child
66138+ *
66139+ * @parent parent node
66140+ *
66141+ * @coord location in @parent where pointer to @child is
66142+ *
66143+ * @child child node
66144+ */
66145+int
66146+set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
66147+{
66148+ reiser4_tree *tree;
66149+
66150+ assert("nikita-2952",
66151+ znode_get_level(parent) == znode_get_level(coord->node));
66152+
66153+ /* fast check without taking dk lock. This is safe, because
66154+ * JNODE_DKSET is never cleared once set. */
66155+ if (!ZF_ISSET(child, JNODE_DKSET)) {
66156+ tree = znode_get_tree(parent);
66157+ write_lock_dk(tree);
66158+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
66159+ find_child_delimiting_keys(parent, coord,
66160+ &child->ld_key,
66161+ &child->rd_key);
66162+ ON_DEBUG(child->ld_key_version =
66163+ atomic_inc_return(&delim_key_version);
66164+ child->rd_key_version =
66165+ atomic_inc_return(&delim_key_version););
66166+ ZF_SET(child, JNODE_DKSET);
66167+ }
66168+ write_unlock_dk(tree);
66169+ return 1;
66170+ }
66171+ return 0;
66172+}
66173+
66174+/* Perform tree lookup at one level. This is called from cbk_traverse()
66175+ function that drives lookup through tree and calls cbk_node_lookup() to
66176+ perform lookup within one node.
66177+
66178+ See comments in a code.
66179+*/
66180+static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
66181+{
66182+ int ret;
66183+ int setdk;
66184+ int ldkeyset = 0;
66185+ reiser4_key ldkey;
66186+ reiser4_key key;
66187+ znode *active;
66188+
66189+ assert("nikita-3025", reiser4_schedulable());
66190+
66191+ /* acquire reference to @active node */
66192+ active =
66193+ zget(h->tree, &h->block, h->parent_lh->node, h->level,
66194+ reiser4_ctx_gfp_mask_get());
66195+
66196+ if (IS_ERR(active)) {
66197+ h->result = PTR_ERR(active);
66198+ return LOOKUP_DONE;
66199+ }
66200+
66201+ /* lock @active */
66202+ h->result = longterm_lock_znode(h->active_lh,
66203+ active,
66204+ cbk_lock_mode(h->level, h),
66205+ ZNODE_LOCK_LOPRI);
66206+ /* longterm_lock_znode() acquires additional reference to znode (which
66207+ will be later released by longterm_unlock_znode()). Release
66208+ reference acquired by zget().
66209+ */
66210+ zput(active);
66211+ if (unlikely(h->result != 0))
66212+ goto fail_or_restart;
66213+
66214+ setdk = 0;
66215+ /* if @active is accessed for the first time, setup delimiting keys on
66216+ it. Delimiting keys are taken from the parent node. See
66217+ setup_delimiting_keys() for details.
66218+ */
66219+ if (h->flags & CBK_DKSET) {
66220+ setdk = setup_delimiting_keys(h);
66221+ h->flags &= ~CBK_DKSET;
66222+ } else {
66223+ znode *parent;
66224+
66225+ parent = h->parent_lh->node;
66226+ h->result = zload(parent);
66227+ if (unlikely(h->result != 0))
66228+ goto fail_or_restart;
66229+
66230+ if (!ZF_ISSET(active, JNODE_DKSET))
66231+ setdk = set_child_delimiting_keys(parent,
66232+ h->coord, active);
66233+ else {
66234+ read_lock_dk(h->tree);
66235+ find_child_delimiting_keys(parent, h->coord, &ldkey,
66236+ &key);
66237+ read_unlock_dk(h->tree);
66238+ ldkeyset = 1;
66239+ }
66240+ zrelse(parent);
66241+ }
66242+
66243+ /* this is ugly kludge. Reminder: this is necessary, because
66244+ ->lookup() method returns coord with ->between field probably set
66245+ to something different from AT_UNIT.
66246+ */
66247+ h->coord->between = AT_UNIT;
66248+
66249+ if (znode_just_created(active) && (h->coord->node != NULL)) {
66250+ write_lock_tree(h->tree);
66251+ /* if we are going to load znode right now, setup
66252+ ->in_parent: coord where pointer to this node is stored in
66253+ parent.
66254+ */
66255+ coord_to_parent_coord(h->coord, &active->in_parent);
66256+ write_unlock_tree(h->tree);
66257+ }
66258+
66259+ /* check connectedness without holding tree lock---false negatives
66260+ * will be re-checked by connect_znode(), and false positives are
66261+ * impossible---@active cannot suddenly turn into unconnected
66262+ * state. */
66263+ if (!znode_is_connected(active)) {
66264+ h->result = connect_znode(h->coord, active);
66265+ if (unlikely(h->result != 0)) {
66266+ put_parent(h);
66267+ goto fail_or_restart;
66268+ }
66269+ }
66270+
66271+ jload_prefetch(ZJNODE(active));
66272+
66273+ if (setdk)
66274+ update_stale_dk(h->tree, active);
66275+
66276+ /* put_parent() cannot be called earlier, because connect_znode()
66277+ assumes parent node is referenced; */
66278+ put_parent(h);
66279+
66280+ if ((!znode_contains_key_lock(active, h->key) &&
66281+ (h->flags & CBK_TRUST_DK))
66282+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
66283+ /* 1. key was moved out of this node while this thread was
66284+ waiting for the lock. Restart. More elaborate solution is
66285+ to determine where key moved (to the left, or to the right)
66286+ and try to follow it through sibling pointers.
66287+
66288+ 2. or, node itself is going to be removed from the
66289+ tree. Release lock and restart.
66290+ */
66291+ h->result = -E_REPEAT;
66292+ }
66293+ if (h->result == -E_REPEAT)
66294+ return LOOKUP_REST;
66295+
66296+ h->result = zload_ra(active, h->ra_info);
66297+ if (h->result) {
66298+ return LOOKUP_DONE;
66299+ }
66300+
66301+ /* sanity checks */
66302+ if (sanity_check(h)) {
66303+ zrelse(active);
66304+ return LOOKUP_DONE;
66305+ }
66306+
66307+ /* check that key of leftmost item in the @active is the same as in
66308+ * its parent */
66309+ if (ldkeyset && !node_is_empty(active) &&
66310+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
66311+ warning("vs-3533", "Keys are inconsistent. Fsck?");
66312+ reiser4_print_key("inparent", &ldkey);
66313+ reiser4_print_key("inchild", &key);
66314+ h->result = RETERR(-EIO);
66315+ zrelse(active);
66316+ return LOOKUP_DONE;
66317+ }
66318+
66319+ if (h->object != NULL)
66320+ handle_vroot(h->object, active);
66321+
66322+ ret = cbk_node_lookup(h);
66323+
66324+ /* h->active_lh->node might change, but active is yet to be zrelsed */
66325+ zrelse(active);
66326+
66327+ return ret;
66328+
66329+ fail_or_restart:
66330+ if (h->result == -E_DEADLOCK)
66331+ return LOOKUP_REST;
66332+ return LOOKUP_DONE;
66333+}
66334+
66335+#if REISER4_DEBUG
66336+/* check left and right delimiting keys of a znode */
66337+void check_dkeys(znode * node)
66338+{
66339+ znode *left;
66340+ znode *right;
66341+
66342+ read_lock_tree(current_tree);
66343+ read_lock_dk(current_tree);
66344+
66345+ assert("vs-1710", znode_is_any_locked(node));
66346+ assert("vs-1197",
66347+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
66348+
66349+ left = node->left;
66350+ right = node->right;
66351+
66352+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
66353+ && left != NULL && ZF_ISSET(left, JNODE_DKSET))
66354+ /* check left neighbor. Note that left neighbor is not locked,
66355+ so it might get wrong delimiting keys therefore */
66356+ assert("vs-1198",
66357+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
66358+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
66359+
66360+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
66361+ && right != NULL && ZF_ISSET(right, JNODE_DKSET))
66362+ /* check right neighbor. Note that right neighbor is not
66363+ locked, so it might get wrong delimiting keys therefore */
66364+ assert("vs-1199",
66365+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
66366+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
66367+
66368+ read_unlock_dk(current_tree);
66369+ read_unlock_tree(current_tree);
66370+}
66371+#endif
66372+
66373+/* true if @key is left delimiting key of @node */
66374+static int key_is_ld(znode * node, const reiser4_key * key)
66375+{
66376+ int ld;
66377+
66378+ assert("nikita-1716", node != NULL);
66379+ assert("nikita-1758", key != NULL);
66380+
66381+ read_lock_dk(znode_get_tree(node));
66382+ assert("nikita-1759", znode_contains_key(node, key));
66383+ ld = keyeq(znode_get_ld_key(node), key);
66384+ read_unlock_dk(znode_get_tree(node));
66385+ return ld;
66386+}
66387+
66388+/* Process one node during tree traversal.
66389+
66390+ This is called by cbk_level_lookup(). */
66391+static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
66392+{
66393+ /* node plugin of @active */
66394+ node_plugin *nplug;
66395+ /* item plugin of item that was found */
66396+ item_plugin *iplug;
66397+ /* search bias */
66398+ lookup_bias node_bias;
66399+ /* node we are operating upon */
66400+ znode *active;
66401+ /* tree we are searching in */
66402+ reiser4_tree *tree;
66403+ /* result */
66404+ int result;
66405+
66406+ assert("nikita-379", h != NULL);
66407+
66408+ active = h->active_lh->node;
66409+ tree = h->tree;
66410+
66411+ nplug = active->nplug;
66412+ assert("nikita-380", nplug != NULL);
66413+
66414+ ON_DEBUG(check_dkeys(active));
66415+
66416+ /* return item from "active" node with maximal key not greater than
66417+ "key" */
66418+ node_bias = h->bias;
66419+ result = nplug->lookup(active, h->key, node_bias, h->coord);
66420+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
66421+ /* error occurred */
66422+ h->result = result;
66423+ return LOOKUP_DONE;
66424+ }
66425+ if (h->level == h->stop_level) {
66426+ /* welcome to the stop level */
66427+ assert("nikita-381", h->coord->node == active);
66428+ if (result == NS_FOUND) {
66429+ /* success of tree lookup */
66430+ if (!(h->flags & CBK_UNIQUE)
66431+ && key_is_ld(active, h->key)) {
66432+ return search_to_left(h);
66433+ } else
66434+ h->result = CBK_COORD_FOUND;
66435+ } else {
66436+ h->result = CBK_COORD_NOTFOUND;
66437+ }
66438+ if (!(h->flags & CBK_IN_CACHE))
66439+ cbk_cache_add(active);
66440+ return LOOKUP_DONE;
66441+ }
66442+
66443+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
66444+ h->error = "not found on internal node";
66445+ h->result = result;
66446+ return LOOKUP_DONE;
66447+ }
66448+
66449+ assert("vs-361", h->level > h->stop_level);
66450+
66451+ if (handle_eottl(h, &result)) {
66452+ assert("vs-1674", (result == LOOKUP_DONE ||
66453+ result == LOOKUP_REST));
66454+ return result;
66455+ }
66456+
66457+ /* go down to next level */
66458+ check_me("vs-12", zload(h->coord->node) == 0);
66459+ assert("nikita-2116", item_is_internal(h->coord));
66460+ iplug = item_plugin_by_coord(h->coord);
66461+ iplug->s.internal.down_link(h->coord, h->key, &h->block);
66462+ zrelse(h->coord->node);
66463+ --h->level;
66464+ return LOOKUP_CONT; /* continue */
66465+}
66466+
66467+/* scan cbk_cache slots looking for a match for @h */
66468+static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
66469+{
66470+ level_lookup_result llr;
66471+ znode *node;
66472+ reiser4_tree *tree;
66473+ cbk_cache_slot *slot;
66474+ cbk_cache *cache;
66475+ tree_level level;
66476+ int isunique;
66477+ const reiser4_key *key;
66478+ int result;
66479+
66480+ assert("nikita-1317", h != NULL);
66481+ assert("nikita-1315", h->tree != NULL);
66482+ assert("nikita-1316", h->key != NULL);
66483+
66484+ tree = h->tree;
66485+ cache = &tree->cbk_cache;
66486+ if (cache->nr_slots == 0)
66487+ /* size of cbk cache was set to 0 by mount time option. */
66488+ return RETERR(-ENOENT);
66489+
66490+ assert("nikita-2474", cbk_cache_invariant(cache));
66491+ node = NULL; /* to keep gcc happy */
66492+ level = h->level;
66493+ key = h->key;
66494+ isunique = h->flags & CBK_UNIQUE;
66495+ result = RETERR(-ENOENT);
66496+
66497+ /*
66498+ * this is time-critical function and dragons had, hence, been settled
66499+ * here.
66500+ *
66501+ * Loop below scans cbk cache slots trying to find matching node with
66502+ * suitable range of delimiting keys and located at the h->level.
66503+ *
66504+ * Scan is done under cbk cache spin lock that protects slot->node
66505+ * pointers. If suitable node is found we want to pin it in
66506+ * memory. But slot->node can point to the node with x_count 0
66507+ * (unreferenced). Such node can be recycled at any moment, or can
66508+ * already be in the process of being recycled (within jput()).
66509+ *
66510+ * As we found node in the cbk cache, it means that jput() hasn't yet
66511+ * called cbk_cache_invalidate().
66512+ *
66513+ * We acquire reference to the node without holding tree lock, and
66514+ * later, check node's RIP bit. This avoids races with jput().
66515+ */
66516+
66517+ rcu_read_lock();
66518+ read_lock(&((cbk_cache *)cache)->guard);
66519+
66520+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
66521+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
66522+ BUG_ON(&slot->lru != &cache->lru);/*????*/
66523+ while (1) {
66524+
66525+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
66526+
66527+ if (&cache->lru != &slot->lru)
66528+ node = slot->node;
66529+ else
66530+ node = NULL;
66531+
66532+ if (unlikely(node == NULL))
66533+ break;
66534+
66535+ /*
66536+ * this is (hopefully) the only place in the code where we are
66537+ * working with delimiting keys without holding dk lock. This
66538+ * is fine here, because this is only "guess" anyway---keys
66539+ * are rechecked under dk lock below.
66540+ */
66541+ if (znode_get_level(node) == level &&
66542+ /* reiser4_min_key < key < reiser4_max_key */
66543+ znode_contains_key_strict(node, key, isunique)) {
66544+ zref(node);
66545+ result = 0;
66546+ spin_lock_prefetch(&tree->tree_lock);
66547+ break;
66548+ }
66549+ }
66550+ read_unlock(&((cbk_cache *)cache)->guard);
66551+
66552+ assert("nikita-2475", cbk_cache_invariant(cache));
66553+
66554+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
66555+ result = -ENOENT;
66556+
66557+ rcu_read_unlock();
66558+
66559+ if (result != 0) {
66560+ h->result = CBK_COORD_NOTFOUND;
66561+ return RETERR(-ENOENT);
66562+ }
66563+
66564+ result =
66565+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
66566+ ZNODE_LOCK_LOPRI);
66567+ zput(node);
66568+ if (result != 0)
66569+ return result;
66570+ result = zload(node);
66571+ if (result != 0)
66572+ return result;
66573+
66574+ /* recheck keys */
66575+ read_lock_dk(tree);
66576+ result = (znode_contains_key_strict(node, key, isunique) &&
66577+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66578+ read_unlock_dk(tree);
66579+ if (result) {
66580+ /* do lookup inside node */
66581+ llr = cbk_node_lookup(h);
66582+ /* if cbk_node_lookup() wandered to another node (due to eottl
66583+ or non-unique keys), adjust @node */
66584+ /*node = h->active_lh->node; */
66585+
66586+ if (llr != LOOKUP_DONE) {
66587+ /* restart or continue on the next level */
66588+ result = RETERR(-ENOENT);
66589+ } else if (IS_CBKERR(h->result))
66590+ /* io or oom */
66591+ result = RETERR(-ENOENT);
66592+ else {
66593+ /* good. Either item found or definitely not found. */
66594+ result = 0;
66595+
66596+ write_lock(&(cache->guard));
66597+ if (slot->node == h->active_lh->node /*node */ ) {
66598+ /* if this node is still in cbk cache---move
66599+ its slot to the head of the LRU list. */
66600+ list_move(&slot->lru, &cache->lru);
66601+ }
66602+ write_unlock(&(cache->guard));
66603+ }
66604+ } else {
66605+ /* race. While this thread was waiting for the lock, node was
66606+ rebalanced and item we are looking for, shifted out of it
66607+ (if it ever was here).
66608+
66609+ Continuing scanning is almost hopeless: node key range was
66610+ moved to, is almost certainly at the beginning of the LRU
66611+ list at this time, because it's hot, but restarting
66612+ scanning from the very beginning is complex. Just return,
66613+ so that cbk() will be performed. This is not that
66614+ important, because such races should be rare. Are they?
66615+ */
66616+ result = RETERR(-ENOENT); /* -ERAUGHT */
66617+ }
66618+ zrelse(node);
66619+ assert("nikita-2476", cbk_cache_invariant(cache));
66620+ return result;
66621+}
66622+
66623+/* look for item with given key in the coord cache
66624+
66625+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
66626+ which is a small LRU list of znodes accessed lately. For each znode in
66627+ znode in this list, it checks whether key we are looking for fits into key
66628+ range covered by this node. If so, and in addition, node lies at allowed
66629+ level (this is to handle extents on a twig level), node is locked, and
66630+ lookup inside it is performed.
66631+
66632+ we need a measurement of the cost of this cache search compared to the cost
66633+ of coord_by_key.
66634+
66635+*/
66636+static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
66637+{
66638+ int result = 0;
66639+ tree_level level;
66640+
66641+ /* add CBK_IN_CACHE to the handle flags. This means that
66642+ * cbk_node_lookup() assumes that cbk_cache is scanned and would add
66643+ * found node to the cache. */
66644+ h->flags |= CBK_IN_CACHE;
66645+ for (level = h->stop_level; level <= h->lock_level; ++level) {
66646+ h->level = level;
66647+ result = cbk_cache_scan_slots(h);
66648+ if (result != 0) {
66649+ done_lh(h->active_lh);
66650+ done_lh(h->parent_lh);
66651+ } else {
66652+ assert("nikita-1319", !IS_CBKERR(h->result));
66653+ break;
66654+ }
66655+ }
66656+ h->flags &= ~CBK_IN_CACHE;
66657+ return result;
66658+}
66659+
66660+/* type of lock we want to obtain during tree traversal. On stop level
66661+ we want type of lock user asked for, on upper levels: read lock. */
66662+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
66663+{
66664+ assert("nikita-382", h != NULL);
66665+
66666+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
66667+}
66668+
66669+/* update outdated delimiting keys */
66670+static void stale_dk(reiser4_tree * tree, znode * node)
66671+{
66672+ znode *right;
66673+
66674+ read_lock_tree(tree);
66675+ write_lock_dk(tree);
66676+ right = node->right;
66677+
66678+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66679+ right && ZF_ISSET(right, JNODE_DKSET) &&
66680+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
66681+ znode_set_rd_key(node, znode_get_ld_key(right));
66682+
66683+ write_unlock_dk(tree);
66684+ read_unlock_tree(tree);
66685+}
66686+
66687+/* check for possibly outdated delimiting keys, and update them if
66688+ * necessary. */
66689+static void update_stale_dk(reiser4_tree * tree, znode * node)
66690+{
66691+ znode *right;
66692+ reiser4_key rd;
66693+
66694+ read_lock_tree(tree);
66695+ read_lock_dk(tree);
66696+ rd = *znode_get_rd_key(node);
66697+ right = node->right;
66698+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66699+ right && ZF_ISSET(right, JNODE_DKSET) &&
66700+ !keyeq(&rd, znode_get_ld_key(right)))) {
66701+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
66702+ read_unlock_dk(tree);
66703+ read_unlock_tree(tree);
66704+ stale_dk(tree, node);
66705+ return;
66706+ }
66707+ read_unlock_dk(tree);
66708+ read_unlock_tree(tree);
66709+}
66710+
66711+/*
66712+ * handle searches a the non-unique key.
66713+ *
66714+ * Suppose that we are looking for an item with possibly non-unique key 100.
66715+ *
66716+ * Root node contains two pointers: one to a node with left delimiting key 0,
66717+ * and another to a node with left delimiting key 100. Item we interested in
66718+ * may well happen in the sub-tree rooted at the first pointer.
66719+ *
66720+ * To handle this search_to_left() is called when search reaches stop
66721+ * level. This function checks it is _possible_ that item we are looking for
66722+ * is in the left neighbor (this can be done by comparing delimiting keys) and
66723+ * if so, tries to lock left neighbor (this is low priority lock, so it can
66724+ * deadlock, tree traversal is just restarted if it did) and then checks
66725+ * whether left neighbor actually contains items with our key.
66726+ *
66727+ * Note that this is done on the stop level only. It is possible to try such
66728+ * left-check on each level, but as duplicate keys are supposed to be rare
66729+ * (very unlikely that more than one node is completely filled with items with
66730+ * duplicate keys), it sis cheaper to scan to the left on the stop level once.
66731+ *
66732+ */
66733+static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
66734+{
66735+ level_lookup_result result;
66736+ coord_t *coord;
66737+ znode *node;
66738+ znode *neighbor;
66739+
66740+ lock_handle lh;
66741+
66742+ assert("nikita-1761", h != NULL);
66743+ assert("nikita-1762", h->level == h->stop_level);
66744+
66745+ init_lh(&lh);
66746+ coord = h->coord;
66747+ node = h->active_lh->node;
66748+ assert("nikita-1763", coord_is_leftmost_unit(coord));
66749+
66750+ h->result =
66751+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
66752+ GN_CAN_USE_UPPER_LEVELS);
66753+ neighbor = NULL;
66754+ switch (h->result) {
66755+ case -E_DEADLOCK:
66756+ result = LOOKUP_REST;
66757+ break;
66758+ case 0:{
66759+ node_plugin *nplug;
66760+ coord_t crd;
66761+ lookup_bias bias;
66762+
66763+ neighbor = lh.node;
66764+ h->result = zload(neighbor);
66765+ if (h->result != 0) {
66766+ result = LOOKUP_DONE;
66767+ break;
66768+ }
66769+
66770+ nplug = neighbor->nplug;
66771+
66772+ coord_init_zero(&crd);
66773+ bias = h->bias;
66774+ h->bias = FIND_EXACT;
66775+ h->result =
66776+ nplug->lookup(neighbor, h->key, h->bias, &crd);
66777+ h->bias = bias;
66778+
66779+ if (h->result == NS_NOT_FOUND) {
66780+ case -E_NO_NEIGHBOR:
66781+ h->result = CBK_COORD_FOUND;
66782+ if (!(h->flags & CBK_IN_CACHE))
66783+ cbk_cache_add(node);
66784+ default: /* some other error */
66785+ result = LOOKUP_DONE;
66786+ } else if (h->result == NS_FOUND) {
66787+ read_lock_dk(znode_get_tree(neighbor));
66788+ h->rd_key = *znode_get_ld_key(node);
66789+ leftmost_key_in_node(neighbor, &h->ld_key);
66790+ read_unlock_dk(znode_get_tree(neighbor));
66791+ h->flags |= CBK_DKSET;
66792+
66793+ h->block = *znode_get_block(neighbor);
66794+ /* clear coord -> node so that cbk_level_lookup()
66795+ wouldn't overwrite parent hint in neighbor.
66796+
66797+ Parent hint was set up by
66798+ reiser4_get_left_neighbor()
66799+ */
66800+ /* FIXME: why do we have to spinlock here? */
66801+ write_lock_tree(znode_get_tree(neighbor));
66802+ h->coord->node = NULL;
66803+ write_unlock_tree(znode_get_tree(neighbor));
66804+ result = LOOKUP_CONT;
66805+ } else {
66806+ result = LOOKUP_DONE;
66807+ }
66808+ if (neighbor != NULL)
66809+ zrelse(neighbor);
66810+ }
66811+ }
66812+ done_lh(&lh);
66813+ return result;
66814+}
66815+
66816+/* debugging aid: return symbolic name of search bias */
66817+static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66818+{
66819+ if (bias == FIND_EXACT)
66820+ return "exact";
66821+ else if (bias == FIND_MAX_NOT_MORE_THAN)
66822+ return "left-slant";
66823+/* else if( bias == RIGHT_SLANT_BIAS ) */
66824+/* return "right-bias"; */
66825+ else {
66826+ static char buf[30];
66827+
66828+ sprintf(buf, "unknown: %i", bias);
66829+ return buf;
66830+ }
66831+}
66832+
66833+#if REISER4_DEBUG
66834+/* debugging aid: print human readable information about @p */
66835+void print_coord_content(const char *prefix /* prefix to print */ ,
66836+ coord_t * p /* coord to print */ )
66837+{
66838+ reiser4_key key;
66839+
66840+ if (p == NULL) {
66841+ printk("%s: null\n", prefix);
66842+ return;
66843+ }
66844+ if ((p->node != NULL) && znode_is_loaded(p->node)
66845+ && coord_is_existing_item(p))
66846+ printk("%s: data: %p, length: %i\n", prefix,
66847+ item_body_by_coord(p), item_length_by_coord(p));
66848+ if (znode_is_loaded(p->node)) {
66849+ item_key_by_coord(p, &key);
66850+ reiser4_print_key(prefix, &key);
66851+ }
66852+}
66853+
66854+/* debugging aid: print human readable information about @block */
66855+void reiser4_print_address(const char *prefix /* prefix to print */ ,
66856+ const reiser4_block_nr * block /* block number to print */ )
66857+{
66858+ printk("%s: %s\n", prefix, sprint_address(block));
66859+}
66860+#endif
66861+
66862+/* return string containing human readable representation of @block */
66863+char *sprint_address(const reiser4_block_nr *
66864+ block /* block number to print */ )
66865+{
66866+ static char address[30];
66867+
66868+ if (block == NULL)
66869+ sprintf(address, "null");
66870+ else if (reiser4_blocknr_is_fake(block))
66871+ sprintf(address, "%llx", (unsigned long long)(*block));
66872+ else
66873+ sprintf(address, "%llu", (unsigned long long)(*block));
66874+ return address;
66875+}
66876+
66877+/* release parent node during traversal */
66878+static void put_parent(cbk_handle * h /* search handle */ )
66879+{
66880+ assert("nikita-383", h != NULL);
66881+ if (h->parent_lh->node != NULL) {
66882+ longterm_unlock_znode(h->parent_lh);
66883+ }
66884+}
66885+
66886+/* helper function used by coord_by_key(): release reference to parent znode
66887+ stored in handle before processing its child. */
66888+static void hput(cbk_handle * h /* search handle */ )
66889+{
66890+ assert("nikita-385", h != NULL);
66891+ done_lh(h->parent_lh);
66892+ done_lh(h->active_lh);
66893+}
66894+
66895+/* Helper function used by cbk(): update delimiting keys of child node (stored
66896+ in h->active_lh->node) using key taken from parent on the parent level. */
66897+static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66898+{
66899+ znode *active;
66900+ reiser4_tree *tree;
66901+
66902+ assert("nikita-1088", h != NULL);
66903+
66904+ active = h->active_lh->node;
66905+
66906+ /* fast check without taking dk lock. This is safe, because
66907+ * JNODE_DKSET is never cleared once set. */
66908+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66909+ tree = znode_get_tree(active);
66910+ write_lock_dk(tree);
66911+ if (!ZF_ISSET(active, JNODE_DKSET)) {
66912+ znode_set_ld_key(active, &h->ld_key);
66913+ znode_set_rd_key(active, &h->rd_key);
66914+ ZF_SET(active, JNODE_DKSET);
66915+ }
66916+ write_unlock_dk(tree);
66917+ return 1;
66918+ }
66919+ return 0;
66920+}
66921+
66922+/* true if @block makes sense for the @tree. Used to detect corrupted node
66923+ * pointers */
66924+static int
66925+block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66926+ reiser4_tree * tree /* tree to check against */ )
66927+{
66928+ assert("nikita-757", block != NULL);
66929+ assert("nikita-758", tree != NULL);
66930+
66931+ /* check to see if it exceeds the size of the device. */
66932+ return reiser4_blocknr_is_sane_for(tree->super, block);
66933+}
66934+
66935+/* check consistency of fields */
66936+static int sanity_check(cbk_handle * h /* search handle */ )
66937+{
66938+ assert("nikita-384", h != NULL);
66939+
66940+ if (h->level < h->stop_level) {
66941+ h->error = "Buried under leaves";
66942+ h->result = RETERR(-EIO);
66943+ return LOOKUP_DONE;
66944+ } else if (!block_nr_is_correct(&h->block, h->tree)) {
66945+ h->error = "bad block number";
66946+ h->result = RETERR(-EIO);
66947+ return LOOKUP_DONE;
66948+ } else
66949+ return 0;
66950+}
66951+
66952+/* Make Linus happy.
66953+ Local variables:
66954+ c-indentation-style: "K&R"
66955+ mode-name: "LC"
66956+ c-basic-offset: 8
66957+ tab-width: 8
66958+ fill-column: 120
66959+ scroll-step: 1
66960+ End:
66961+*/
66962diff --git a/fs/reiser4/status_flags.c b/fs/reiser4/status_flags.c
66963new file mode 100644
66964index 0000000..b32f89a
66965--- /dev/null
66966+++ b/fs/reiser4/status_flags.c
66967@@ -0,0 +1,175 @@
66968+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66969+ * reiser4/README */
66970+
66971+/* Functions that deal with reiser4 status block, query status and update it, if needed */
66972+
66973+#include <linux/bio.h>
66974+#include <linux/highmem.h>
66975+#include <linux/fs.h>
66976+#include <linux/blkdev.h>
66977+#include "debug.h"
66978+#include "dformat.h"
66979+#include "status_flags.h"
66980+#include "super.h"
66981+
66982+/* This is our end I/O handler that marks page uptodate if IO was successful. It also
66983+ unconditionally unlocks the page, so we can see that io was done.
66984+ We do not free bio, because we hope to reuse that. */
66985+static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66986+ int err)
66987+{
66988+ if (bio->bi_size)
66989+ return 1;
66990+
66991+ if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66992+ SetPageUptodate(bio->bi_io_vec->bv_page);
66993+ } else {
66994+ ClearPageUptodate(bio->bi_io_vec->bv_page);
66995+ SetPageError(bio->bi_io_vec->bv_page);
66996+ }
66997+ unlock_page(bio->bi_io_vec->bv_page);
66998+ return 0;
66999+}
67000+
67001+/* Initialise status code. This is expected to be called from the disk format
67002+ code. block paremeter is where status block lives. */
67003+int reiser4_status_init(reiser4_block_nr block)
67004+{
67005+ struct super_block *sb = reiser4_get_current_sb();
67006+ struct reiser4_status *statuspage;
67007+ struct bio *bio;
67008+ struct page *page;
67009+
67010+ get_super_private(sb)->status_page = NULL;
67011+ get_super_private(sb)->status_bio = NULL;
67012+
67013+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
67014+ if (!page)
67015+ return -ENOMEM;
67016+
67017+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
67018+ if (bio != NULL) {
67019+ bio->bi_sector = block * (sb->s_blocksize >> 9);
67020+ bio->bi_bdev = sb->s_bdev;
67021+ bio->bi_io_vec[0].bv_page = page;
67022+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
67023+ bio->bi_io_vec[0].bv_offset = 0;
67024+ bio->bi_vcnt = 1;
67025+ bio->bi_size = sb->s_blocksize;
67026+ bio->bi_end_io = reiser4_status_endio;
67027+ } else {
67028+ __free_pages(page, 0);
67029+ return -ENOMEM;
67030+ }
67031+ lock_page(page);
67032+ submit_bio(READ, bio);
67033+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
67034+ wait_on_page_locked(page);
67035+ if (!PageUptodate(page)) {
67036+ warning("green-2007",
67037+ "I/O error while tried to read status page\n");
67038+ return -EIO;
67039+ }
67040+
67041+ statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
67042+ if (memcmp
67043+ (statuspage->magic, REISER4_STATUS_MAGIC,
67044+ sizeof(REISER4_STATUS_MAGIC))) {
67045+ /* Magic does not match. */
67046+ kunmap_atomic((char *)statuspage, KM_USER0);
67047+ warning("green-2008", "Wrong magic in status block\n");
67048+ __free_pages(page, 0);
67049+ bio_put(bio);
67050+ return -EINVAL;
67051+ }
67052+ kunmap_atomic((char *)statuspage, KM_USER0);
67053+
67054+ get_super_private(sb)->status_page = page;
67055+ get_super_private(sb)->status_bio = bio;
67056+ return 0;
67057+}
67058+
67059+/* Query the status of fs. Returns if the FS can be safely mounted.
67060+ Also if "status" and "extended" parameters are given, it will fill
67061+ actual parts of status from disk there. */
67062+int reiser4_status_query(u64 * status, u64 * extended)
67063+{
67064+ struct super_block *sb = reiser4_get_current_sb();
67065+ struct reiser4_status *statuspage;
67066+ int retval;
67067+
67068+ if (!get_super_private(sb)->status_page) { // No status page?
67069+ return REISER4_STATUS_MOUNT_UNKNOWN;
67070+ }
67071+ statuspage = (struct reiser4_status *)
67072+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
67073+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
67074+ case REISER4_STATUS_OK:
67075+ retval = REISER4_STATUS_MOUNT_OK;
67076+ break;
67077+ case REISER4_STATUS_CORRUPTED:
67078+ retval = REISER4_STATUS_MOUNT_WARN;
67079+ break;
67080+ case REISER4_STATUS_DAMAGED:
67081+ case REISER4_STATUS_DESTROYED:
67082+ case REISER4_STATUS_IOERROR:
67083+ retval = REISER4_STATUS_MOUNT_RO;
67084+ break;
67085+ default:
67086+ retval = REISER4_STATUS_MOUNT_UNKNOWN;
67087+ break;
67088+ }
67089+
67090+ if (status)
67091+ *status = le64_to_cpu(get_unaligned(&statuspage->status));
67092+ if (extended)
67093+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
67094+
67095+ kunmap_atomic((char *)statuspage, KM_USER0);
67096+ return retval;
67097+}
67098+
67099+/* This function should be called when something bad happens (e.g. from reiser4_panic).
67100+ It fills the status structure and tries to push it to disk. */
67101+int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
67102+{
67103+ struct super_block *sb = reiser4_get_current_sb();
67104+ struct reiser4_status *statuspage;
67105+ struct bio *bio = get_super_private(sb)->status_bio;
67106+
67107+ if (!get_super_private(sb)->status_page) { // No status page?
67108+ return -1;
67109+ }
67110+ statuspage = (struct reiser4_status *)
67111+ kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
67112+
67113+ put_unaligned(cpu_to_le64(status), &statuspage->status);
67114+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
67115+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
67116+
67117+ kunmap_atomic((char *)statuspage, KM_USER0);
67118+ bio->bi_bdev = sb->s_bdev;
67119+ bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
67120+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
67121+ bio->bi_io_vec[0].bv_offset = 0;
67122+ bio->bi_vcnt = 1;
67123+ bio->bi_size = sb->s_blocksize;
67124+ bio->bi_end_io = reiser4_status_endio;
67125+ lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
67126+ /* We can block now, but we have no other choice anyway */
67127+ submit_bio(WRITE, bio);
67128+ blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
67129+ return 0; // We do not wait for io to finish.
67130+}
67131+
67132+/* Frees the page with status and bio structure. Should be called by disk format at umount time */
67133+int reiser4_status_finish(void)
67134+{
67135+ struct super_block *sb = reiser4_get_current_sb();
67136+
67137+ __free_pages(get_super_private(sb)->status_page, 0);
67138+ get_super_private(sb)->status_page = NULL;
67139+ bio_put(get_super_private(sb)->status_bio);
67140+ get_super_private(sb)->status_bio = NULL;
67141+ return 0;
67142+}
67143diff --git a/fs/reiser4/status_flags.h b/fs/reiser4/status_flags.h
67144new file mode 100644
67145index 0000000..6cfa5ad
67146--- /dev/null
67147+++ b/fs/reiser4/status_flags.h
67148@@ -0,0 +1,43 @@
67149+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
67150+ * reiser4/README */
67151+
67152+/* Here we declare structures and flags that store reiser4 status on disk.
67153+ The status that helps us to find out if the filesystem is valid or if it
67154+ contains some critical, or not so critical errors */
67155+
67156+#if !defined( __REISER4_STATUS_FLAGS_H__ )
67157+#define __REISER4_STATUS_FLAGS_H__
67158+
67159+#include "dformat.h"
67160+/* These are major status flags */
67161+#define REISER4_STATUS_OK 0
67162+#define REISER4_STATUS_CORRUPTED 0x1
67163+#define REISER4_STATUS_DAMAGED 0x2
67164+#define REISER4_STATUS_DESTROYED 0x4
67165+#define REISER4_STATUS_IOERROR 0x8
67166+
67167+/* Return values for reiser4_status_query() */
67168+#define REISER4_STATUS_MOUNT_OK 0
67169+#define REISER4_STATUS_MOUNT_WARN 1
67170+#define REISER4_STATUS_MOUNT_RO 2
67171+#define REISER4_STATUS_MOUNT_UNKNOWN -1
67172+
67173+#define REISER4_TEXTERROR_LEN 256
67174+
67175+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
67176+/* We probably need to keep its size under sector size which is 512 bytes */
67177+struct reiser4_status {
67178+ char magic[16];
67179+ d64 status; /* Current FS state */
67180+ d64 extended_status; /* Any additional info that might have sense in addition to "status". E.g.
67181+ last sector where io error happened if status is "io error encountered" */
67182+ d64 stacktrace[10]; /* Last ten functional calls made (addresses) */
67183+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
67184+};
67185+
67186+int reiser4_status_init(reiser4_block_nr block);
67187+int reiser4_status_query(u64 * status, u64 * extended);
67188+int reiser4_status_write(u64 status, u64 extended_status, char *message);
67189+int reiser4_status_finish(void);
67190+
67191+#endif
67192diff --git a/fs/reiser4/super.c b/fs/reiser4/super.c
67193new file mode 100644
67194index 0000000..bc4113e
67195--- /dev/null
67196+++ b/fs/reiser4/super.c
67197@@ -0,0 +1,316 @@
67198+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67199+ * reiser4/README */
67200+
67201+/* Super-block manipulations. */
67202+
67203+#include "debug.h"
67204+#include "dformat.h"
67205+#include "key.h"
67206+#include "plugin/security/perm.h"
67207+#include "plugin/space/space_allocator.h"
67208+#include "plugin/plugin.h"
67209+#include "tree.h"
67210+#include "vfs_ops.h"
67211+#include "super.h"
67212+#include "reiser4.h"
67213+
67214+#include <linux/types.h> /* for __u?? */
67215+#include <linux/fs.h> /* for struct super_block */
67216+
67217+static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
67218+static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
67219+static __u64 reserved_for_root(const struct super_block *super);
67220+
67221+/* Return reiser4-specific part of super block */
67222+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
67223+ * queried */ )
67224+{
67225+ return (reiser4_super_info_data *) super->s_fs_info;
67226+}
67227+
67228+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
67229+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
67230+{
67231+ assert("nikita-448", super != NULL);
67232+ assert("nikita-449", is_reiser4_super(super));
67233+ return (long)REISER4_SUPER_MAGIC;
67234+}
67235+
67236+/* functions to read/modify fields of reiser4_super_info_data */
67237+
67238+/* get number of blocks in file system */
67239+__u64 reiser4_block_count(const struct super_block *super /* super block
67240+ queried */ )
67241+{
67242+ assert("vs-494", super != NULL);
67243+ assert("vs-495", is_reiser4_super(super));
67244+ return get_super_private(super)->block_count;
67245+}
67246+
67247+#if REISER4_DEBUG
67248+/*
67249+ * number of blocks in the current file system
67250+ */
67251+__u64 reiser4_current_block_count(void)
67252+{
67253+ return get_current_super_private()->block_count;
67254+}
67255+#endif /* REISER4_DEBUG */
67256+
67257+/* set number of block in filesystem */
67258+void reiser4_set_block_count(const struct super_block *super, __u64 nr)
67259+{
67260+ assert("vs-501", super != NULL);
67261+ assert("vs-502", is_reiser4_super(super));
67262+ get_super_private(super)->block_count = nr;
67263+ /*
67264+ * The proper calculation of the reserved space counter (%5 of device
67265+ * block counter) we need a 64 bit division which is missing in Linux
67266+ * on i386 platform. Because we do not need a precise calculation here
67267+ * we can replace a div64 operation by this combination of
67268+ * multiplication and shift: 51. / (2^10) == .0498 .
67269+ * FIXME: this is a bug. It comes up only for very small filesystems
67270+ * which probably are never used. Nevertheless, it is a bug. Number of
67271+ * reserved blocks must be not less than maximal number of blocks which
67272+ * get grabbed with BA_RESERVED.
67273+ */
67274+ get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
67275+}
67276+
67277+/* amount of blocks used (allocated for data) in file system */
67278+__u64 reiser4_data_blocks(const struct super_block *super /* super block
67279+ queried */ )
67280+{
67281+ assert("nikita-452", super != NULL);
67282+ assert("nikita-453", is_reiser4_super(super));
67283+ return get_super_private(super)->blocks_used;
67284+}
67285+
67286+/* set number of block used in filesystem */
67287+void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
67288+{
67289+ assert("vs-503", super != NULL);
67290+ assert("vs-504", is_reiser4_super(super));
67291+ get_super_private(super)->blocks_used = nr;
67292+}
67293+
67294+/* amount of free blocks in file system */
67295+__u64 reiser4_free_blocks(const struct super_block *super /* super block
67296+ queried */ )
67297+{
67298+ assert("nikita-454", super != NULL);
67299+ assert("nikita-455", is_reiser4_super(super));
67300+ return get_super_private(super)->blocks_free;
67301+}
67302+
67303+/* set number of blocks free in filesystem */
67304+void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
67305+{
67306+ assert("vs-505", super != NULL);
67307+ assert("vs-506", is_reiser4_super(super));
67308+ get_super_private(super)->blocks_free = nr;
67309+}
67310+
67311+/* get mkfs unique identifier */
67312+__u32 reiser4_mkfs_id(const struct super_block *super /* super block
67313+ queried */ )
67314+{
67315+ assert("vpf-221", super != NULL);
67316+ assert("vpf-222", is_reiser4_super(super));
67317+ return get_super_private(super)->mkfs_id;
67318+}
67319+
67320+/* amount of free blocks in file system */
67321+__u64 reiser4_free_committed_blocks(const struct super_block *super)
67322+{
67323+ assert("vs-497", super != NULL);
67324+ assert("vs-498", is_reiser4_super(super));
67325+ return get_super_private(super)->blocks_free_committed;
67326+}
67327+
67328+/* amount of blocks in the file system reserved for @uid and @gid */
67329+long reiser4_reserved_blocks(const struct super_block *super /* super block
67330+ queried */ ,
67331+ uid_t uid /* user id */ ,
67332+ gid_t gid /* group id */ )
67333+{
67334+ long reserved;
67335+
67336+ assert("nikita-456", super != NULL);
67337+ assert("nikita-457", is_reiser4_super(super));
67338+
67339+ reserved = 0;
67340+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
67341+ reserved += reserved_for_gid(super, gid);
67342+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
67343+ reserved += reserved_for_uid(super, uid);
67344+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
67345+ reserved += reserved_for_root(super);
67346+ return reserved;
67347+}
67348+
67349+/* get/set value of/to grabbed blocks counter */
67350+__u64 reiser4_grabbed_blocks(const struct super_block * super)
67351+{
67352+ assert("zam-512", super != NULL);
67353+ assert("zam-513", is_reiser4_super(super));
67354+
67355+ return get_super_private(super)->blocks_grabbed;
67356+}
67357+
67358+__u64 reiser4_flush_reserved(const struct super_block * super)
67359+{
67360+ assert("vpf-285", super != NULL);
67361+ assert("vpf-286", is_reiser4_super(super));
67362+
67363+ return get_super_private(super)->blocks_flush_reserved;
67364+}
67365+
67366+/* get/set value of/to counter of fake allocated formatted blocks */
67367+__u64 reiser4_fake_allocated(const struct super_block * super)
67368+{
67369+ assert("zam-516", super != NULL);
67370+ assert("zam-517", is_reiser4_super(super));
67371+
67372+ return get_super_private(super)->blocks_fake_allocated;
67373+}
67374+
67375+/* get/set value of/to counter of fake allocated unformatted blocks */
67376+__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
67377+{
67378+ assert("zam-516", super != NULL);
67379+ assert("zam-517", is_reiser4_super(super));
67380+
67381+ return get_super_private(super)->blocks_fake_allocated_unformatted;
67382+}
67383+
67384+/* get/set value of/to counter of clustered blocks */
67385+__u64 reiser4_clustered_blocks(const struct super_block * super)
67386+{
67387+ assert("edward-601", super != NULL);
67388+ assert("edward-602", is_reiser4_super(super));
67389+
67390+ return get_super_private(super)->blocks_clustered;
67391+}
67392+
67393+/* space allocator used by this file system */
67394+reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
67395+ *super)
67396+{
67397+ assert("nikita-1965", super != NULL);
67398+ assert("nikita-1966", is_reiser4_super(super));
67399+ return &get_super_private(super)->space_allocator;
67400+}
67401+
67402+/* return fake inode used to bind formatted nodes in the page cache */
67403+struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
67404+ queried */ )
67405+{
67406+ assert("nikita-1757", super != NULL);
67407+ return get_super_private(super)->fake;
67408+}
67409+
67410+/* return fake inode used to bind copied on capture nodes in the page cache */
67411+struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
67412+ queried */ )
67413+{
67414+ assert("nikita-1757", super != NULL);
67415+ return get_super_private(super)->cc;
67416+}
67417+
67418+/* return fake inode used to bind bitmaps and journal heads */
67419+struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
67420+{
67421+ assert("nikita-17571", super != NULL);
67422+ return get_super_private(super)->bitmap;
67423+}
67424+
67425+/* tree used by this file system */
67426+reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
67427+ * queried */ )
67428+{
67429+ assert("nikita-460", super != NULL);
67430+ assert("nikita-461", is_reiser4_super(super));
67431+ return &get_super_private(super)->tree;
67432+}
67433+
67434+/* Check that @super is (looks like) reiser4 super block. This is mainly for
67435+ use in assertions. */
67436+int is_reiser4_super(const struct super_block *super /* super block
67437+ * queried */ )
67438+{
67439+ return
67440+ super != NULL &&
67441+ get_super_private(super) != NULL &&
67442+ super->s_op == &(get_super_private(super)->ops.super);
67443+}
67444+
67445+int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
67446+{
67447+ return test_bit((int)f, &get_super_private(super)->fs_flags);
67448+}
67449+
67450+/* amount of blocks reserved for given group in file system */
67451+static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
67452+ * block
67453+ * queried */ ,
67454+ gid_t gid UNUSED_ARG /* group id */ )
67455+{
67456+ return 0;
67457+}
67458+
67459+/* amount of blocks reserved for given user in file system */
67460+static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
67461+ block
67462+ queried */ ,
67463+ uid_t uid UNUSED_ARG /* user id */ )
67464+{
67465+ return 0;
67466+}
67467+
67468+/* amount of blocks reserved for super user in file system */
67469+static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
67470+ block
67471+ queried */ )
67472+{
67473+ return 0;
67474+}
67475+
67476+/*
67477+ * true if block number @blk makes sense for the file system at @super.
67478+ */
67479+int
67480+reiser4_blocknr_is_sane_for(const struct super_block *super,
67481+ const reiser4_block_nr * blk)
67482+{
67483+ reiser4_super_info_data *sbinfo;
67484+
67485+ assert("nikita-2957", super != NULL);
67486+ assert("nikita-2958", blk != NULL);
67487+
67488+ if (reiser4_blocknr_is_fake(blk))
67489+ return 1;
67490+
67491+ sbinfo = get_super_private(super);
67492+ return *blk < sbinfo->block_count;
67493+}
67494+
67495+#if REISER4_DEBUG
67496+/*
67497+ * true, if block number @blk makes sense for the current file system
67498+ */
67499+int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
67500+{
67501+ return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
67502+}
67503+#endif /* REISER4_DEBUG */
67504+
67505+/* Make Linus happy.
67506+ Local variables:
67507+ c-indentation-style: "K&R"
67508+ mode-name: "LC"
67509+ c-basic-offset: 8
67510+ tab-width: 8
67511+ fill-column: 120
67512+ End:
67513+*/
67514diff --git a/fs/reiser4/super.h b/fs/reiser4/super.h
67515new file mode 100644
67516index 0000000..120f021
67517--- /dev/null
67518+++ b/fs/reiser4/super.h
67519@@ -0,0 +1,464 @@
67520+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67521+ * reiser4/README */
67522+
67523+/* Super-block functions. See super.c for details. */
67524+
67525+#if !defined( __REISER4_SUPER_H__ )
67526+#define __REISER4_SUPER_H__
67527+
67528+#include "tree.h"
67529+#include "entd.h"
67530+#include "wander.h"
67531+#include "fsdata.h"
67532+#include "plugin/object.h"
67533+#include "plugin/space/space_allocator.h"
67534+
67535+/*
67536+ * Flush algorithms parameters.
67537+ */
67538+typedef struct {
67539+ unsigned relocate_threshold;
67540+ unsigned relocate_distance;
67541+ unsigned written_threshold;
67542+ unsigned scan_maxnodes;
67543+} flush_params;
67544+
67545+typedef enum {
67546+ /*
67547+ * True if this file system doesn't support hard-links (multiple names)
67548+ * for directories: this is default UNIX behavior.
67549+ *
67550+ * If hard-links on directories are not allowed, file system is Acyclic
67551+ * Directed Graph (modulo dot, and dotdot, of course).
67552+ *
67553+ * This is used by reiser4_link().
67554+ */
67555+ REISER4_ADG = 0,
67556+ /*
67557+ * set if all nodes in internal tree have the same node layout plugin.
67558+ * If so, znode_guess_plugin() will return tree->node_plugin in stead
67559+ * of guessing plugin by plugin id stored in the node.
67560+ */
67561+ REISER4_ONE_NODE_PLUGIN = 1,
67562+ /* if set, bsd gid assignment is supported. */
67563+ REISER4_BSD_GID = 2,
67564+ /* [mac]_time are 32 bit in inode */
67565+ REISER4_32_BIT_TIMES = 3,
67566+ /* load all bitmap blocks at mount time */
67567+ REISER4_DONT_LOAD_BITMAP = 5,
67568+ /* enforce atomicity during write(2) */
67569+ REISER4_ATOMIC_WRITE = 6,
67570+ /* don't use write barriers in the log writer code. */
67571+ REISER4_NO_WRITE_BARRIER = 7
67572+} reiser4_fs_flag;
67573+
67574+/*
67575+ * VFS related operation vectors.
67576+ */
67577+typedef struct object_ops {
67578+ struct super_operations super;
67579+ struct dentry_operations dentry;
67580+ struct export_operations export;
67581+} object_ops;
67582+
67583+/* reiser4-specific part of super block
67584+
67585+ Locking
67586+
67587+ Fields immutable after mount:
67588+
67589+ ->oid*
67590+ ->space*
67591+ ->default_[ug]id
67592+ ->mkfs_id
67593+ ->trace_flags
67594+ ->debug_flags
67595+ ->fs_flags
67596+ ->df_plug
67597+ ->optimal_io_size
67598+ ->plug
67599+ ->flush
67600+ ->u (bad name)
67601+ ->txnmgr
67602+ ->ra_params
67603+ ->fsuid
67604+ ->journal_header
67605+ ->journal_footer
67606+
67607+ Fields protected by ->lnode_guard
67608+
67609+ ->lnode_htable
67610+
67611+ Fields protected by per-super block spin lock
67612+
67613+ ->block_count
67614+ ->blocks_used
67615+ ->blocks_free
67616+ ->blocks_free_committed
67617+ ->blocks_grabbed
67618+ ->blocks_fake_allocated_unformatted
67619+ ->blocks_fake_allocated
67620+ ->blocks_flush_reserved
67621+ ->eflushed
67622+ ->blocknr_hint_default
67623+
67624+ After journal replaying during mount,
67625+
67626+ ->last_committed_tx
67627+
67628+ is protected by ->tmgr.commit_mutex
67629+
67630+ Invariants involving this data-type:
67631+
67632+ [sb-block-counts]
67633+ [sb-grabbed]
67634+ [sb-fake-allocated]
67635+*/
67636+struct reiser4_super_info_data {
67637+ /*
67638+ * guard spinlock which protects reiser4 super block fields (currently
67639+ * blocks_free, blocks_free_committed)
67640+ */
67641+ spinlock_t guard;
67642+
67643+ /* next oid that will be returned by oid_allocate() */
67644+ oid_t next_to_use;
67645+ /* total number of used oids */
67646+ oid_t oids_in_use;
67647+
67648+ /* space manager plugin */
67649+ reiser4_space_allocator space_allocator;
67650+
67651+ /* reiser4 internal tree */
67652+ reiser4_tree tree;
67653+
67654+ /*
67655+ * default user id used for light-weight files without their own
67656+ * stat-data.
67657+ */
67658+ uid_t default_uid;
67659+
67660+ /*
67661+ * default group id used for light-weight files without their own
67662+ * stat-data.
67663+ */
67664+ gid_t default_gid;
67665+
67666+ /* mkfs identifier generated at mkfs time. */
67667+ __u32 mkfs_id;
67668+ /* amount of blocks in a file system */
67669+ __u64 block_count;
67670+
67671+ /* inviolable reserve */
67672+ __u64 blocks_reserved;
67673+
67674+ /* amount of blocks used by file system data and meta-data. */
67675+ __u64 blocks_used;
67676+
67677+ /*
67678+ * amount of free blocks. This is "working" free blocks counter. It is
67679+ * like "working" bitmap, please see block_alloc.c for description.
67680+ */
67681+ __u64 blocks_free;
67682+
67683+ /*
67684+ * free block count for fs committed state. This is "commit" version of
67685+ * free block counter.
67686+ */
67687+ __u64 blocks_free_committed;
67688+
67689+ /*
67690+ * number of blocks reserved for further allocation, for all
67691+ * threads.
67692+ */
67693+ __u64 blocks_grabbed;
67694+
67695+ /* number of fake allocated unformatted blocks in tree. */
67696+ __u64 blocks_fake_allocated_unformatted;
67697+
67698+ /* number of fake allocated formatted blocks in tree. */
67699+ __u64 blocks_fake_allocated;
67700+
67701+ /* number of blocks reserved for flush operations. */
67702+ __u64 blocks_flush_reserved;
67703+
67704+ /* number of blocks reserved for cluster operations. */
67705+ __u64 blocks_clustered;
67706+
67707+ /* unique file-system identifier */
67708+ __u32 fsuid;
67709+
67710+ /* On-disk format version. If does not equal to the disk_format
67711+ plugin version, some format updates (e.g. enlarging plugin
67712+ set, etc) may have place on mount. */
67713+ int version;
67714+
67715+ /* file-system wide flags. See reiser4_fs_flag enum */
67716+ unsigned long fs_flags;
67717+
67718+ /* transaction manager */
67719+ txn_mgr tmgr;
67720+
67721+ /* ent thread */
67722+ entd_context entd;
67723+
67724+ /* fake inode used to bind formatted nodes */
67725+ struct inode *fake;
67726+ /* inode used to bind bitmaps (and journal heads) */
67727+ struct inode *bitmap;
67728+ /* inode used to bind copied on capture nodes */
67729+ struct inode *cc;
67730+
67731+ /* disk layout plugin */
67732+ disk_format_plugin *df_plug;
67733+
67734+ /* disk layout specific part of reiser4 super info data */
67735+ union {
67736+ format40_super_info format40;
67737+ } u;
67738+
67739+ /* value we return in st_blksize on stat(2) */
67740+ unsigned long optimal_io_size;
67741+
67742+ /* parameters for the flush algorithm */
67743+ flush_params flush;
67744+
67745+ /* pointers to jnodes for journal header and footer */
67746+ jnode *journal_header;
67747+ jnode *journal_footer;
67748+
67749+ journal_location jloc;
67750+
67751+ /* head block number of last committed transaction */
67752+ __u64 last_committed_tx;
67753+
67754+ /*
67755+ * we remember last written location for using as a hint for new block
67756+ * allocation
67757+ */
67758+ __u64 blocknr_hint_default;
67759+
67760+ /* committed number of files (oid allocator state variable ) */
67761+ __u64 nr_files_committed;
67762+
67763+ ra_params_t ra_params;
67764+
67765+ /*
67766+ * A mutex for serializing cut tree operation if out-of-free-space:
67767+ * the only one cut_tree thread is allowed to grab space from reserved
67768+ * area (it is 5% of disk space)
67769+ */
67770+ struct mutex delete_mutex;
67771+ /* task owning ->delete_mutex */
67772+ struct task_struct *delete_mutex_owner;
67773+
67774+ /* Diskmap's blocknumber */
67775+ __u64 diskmap_block;
67776+
67777+ /* What to do in case of error */
67778+ int onerror;
67779+
67780+ /* operations for objects on this file system */
67781+ object_ops ops;
67782+
67783+ /*
67784+ * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67785+ * more details
67786+ */
67787+ d_cursor_info d_info;
67788+
67789+#ifdef CONFIG_REISER4_BADBLOCKS
67790+ /* Alternative master superblock offset (in bytes) */
67791+ unsigned long altsuper;
67792+#endif
67793+ struct repacker *repacker;
67794+ struct page *status_page;
67795+ struct bio *status_bio;
67796+
67797+#if REISER4_DEBUG
67798+ /*
67799+ * minimum used blocks value (includes super blocks, bitmap blocks and
67800+ * other fs reserved areas), depends on fs format and fs size.
67801+ */
67802+ __u64 min_blocks_used;
67803+
67804+ /*
67805+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67806+ * are kept on a list anchored at sbinfo->all_jnodes. This list is
67807+ * protected by sbinfo->all_guard spin lock. This lock should be taken
67808+ * with _irq modifier, because it is also modified from interrupt
67809+ * contexts (by RCU).
67810+ */
67811+ spinlock_t all_guard;
67812+ /* list of all jnodes */
67813+ struct list_head all_jnodes;
67814+#endif
67815+ struct dentry *debugfs_root;
67816+};
67817+
67818+extern reiser4_super_info_data *get_super_private_nocheck(const struct
67819+ super_block *super);
67820+
67821+/* Return reiser4-specific part of super block */
67822+static inline reiser4_super_info_data *get_super_private(const struct
67823+ super_block *super)
67824+{
67825+ assert("nikita-447", super != NULL);
67826+
67827+ return (reiser4_super_info_data *) super->s_fs_info;
67828+}
67829+
67830+/* get ent context for the @super */
67831+static inline entd_context *get_entd_context(struct super_block *super)
67832+{
67833+ return &get_super_private(super)->entd;
67834+}
67835+
67836+/* "Current" super-block: main super block used during current system
67837+ call. Reference to this super block is stored in reiser4_context. */
67838+static inline struct super_block *reiser4_get_current_sb(void)
67839+{
67840+ return get_current_context()->super;
67841+}
67842+
67843+/* Reiser4-specific part of "current" super-block: main super block used
67844+ during current system call. Reference to this super block is stored in
67845+ reiser4_context. */
67846+static inline reiser4_super_info_data *get_current_super_private(void)
67847+{
67848+ return get_super_private(reiser4_get_current_sb());
67849+}
67850+
67851+static inline ra_params_t *get_current_super_ra_params(void)
67852+{
67853+ return &(get_current_super_private()->ra_params);
67854+}
67855+
67856+/*
67857+ * true, if file system on @super is read-only
67858+ */
67859+static inline int rofs_super(struct super_block *super)
67860+{
67861+ return super->s_flags & MS_RDONLY;
67862+}
67863+
67864+/*
67865+ * true, if @tree represents read-only file system
67866+ */
67867+static inline int rofs_tree(reiser4_tree * tree)
67868+{
67869+ return rofs_super(tree->super);
67870+}
67871+
67872+/*
67873+ * true, if file system where @inode lives on, is read-only
67874+ */
67875+static inline int rofs_inode(struct inode *inode)
67876+{
67877+ return rofs_super(inode->i_sb);
67878+}
67879+
67880+/*
67881+ * true, if file system where @node lives on, is read-only
67882+ */
67883+static inline int rofs_jnode(jnode * node)
67884+{
67885+ return rofs_tree(jnode_get_tree(node));
67886+}
67887+
67888+extern __u64 reiser4_current_block_count(void);
67889+
67890+extern void build_object_ops(struct super_block *super, object_ops * ops);
67891+
67892+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67893+
67894+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67895+{
67896+ spin_lock(&(sbinfo->guard));
67897+}
67898+
67899+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67900+{
67901+ assert_spin_locked(&(sbinfo->guard));
67902+ spin_unlock(&(sbinfo->guard));
67903+}
67904+
67905+extern __u64 reiser4_flush_reserved(const struct super_block *);
67906+extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67907+extern long reiser4_statfs_type(const struct super_block *super);
67908+extern __u64 reiser4_block_count(const struct super_block *super);
67909+extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67910+extern __u64 reiser4_data_blocks(const struct super_block *super);
67911+extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67912+extern __u64 reiser4_free_blocks(const struct super_block *super);
67913+extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67914+extern __u32 reiser4_mkfs_id(const struct super_block *super);
67915+
67916+extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67917+
67918+extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67919+extern __u64 reiser4_fake_allocated(const struct super_block *);
67920+extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67921+extern __u64 reiser4_clustered_blocks(const struct super_block *);
67922+
67923+extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67924+ gid_t gid);
67925+
67926+extern reiser4_space_allocator *
67927+reiser4_get_space_allocator(const struct super_block *super);
67928+extern reiser4_oid_allocator *
67929+reiser4_get_oid_allocator(const struct super_block *super);
67930+extern struct inode *reiser4_get_super_fake(const struct super_block *super);
67931+extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
67932+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
67933+extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
67934+extern int is_reiser4_super(const struct super_block *super);
67935+
67936+extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67937+extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67938+ const reiser4_block_nr * blk);
67939+extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67940+extern int reiser4_done_super(struct super_block *s);
67941+
67942+/* step of fill super */
67943+extern int reiser4_init_fs_info(struct super_block *);
67944+extern void reiser4_done_fs_info(struct super_block *);
67945+extern int reiser4_init_super_data(struct super_block *, char *opt_string);
67946+extern int reiser4_init_read_super(struct super_block *, int silent);
67947+extern int reiser4_init_root_inode(struct super_block *);
67948+extern reiser4_plugin *get_default_plugin(pset_member memb);
67949+
67950+/* Maximal possible object id. */
67951+#define ABSOLUTE_MAX_OID ((oid_t)~0)
67952+
67953+#define OIDS_RESERVED ( 1 << 16 )
67954+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67955+oid_t oid_allocate(struct super_block *);
67956+int oid_release(struct super_block *, oid_t);
67957+oid_t oid_next(const struct super_block *);
67958+void oid_count_allocated(void);
67959+void oid_count_released(void);
67960+long oids_used(const struct super_block *);
67961+
67962+#if REISER4_DEBUG
67963+void print_fs_info(const char *prefix, const struct super_block *);
67964+#endif
67965+
67966+extern void destroy_reiser4_cache(struct kmem_cache **);
67967+
67968+extern struct super_operations reiser4_super_operations;
67969+extern struct export_operations reiser4_export_operations;
67970+extern struct dentry_operations reiser4_dentry_operations;
67971+
67972+/* __REISER4_SUPER_H__ */
67973+#endif
67974+
67975+/*
67976+ * Local variables:
67977+ * c-indentation-style: "K&R"
67978+ * mode-name: "LC"
67979+ * c-basic-offset: 8
67980+ * tab-width: 8
67981+ * fill-column: 120
67982+ * End:
67983+ */
67984diff --git a/fs/reiser4/super_ops.c b/fs/reiser4/super_ops.c
67985new file mode 100644
67986index 0000000..41e9c1a
67987--- /dev/null
67988+++ b/fs/reiser4/super_ops.c
67989@@ -0,0 +1,730 @@
67990+/* Copyright 2005 by Hans Reiser, licensing governed by
67991+ * reiser4/README */
67992+
67993+#include "inode.h"
67994+#include "page_cache.h"
67995+#include "ktxnmgrd.h"
67996+#include "flush.h"
67997+#include "safe_link.h"
67998+
67999+#include <linux/vfs.h>
68000+#include <linux/writeback.h>
68001+#include <linux/mount.h>
68002+#include <linux/seq_file.h>
68003+#include <linux/debugfs.h>
68004+
68005+/* slab cache for inodes */
68006+static struct kmem_cache *inode_cache;
68007+
68008+static struct dentry *reiser4_debugfs_root = NULL;
68009+
68010+/**
68011+ * init_once - constructor for reiser4 inodes
68012+ * @obj: inode to be initialized
68013+ * @cache: cache @obj belongs to
68014+ * @flags: SLAB flags
68015+ *
68016+ * Initialization function to be called when new page is allocated by reiser4
68017+ * inode cache. It is set on inode cache creation.
68018+ */
68019+static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
68020+{
68021+ reiser4_inode_object *info;
68022+
68023+ info = obj;
68024+
68025+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
68026+ SLAB_CTOR_CONSTRUCTOR) {
68027+ /* initialize vfs inode */
68028+ inode_init_once(&info->vfs_inode);
68029+
68030+ /*
68031+ * initialize reiser4 specific part of inode.
68032+ * NOTE-NIKITA add here initializations for locks, list heads,
68033+ * etc. that will be added to our private inode part.
68034+ */
68035+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
68036+ init_rwsem(&info->p.conv_sem);
68037+ /* init semaphore which is used during inode loading */
68038+ loading_init_once(&info->p);
68039+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
68040+ GFP_ATOMIC);
68041+#if REISER4_DEBUG
68042+ info->p.nr_jnodes = 0;
68043+#endif
68044+ }
68045+}
68046+
68047+/**
68048+ * init_inodes - create inode cache
68049+ *
68050+ * Initializes slab cache of inodes. It is part of reiser4 module initialization.
68051+ */
68052+static int init_inodes(void)
68053+{
68054+ inode_cache = kmem_cache_create("reiser4_inode",
68055+ sizeof(reiser4_inode_object),
68056+ 0,
68057+ SLAB_HWCACHE_ALIGN |
68058+ SLAB_RECLAIM_ACCOUNT, init_once, NULL);
68059+ if (inode_cache == NULL)
68060+ return RETERR(-ENOMEM);
68061+ return 0;
68062+}
68063+
68064+/**
68065+ * done_inodes - delete inode cache
68066+ *
68067+ * This is called on reiser4 module unloading or system shutdown.
68068+ */
68069+static void done_inodes(void)
68070+{
68071+ destroy_reiser4_cache(&inode_cache);
68072+}
68073+
68074+/**
68075+ * reiser4_alloc_inode - alloc_inode of super operations
68076+ * @super: super block new inode is allocated for
68077+ *
68078+ * Allocates new inode, initializes reiser4 specific part of it.
68079+ */
68080+static struct inode *reiser4_alloc_inode(struct super_block *super)
68081+{
68082+ reiser4_inode_object *obj;
68083+
68084+ assert("nikita-1696", super != NULL);
68085+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
68086+ if (obj != NULL) {
68087+ reiser4_inode *info;
68088+
68089+ info = &obj->p;
68090+
68091+ info->pset = plugin_set_get_empty();
68092+ info->hset = plugin_set_get_empty();
68093+ info->extmask = 0;
68094+ info->locality_id = 0ull;
68095+ info->plugin_mask = 0;
68096+ info->heir_mask = 0;
68097+#if !REISER4_INO_IS_OID
68098+ info->oid_hi = 0;
68099+#endif
68100+ reiser4_seal_init(&info->sd_seal, NULL, NULL);
68101+ coord_init_invalid(&info->sd_coord, NULL);
68102+ info->flags = 0;
68103+ spin_lock_init(&info->guard);
68104+ /* this deals with info's loading semaphore */
68105+ loading_alloc(info);
68106+ info->vroot = UBER_TREE_ADDR;
68107+ return &obj->vfs_inode;
68108+ } else
68109+ return NULL;
68110+}
68111+
68112+/**
68113+ * reiser4_destroy_inode - destroy_inode of super operations
68114+ * @inode: inode being destroyed
68115+ *
68116+ * Puts reiser4 specific portion of inode, frees memory occupied by inode.
68117+ */
68118+static void reiser4_destroy_inode(struct inode *inode)
68119+{
68120+ reiser4_inode *info;
68121+
68122+ info = reiser4_inode_data(inode);
68123+
68124+ assert("vs-1220", inode_has_no_jnodes(info));
68125+
68126+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
68127+ file_plugin *fplug = inode_file_plugin(inode);
68128+ if (fplug->destroy_inode != NULL)
68129+ fplug->destroy_inode(inode);
68130+ }
68131+ reiser4_dispose_cursors(inode);
68132+ if (info->pset)
68133+ plugin_set_put(info->pset);
68134+ if (info->hset)
68135+ plugin_set_put(info->hset);
68136+
68137+ /*
68138+ * cannot add similar assertion about ->i_list as prune_icache return
68139+ * inode into slab with dangling ->list.{next,prev}. This is safe,
68140+ * because they are re-initialized in the new_inode().
68141+ */
68142+ assert("nikita-2895", list_empty(&inode->i_dentry));
68143+ assert("nikita-2896", hlist_unhashed(&inode->i_hash));
68144+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
68145+
68146+ /* this deals with info's loading semaphore */
68147+ loading_destroy(info);
68148+
68149+ kmem_cache_free(inode_cache,
68150+ container_of(info, reiser4_inode_object, p));
68151+}
68152+
68153+/**
68154+ * reiser4_dirty_inode - dirty_inode of super operations
68155+ * @inode: inode being dirtied
68156+ *
68157+ * Updates stat data.
68158+ */
68159+static void reiser4_dirty_inode(struct inode *inode)
68160+{
68161+ int result;
68162+
68163+ if (!is_in_reiser4_context())
68164+ return;
68165+ assert("", !IS_RDONLY(inode));
68166+ assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
68167+ get_current_context()->grabbed_blocks));
68168+
68169+ result = reiser4_update_sd(inode);
68170+ if (result)
68171+ warning("", "failed to dirty inode for %llu: %d",
68172+ get_inode_oid(inode), result);
68173+}
68174+
68175+/**
68176+ * reiser4_delete_inode - delete_inode of super operations
68177+ * @inode: inode to delete
68178+ *
68179+ * Calls file plugin's delete_object method to delete object items from
68180+ * filesystem tree and calls clear_inode.
68181+ */
68182+static void reiser4_delete_inode(struct inode *inode)
68183+{
68184+ reiser4_context *ctx;
68185+ file_plugin *fplug;
68186+
68187+ ctx = reiser4_init_context(inode->i_sb);
68188+ if (IS_ERR(ctx)) {
68189+ warning("vs-15", "failed to init context");
68190+ return;
68191+ }
68192+
68193+ if (is_inode_loaded(inode)) {
68194+ fplug = inode_file_plugin(inode);
68195+ if (fplug != NULL && fplug->delete_object != NULL)
68196+ fplug->delete_object(inode);
68197+ }
68198+
68199+ truncate_inode_pages(&inode->i_data, 0);
68200+ inode->i_blocks = 0;
68201+ clear_inode(inode);
68202+ reiser4_exit_context(ctx);
68203+}
68204+
68205+/**
68206+ * reiser4_put_super - put_super of super operations
68207+ * @super: super block to free
68208+ *
68209+ * Stops daemons, release resources, umounts in short.
68210+ */
68211+static void reiser4_put_super(struct super_block *super)
68212+{
68213+ reiser4_super_info_data *sbinfo;
68214+ reiser4_context *ctx;
68215+
68216+ sbinfo = get_super_private(super);
68217+ assert("vs-1699", sbinfo);
68218+
68219+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
68220+ debugfs_remove(sbinfo->tmgr.debugfs_id_count);
68221+ debugfs_remove(sbinfo->debugfs_root);
68222+
68223+ ctx = reiser4_init_context(super);
68224+ if (IS_ERR(ctx)) {
68225+ warning("vs-17", "failed to init context");
68226+ return;
68227+ }
68228+
68229+ /* have disk format plugin to free its resources */
68230+ if (get_super_private(super)->df_plug->release)
68231+ get_super_private(super)->df_plug->release(super);
68232+
68233+ reiser4_done_formatted_fake(super);
68234+
68235+ /* stop daemons: ktxnmgr and entd */
68236+ reiser4_done_entd(super);
68237+ reiser4_done_ktxnmgrd(super);
68238+ reiser4_done_txnmgr(&sbinfo->tmgr);
68239+
68240+ reiser4_done_fs_info(super);
68241+ reiser4_exit_context(ctx);
68242+}
68243+
68244+/**
68245+ * reiser4_write_super - write_super of super operations
68246+ * @super: super block to write
68247+ *
68248+ * Captures znode associated with super block, commits all transactions.
68249+ */
68250+static void reiser4_write_super(struct super_block *super)
68251+{
68252+ int ret;
68253+ reiser4_context *ctx;
68254+
68255+ assert("vs-1700", !rofs_super(super));
68256+
68257+ ctx = reiser4_init_context(super);
68258+ if (IS_ERR(ctx)) {
68259+ warning("vs-16", "failed to init context");
68260+ return;
68261+ }
68262+
68263+ ret = reiser4_capture_super_block(super);
68264+ if (ret != 0)
68265+ warning("vs-1701",
68266+ "reiser4_capture_super_block failed in write_super: %d",
68267+ ret);
68268+ ret = txnmgr_force_commit_all(super, 0);
68269+ if (ret != 0)
68270+ warning("jmacd-77113",
68271+ "txn_force failed in write_super: %d", ret);
68272+
68273+ super->s_dirt = 0;
68274+
68275+ reiser4_exit_context(ctx);
68276+}
68277+
68278+/**
68279+ * reiser4_statfs - statfs of super operations
68280+ * @super: super block of file system queried
68281+ * @statfs: buffer to fill with statistics
68282+ *
68283+ * Returns information about filesystem.
68284+ */
68285+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
68286+{
68287+ sector_t total;
68288+ sector_t reserved;
68289+ sector_t free;
68290+ sector_t forroot;
68291+ sector_t deleted;
68292+ reiser4_context *ctx;
68293+ struct super_block *super = dentry->d_sb;
68294+
68295+ assert("nikita-408", super != NULL);
68296+ assert("nikita-409", statfs != NULL);
68297+
68298+ ctx = reiser4_init_context(super);
68299+ if (IS_ERR(ctx))
68300+ return PTR_ERR(ctx);
68301+
68302+ statfs->f_type = reiser4_statfs_type(super);
68303+ statfs->f_bsize = super->s_blocksize;
68304+
68305+ /*
68306+ * 5% of total block space is reserved. This is needed for flush and
68307+ * for truncates (so that we are able to perform truncate/unlink even
68308+ * on the otherwise completely full file system). If this reservation
68309+ * is hidden from statfs(2), users will mistakenly guess that they
68310+ * have enough free space to complete some operation, which is
68311+ * frustrating.
68312+ *
68313+ * Another possible solution is to subtract ->blocks_reserved from
68314+ * ->f_bfree, but changing available space seems less intrusive than
68315+ * letting user to see 5% of disk space to be used directly after
68316+ * mkfs.
68317+ */
68318+ total = reiser4_block_count(super);
68319+ reserved = get_super_private(super)->blocks_reserved;
68320+ deleted = txnmgr_count_deleted_blocks();
68321+ free = reiser4_free_blocks(super) + deleted;
68322+ forroot = reiser4_reserved_blocks(super, 0, 0);
68323+
68324+ /*
68325+ * These counters may be in inconsistent state because we take the
68326+ * values without keeping any global spinlock. Here we do a sanity
68327+ * check that free block counter does not exceed the number of all
68328+ * blocks.
68329+ */
68330+ if (free > total)
68331+ free = total;
68332+ statfs->f_blocks = total - reserved;
68333+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
68334+ if (free > reserved)
68335+ free -= reserved;
68336+ else
68337+ free = 0;
68338+ statfs->f_bfree = free;
68339+
68340+ if (free > forroot)
68341+ free -= forroot;
68342+ else
68343+ free = 0;
68344+ statfs->f_bavail = free;
68345+
68346+ statfs->f_files = 0;
68347+ statfs->f_ffree = 0;
68348+
68349+ /* maximal acceptable name length depends on directory plugin. */
68350+ assert("nikita-3351", super->s_root->d_inode != NULL);
68351+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
68352+ reiser4_exit_context(ctx);
68353+ return 0;
68354+}
68355+
68356+/**
68357+ * reiser4_clear_inode - clear_inode of super operation
68358+ * @inode: inode about to destroy
68359+ *
68360+ * Does sanity checks: being destroyed should have all jnodes detached.
68361+ */
68362+static void reiser4_clear_inode(struct inode *inode)
68363+{
68364+#if REISER4_DEBUG
68365+ reiser4_inode *r4_inode;
68366+
68367+ r4_inode = reiser4_inode_data(inode);
68368+ if (!inode_has_no_jnodes(r4_inode))
68369+ warning("vs-1732", "reiser4 inode has %ld jnodes\n",
68370+ r4_inode->nr_jnodes);
68371+#endif
68372+}
68373+
68374+/**
68375+ * reiser4_sync_inodes - sync_inodes of super operations
68376+ * @super:
68377+ * @wbc:
68378+ *
68379+ * This method is called by background and non-backgound writeback. Reiser4's
68380+ * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
68381+ * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared
68382+ * mapping - dirty pages get into atoms. Writeout is called to flush some
68383+ * atoms.
68384+ */
68385+static void reiser4_sync_inodes(struct super_block *super,
68386+ struct writeback_control *wbc)
68387+{
68388+ reiser4_context *ctx;
68389+ long to_write;
68390+
68391+ if (wbc->for_kupdate)
68392+ /* reiser4 has its own means of periodical write-out */
68393+ return;
68394+
68395+ to_write = wbc->nr_to_write;
68396+ assert("vs-49", wbc->older_than_this == NULL);
68397+
68398+ ctx = reiser4_init_context(super);
68399+ if (IS_ERR(ctx)) {
68400+ warning("vs-13", "failed to init context");
68401+ return;
68402+ }
68403+
68404+ /*
68405+ * call reiser4_writepages for each of dirty inodes to turn dirty pages
68406+ * into transactions if they were not yet.
68407+ */
68408+ generic_sync_sb_inodes(super, wbc);
68409+
68410+ /* flush goes here */
68411+ wbc->nr_to_write = to_write;
68412+ reiser4_writeout(super, wbc);
68413+
68414+ /* avoid recursive calls to ->sync_inodes */
68415+ context_set_commit_async(ctx);
68416+ reiser4_exit_context(ctx);
68417+}
68418+
68419+/**
68420+ * reiser4_show_options - show_options of super operations
68421+ * @m: file where to write information
68422+ * @mnt: mount structure
68423+ *
68424+ * Makes reiser4 mount options visible in /proc/mounts.
68425+ */
68426+static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
68427+{
68428+ struct super_block *super;
68429+ reiser4_super_info_data *sbinfo;
68430+
68431+ super = mnt->mnt_sb;
68432+ sbinfo = get_super_private(super);
68433+
68434+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
68435+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
68436+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
68437+ seq_printf(m, ",atom_max_flushers=0x%x",
68438+ sbinfo->tmgr.atom_max_flushers);
68439+ seq_printf(m, ",cbk_cache_slots=0x%x",
68440+ sbinfo->tree.cbk_cache.nr_slots);
68441+
68442+ return 0;
68443+}
68444+
68445+struct super_operations reiser4_super_operations = {
68446+ .alloc_inode = reiser4_alloc_inode,
68447+ .destroy_inode = reiser4_destroy_inode,
68448+ .dirty_inode = reiser4_dirty_inode,
68449+ .delete_inode = reiser4_delete_inode,
68450+ .put_super = reiser4_put_super,
68451+ .write_super = reiser4_write_super,
68452+ .statfs = reiser4_statfs,
68453+ .clear_inode = reiser4_clear_inode,
68454+ .sync_inodes = reiser4_sync_inodes,
68455+ .show_options = reiser4_show_options
68456+};
68457+
68458+/**
68459+ * fill_super - initialize super block on mount
68460+ * @super: super block to fill
68461+ * @data: reiser4 specific mount option
68462+ * @silent:
68463+ *
68464+ * This is to be called by reiser4_get_sb. Mounts filesystem.
68465+ */
68466+static int fill_super(struct super_block *super, void *data, int silent)
68467+{
68468+ reiser4_context ctx;
68469+ int result;
68470+ reiser4_super_info_data *sbinfo;
68471+
68472+ assert("zam-989", super != NULL);
68473+
68474+ super->s_op = NULL;
68475+ init_stack_context(&ctx, super);
68476+
68477+ /* allocate reiser4 specific super block */
68478+ if ((result = reiser4_init_fs_info(super)) != 0)
68479+ goto failed_init_sinfo;
68480+
68481+ sbinfo = get_super_private(super);
68482+ /* initialize various reiser4 parameters, parse mount options */
68483+ if ((result = reiser4_init_super_data(super, data)) != 0)
68484+ goto failed_init_super_data;
68485+
68486+ /* read reiser4 master super block, initialize disk format plugin */
68487+ if ((result = reiser4_init_read_super(super, silent)) != 0)
68488+ goto failed_init_read_super;
68489+
68490+ /* initialize transaction manager */
68491+ reiser4_init_txnmgr(&sbinfo->tmgr);
68492+
68493+ /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
68494+ if ((result = reiser4_init_ktxnmgrd(super)) != 0)
68495+ goto failed_init_ktxnmgrd;
68496+
68497+ /* initialize entd context and start kernel thread entd */
68498+ if ((result = reiser4_init_entd(super)) != 0)
68499+ goto failed_init_entd;
68500+
68501+ /* initialize address spaces for formatted nodes and bitmaps */
68502+ if ((result = reiser4_init_formatted_fake(super)) != 0)
68503+ goto failed_init_formatted_fake;
68504+
68505+ /* initialize disk format plugin */
68506+ if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
68507+ goto failed_init_disk_format;
68508+
68509+ /*
68510+ * There are some 'committed' versions of reiser4 super block counters,
68511+ * which correspond to reiser4 on-disk state. These counters are
68512+ * initialized here
68513+ */
68514+ sbinfo->blocks_free_committed = sbinfo->blocks_free;
68515+ sbinfo->nr_files_committed = oids_used(super);
68516+
68517+ /* get inode of root directory */
68518+ if ((result = reiser4_init_root_inode(super)) != 0)
68519+ goto failed_init_root_inode;
68520+
68521+ if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
68522+ goto failed_update_format_version;
68523+
68524+ process_safelinks(super);
68525+ reiser4_exit_context(&ctx);
68526+
68527+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
68528+ reiser4_debugfs_root);
68529+ if (sbinfo->debugfs_root) {
68530+ sbinfo->tmgr.debugfs_atom_count =
68531+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
68532+ sbinfo->debugfs_root,
68533+ &sbinfo->tmgr.atom_count);
68534+ sbinfo->tmgr.debugfs_id_count =
68535+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
68536+ sbinfo->debugfs_root,
68537+ &sbinfo->tmgr.id_count);
68538+ }
68539+ return 0;
68540+
68541+ failed_update_format_version:
68542+ failed_init_root_inode:
68543+ if (sbinfo->df_plug->release)
68544+ sbinfo->df_plug->release(super);
68545+ failed_init_disk_format:
68546+ reiser4_done_formatted_fake(super);
68547+ failed_init_formatted_fake:
68548+ reiser4_done_entd(super);
68549+ failed_init_entd:
68550+ reiser4_done_ktxnmgrd(super);
68551+ failed_init_ktxnmgrd:
68552+ reiser4_done_txnmgr(&sbinfo->tmgr);
68553+ failed_init_read_super:
68554+ failed_init_super_data:
68555+ reiser4_done_fs_info(super);
68556+ failed_init_sinfo:
68557+ reiser4_exit_context(&ctx);
68558+ return result;
68559+}
68560+
68561+/**
68562+ * reiser4_get_sb - get_sb of file_system_type operations
68563+ * @fs_type:
68564+ * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
68565+ * @dev_name: block device file name
68566+ * @data: specific mount options
68567+ *
68568+ * Reiser4 mount entry.
68569+ */
68570+static int reiser4_get_sb(struct file_system_type *fs_type,
68571+ int flags,
68572+ const char *dev_name,
68573+ void *data,
68574+ struct vfsmount *mnt)
68575+{
68576+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
68577+}
68578+
68579+/* structure describing the reiser4 filesystem implementation */
68580+static struct file_system_type reiser4_fs_type = {
68581+ .owner = THIS_MODULE,
68582+ .name = "reiser4",
68583+ .fs_flags = FS_REQUIRES_DEV,
68584+ .get_sb = reiser4_get_sb,
68585+ .kill_sb = kill_block_super,
68586+ .next = NULL
68587+};
68588+
68589+void destroy_reiser4_cache(struct kmem_cache **cachep)
68590+{
68591+ kmem_cache_destroy(*cachep);
68592+ *cachep = NULL;
68593+}
68594+
68595+/**
68596+ * init_reiser4 - reiser4 initialization entry point
68597+ *
68598+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
68599+ * on kernel initialization or during reiser4 module load.
68600+ */
68601+static int __init init_reiser4(void)
68602+{
68603+ int result;
68604+
68605+ printk(KERN_INFO
68606+ "Loading Reiser4. "
68607+ "See www.namesys.com for a description of Reiser4.\n");
68608+
68609+ /* initialize slab cache of inodes */
68610+ if ((result = init_inodes()) != 0)
68611+ goto failed_inode_cache;
68612+
68613+ /* initialize cache of znodes */
68614+ if ((result = init_znodes()) != 0)
68615+ goto failed_init_znodes;
68616+
68617+ /* initialize all plugins */
68618+ if ((result = init_plugins()) != 0)
68619+ goto failed_init_plugins;
68620+
68621+ /* initialize cache of plugin_set-s and plugin_set's hash table */
68622+ if ((result = init_plugin_set()) != 0)
68623+ goto failed_init_plugin_set;
68624+
68625+ /* initialize caches of txn_atom-s and txn_handle-s */
68626+ if ((result = init_txnmgr_static()) != 0)
68627+ goto failed_init_txnmgr_static;
68628+
68629+ /* initialize cache of jnodes */
68630+ if ((result = init_jnodes()) != 0)
68631+ goto failed_init_jnodes;
68632+
68633+ /* initialize cache of flush queues */
68634+ if ((result = reiser4_init_fqs()) != 0)
68635+ goto failed_init_fqs;
68636+
68637+ /* initialize cache of structures attached to dentry->d_fsdata */
68638+ if ((result = reiser4_init_dentry_fsdata()) != 0)
68639+ goto failed_init_dentry_fsdata;
68640+
68641+ /* initialize cache of structures attached to file->private_data */
68642+ if ((result = reiser4_init_file_fsdata()) != 0)
68643+ goto failed_init_file_fsdata;
68644+
68645+ /*
68646+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
68647+ * more details
68648+ */
68649+ if ((result = reiser4_init_d_cursor()) != 0)
68650+ goto failed_init_d_cursor;
68651+
68652+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
68653+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
68654+ return 0;
68655+ }
68656+
68657+ reiser4_done_d_cursor();
68658+ failed_init_d_cursor:
68659+ reiser4_done_file_fsdata();
68660+ failed_init_file_fsdata:
68661+ reiser4_done_dentry_fsdata();
68662+ failed_init_dentry_fsdata:
68663+ reiser4_done_fqs();
68664+ failed_init_fqs:
68665+ done_jnodes();
68666+ failed_init_jnodes:
68667+ done_txnmgr_static();
68668+ failed_init_txnmgr_static:
68669+ done_plugin_set();
68670+ failed_init_plugin_set:
68671+ failed_init_plugins:
68672+ done_znodes();
68673+ failed_init_znodes:
68674+ done_inodes();
68675+ failed_inode_cache:
68676+ return result;
68677+}
68678+
68679+/**
68680+ * done_reiser4 - reiser4 exit entry point
68681+ *
68682+ * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
68683+ * or at module unload.
68684+ */
68685+static void __exit done_reiser4(void)
68686+{
68687+ int result;
68688+
68689+ debugfs_remove(reiser4_debugfs_root);
68690+ result = unregister_filesystem(&reiser4_fs_type);
68691+ BUG_ON(result != 0);
68692+ reiser4_done_d_cursor();
68693+ reiser4_done_file_fsdata();
68694+ reiser4_done_dentry_fsdata();
68695+ reiser4_done_fqs();
68696+ done_jnodes();
68697+ done_txnmgr_static();
68698+ done_plugin_set();
68699+ done_znodes();
68700+ destroy_reiser4_cache(&inode_cache);
68701+}
68702+
68703+module_init(init_reiser4);
68704+module_exit(done_reiser4);
68705+
68706+MODULE_DESCRIPTION("Reiser4 filesystem");
68707+MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
68708+
68709+MODULE_LICENSE("GPL");
68710+
68711+/*
68712+ * Local variables:
68713+ * c-indentation-style: "K&R"
68714+ * mode-name: "LC"
68715+ * c-basic-offset: 8
68716+ * tab-width: 8
68717+ * fill-column: 79
68718+ * End:
68719+ */
68720diff --git a/fs/reiser4/tap.c b/fs/reiser4/tap.c
68721new file mode 100644
68722index 0000000..cfa5179
68723--- /dev/null
68724+++ b/fs/reiser4/tap.c
68725@@ -0,0 +1,377 @@
68726+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68727+ * reiser4/README */
68728+
68729+/*
68730+ Tree Access Pointer (tap).
68731+
68732+ tap is data structure combining coord and lock handle (mostly). It is
68733+ useful when one has to scan tree nodes (for example, in readdir, or flush),
68734+ for tap functions allow to move tap in either direction transparently
68735+ crossing unit/item/node borders.
68736+
68737+ Tap doesn't provide automatic synchronization of its fields as it is
68738+ supposed to be per-thread object.
68739+*/
68740+
68741+#include "forward.h"
68742+#include "debug.h"
68743+#include "coord.h"
68744+#include "tree.h"
68745+#include "context.h"
68746+#include "tap.h"
68747+#include "znode.h"
68748+#include "tree_walk.h"
68749+
68750+#if REISER4_DEBUG
68751+static int tap_invariant(const tap_t * tap);
68752+static void tap_check(const tap_t * tap);
68753+#else
68754+#define tap_check(tap) noop
68755+#endif
68756+
68757+/** load node tap is pointing to, if not loaded already */
68758+int reiser4_tap_load(tap_t * tap)
68759+{
68760+ tap_check(tap);
68761+ if (tap->loaded == 0) {
68762+ int result;
68763+
68764+ result = zload_ra(tap->coord->node, &tap->ra_info);
68765+ if (result != 0)
68766+ return result;
68767+ coord_clear_iplug(tap->coord);
68768+ }
68769+ ++tap->loaded;
68770+ tap_check(tap);
68771+ return 0;
68772+}
68773+
68774+/** release node tap is pointing to. Dual to tap_load() */
68775+void reiser4_tap_relse(tap_t * tap)
68776+{
68777+ tap_check(tap);
68778+ if (tap->loaded > 0) {
68779+ --tap->loaded;
68780+ if (tap->loaded == 0) {
68781+ zrelse(tap->coord->node);
68782+ }
68783+ }
68784+ tap_check(tap);
68785+}
68786+
68787+/**
68788+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68789+ * @mode
68790+ */
68791+void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68792+ znode_lock_mode mode)
68793+{
68794+ tap->coord = coord;
68795+ tap->lh = lh;
68796+ tap->mode = mode;
68797+ tap->loaded = 0;
68798+ INIT_LIST_HEAD(&tap->linkage);
68799+ reiser4_init_ra_info(&tap->ra_info);
68800+}
68801+
68802+/** add @tap to the per-thread list of all taps */
68803+void reiser4_tap_monitor(tap_t * tap)
68804+{
68805+ assert("nikita-2623", tap != NULL);
68806+ tap_check(tap);
68807+ list_add(&tap->linkage, reiser4_taps_list());
68808+ tap_check(tap);
68809+}
68810+
68811+/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68812+ * loaded. */
68813+void reiser4_tap_copy(tap_t * dst, tap_t * src)
68814+{
68815+ assert("nikita-3193", src != NULL);
68816+ assert("nikita-3194", dst != NULL);
68817+
68818+ *dst->coord = *src->coord;
68819+ if (src->lh->node)
68820+ copy_lh(dst->lh, src->lh);
68821+ dst->mode = src->mode;
68822+ dst->loaded = 0;
68823+ INIT_LIST_HEAD(&dst->linkage);
68824+ dst->ra_info = src->ra_info;
68825+}
68826+
68827+/** finish with @tap */
68828+void reiser4_tap_done(tap_t * tap)
68829+{
68830+ assert("nikita-2565", tap != NULL);
68831+ tap_check(tap);
68832+ if (tap->loaded > 0)
68833+ zrelse(tap->coord->node);
68834+ done_lh(tap->lh);
68835+ tap->loaded = 0;
68836+ list_del_init(&tap->linkage);
68837+ tap->coord->node = NULL;
68838+}
68839+
68840+/**
68841+ * move @tap to the new node, locked with @target. Load @target, if @tap was
68842+ * already loaded.
68843+ */
68844+int reiser4_tap_move(tap_t * tap, lock_handle * target)
68845+{
68846+ int result = 0;
68847+
68848+ assert("nikita-2567", tap != NULL);
68849+ assert("nikita-2568", target != NULL);
68850+ assert("nikita-2570", target->node != NULL);
68851+ assert("nikita-2569", tap->coord->node == tap->lh->node);
68852+
68853+ tap_check(tap);
68854+ if (tap->loaded > 0)
68855+ result = zload_ra(target->node, &tap->ra_info);
68856+
68857+ if (result == 0) {
68858+ if (tap->loaded > 0)
68859+ zrelse(tap->coord->node);
68860+ done_lh(tap->lh);
68861+ copy_lh(tap->lh, target);
68862+ tap->coord->node = target->node;
68863+ coord_clear_iplug(tap->coord);
68864+ }
68865+ tap_check(tap);
68866+ return result;
68867+}
68868+
68869+/**
68870+ * move @tap to @target. Acquire lock on @target, if @tap was already
68871+ * loaded.
68872+ */
68873+static int tap_to(tap_t * tap, znode * target)
68874+{
68875+ int result;
68876+
68877+ assert("nikita-2624", tap != NULL);
68878+ assert("nikita-2625", target != NULL);
68879+
68880+ tap_check(tap);
68881+ result = 0;
68882+ if (tap->coord->node != target) {
68883+ lock_handle here;
68884+
68885+ init_lh(&here);
68886+ result = longterm_lock_znode(&here, target,
68887+ tap->mode, ZNODE_LOCK_HIPRI);
68888+ if (result == 0) {
68889+ result = reiser4_tap_move(tap, &here);
68890+ done_lh(&here);
68891+ }
68892+ }
68893+ tap_check(tap);
68894+ return result;
68895+}
68896+
68897+/**
68898+ * move @tap to given @target, loading and locking @target->node if
68899+ * necessary
68900+ */
68901+int tap_to_coord(tap_t * tap, coord_t * target)
68902+{
68903+ int result;
68904+
68905+ tap_check(tap);
68906+ result = tap_to(tap, target->node);
68907+ if (result == 0)
68908+ coord_dup(tap->coord, target);
68909+ tap_check(tap);
68910+ return result;
68911+}
68912+
68913+/** return list of all taps */
68914+struct list_head *reiser4_taps_list(void)
68915+{
68916+ return &get_current_context()->taps;
68917+}
68918+
68919+/** helper function for go_{next,prev}_{item,unit,node}() */
68920+int go_dir_el(tap_t * tap, sideof dir, int units_p)
68921+{
68922+ coord_t dup;
68923+ coord_t *coord;
68924+ int result;
68925+
68926+ int (*coord_dir) (coord_t *);
68927+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68928+ void (*coord_init) (coord_t *, const znode *);
68929+ ON_DEBUG(int (*coord_check) (const coord_t *));
68930+
68931+ assert("nikita-2556", tap != NULL);
68932+ assert("nikita-2557", tap->coord != NULL);
68933+ assert("nikita-2558", tap->lh != NULL);
68934+ assert("nikita-2559", tap->coord->node != NULL);
68935+
68936+ tap_check(tap);
68937+ if (dir == LEFT_SIDE) {
68938+ coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68939+ get_dir_neighbor = reiser4_get_left_neighbor;
68940+ coord_init = coord_init_last_unit;
68941+ } else {
68942+ coord_dir = units_p ? coord_next_unit : coord_next_item;
68943+ get_dir_neighbor = reiser4_get_right_neighbor;
68944+ coord_init = coord_init_first_unit;
68945+ }
68946+ ON_DEBUG(coord_check =
68947+ units_p ? coord_is_existing_unit : coord_is_existing_item);
68948+ assert("nikita-2560", coord_check(tap->coord));
68949+
68950+ coord = tap->coord;
68951+ coord_dup(&dup, coord);
68952+ if (coord_dir(&dup) != 0) {
68953+ do {
68954+ /* move to the left neighboring node */
68955+ lock_handle dup;
68956+
68957+ init_lh(&dup);
68958+ result =
68959+ get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68960+ GN_CAN_USE_UPPER_LEVELS);
68961+ if (result == 0) {
68962+ result = reiser4_tap_move(tap, &dup);
68963+ if (result == 0)
68964+ coord_init(tap->coord, dup.node);
68965+ done_lh(&dup);
68966+ }
68967+ /* skip empty nodes */
68968+ } while ((result == 0) && node_is_empty(coord->node));
68969+ } else {
68970+ result = 0;
68971+ coord_dup(coord, &dup);
68972+ }
68973+ assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68974+ tap_check(tap);
68975+ return result;
68976+}
68977+
68978+/**
68979+ * move @tap to the next unit, transparently crossing item and node
68980+ * boundaries
68981+ */
68982+int go_next_unit(tap_t * tap)
68983+{
68984+ return go_dir_el(tap, RIGHT_SIDE, 1);
68985+}
68986+
68987+/**
68988+ * move @tap to the previous unit, transparently crossing item and node
68989+ * boundaries
68990+ */
68991+int go_prev_unit(tap_t * tap)
68992+{
68993+ return go_dir_el(tap, LEFT_SIDE, 1);
68994+}
68995+
68996+/**
68997+ * @shift times apply @actor to the @tap. This is used to move @tap by
68998+ * @shift units (or items, or nodes) in either direction.
68999+ */
69000+static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
69001+{
69002+ int result;
69003+
69004+ assert("nikita-2555", shift >= 0);
69005+ assert("nikita-2562", tap->coord->node == tap->lh->node);
69006+
69007+ tap_check(tap);
69008+ result = reiser4_tap_load(tap);
69009+ if (result != 0)
69010+ return result;
69011+
69012+ for (; shift > 0; --shift) {
69013+ result = actor(tap);
69014+ assert("nikita-2563", tap->coord->node == tap->lh->node);
69015+ if (result != 0)
69016+ break;
69017+ }
69018+ reiser4_tap_relse(tap);
69019+ tap_check(tap);
69020+ return result;
69021+}
69022+
69023+/** move @tap @shift units rightward */
69024+int rewind_right(tap_t * tap, int shift)
69025+{
69026+ return rewind_to(tap, go_next_unit, shift);
69027+}
69028+
69029+/** move @tap @shift units leftward */
69030+int rewind_left(tap_t * tap, int shift)
69031+{
69032+ return rewind_to(tap, go_prev_unit, shift);
69033+}
69034+
69035+#if REISER4_DEBUG
69036+/** debugging function: print @tap content in human readable form */
69037+static void print_tap(const char *prefix, const tap_t * tap)
69038+{
69039+ if (tap == NULL) {
69040+ printk("%s: null tap\n", prefix);
69041+ return;
69042+ }
69043+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
69044+ tap->loaded, (&tap->linkage == tap->linkage.next &&
69045+ &tap->linkage == tap->linkage.prev),
69046+ tap->lh->node,
69047+ lock_mode_name(tap->mode));
69048+ print_coord("\tcoord", tap->coord, 0);
69049+}
69050+
69051+/** check [tap-sane] invariant */
69052+static int tap_invariant(const tap_t * tap)
69053+{
69054+ /* [tap-sane] invariant */
69055+
69056+ if (tap == NULL)
69057+ return 1;
69058+ /* tap->mode is one of
69059+ *
69060+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
69061+ */
69062+ if (tap->mode != ZNODE_NO_LOCK &&
69063+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
69064+ return 2;
69065+ /* tap->coord != NULL, and */
69066+ if (tap->coord == NULL)
69067+ return 3;
69068+ /* tap->lh != NULL, and */
69069+ if (tap->lh == NULL)
69070+ return 4;
69071+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
69072+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
69073+ return 5;
69074+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
69075+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
69076+ return 6;
69077+ return 0;
69078+}
69079+
69080+/** debugging function: check internal @tap consistency */
69081+static void tap_check(const tap_t * tap)
69082+{
69083+ int result;
69084+
69085+ result = tap_invariant(tap);
69086+ if (result != 0) {
69087+ print_tap("broken", tap);
69088+ reiser4_panic("nikita-2831", "tap broken: %i\n", result);
69089+ }
69090+}
69091+#endif
69092+
69093+/* Make Linus happy.
69094+ Local variables:
69095+ c-indentation-style: "K&R"
69096+ mode-name: "LC"
69097+ c-basic-offset: 8
69098+ tab-width: 8
69099+ fill-column: 120
69100+ scroll-step: 1
69101+ End:
69102+*/
69103diff --git a/fs/reiser4/tap.h b/fs/reiser4/tap.h
69104new file mode 100644
69105index 0000000..1416729
69106--- /dev/null
69107+++ b/fs/reiser4/tap.h
69108@@ -0,0 +1,70 @@
69109+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
69110+
69111+/* Tree Access Pointers. See tap.c for more details. */
69112+
69113+#if !defined( __REISER4_TAP_H__ )
69114+#define __REISER4_TAP_H__
69115+
69116+#include "forward.h"
69117+#include "readahead.h"
69118+
69119+/**
69120+ tree_access_pointer aka tap. Data structure combining coord_t and lock
69121+ handle.
69122+ Invariants involving this data-type, see doc/lock-ordering for details:
69123+
69124+ [tap-sane]
69125+ */
69126+struct tree_access_pointer {
69127+ /* coord tap is at */
69128+ coord_t *coord;
69129+ /* lock handle on ->coord->node */
69130+ lock_handle *lh;
69131+ /* mode of lock acquired by this tap */
69132+ znode_lock_mode mode;
69133+ /* incremented by reiser4_tap_load().
69134+ Decremented by reiser4_tap_relse(). */
69135+ int loaded;
69136+ /* list of taps */
69137+ struct list_head linkage;
69138+ /* read-ahead hint */
69139+ ra_info_t ra_info;
69140+};
69141+
69142+typedef int (*go_actor_t) (tap_t * tap);
69143+
69144+extern int reiser4_tap_load(tap_t * tap);
69145+extern void reiser4_tap_relse(tap_t * tap);
69146+extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
69147+ znode_lock_mode mode);
69148+extern void reiser4_tap_monitor(tap_t * tap);
69149+extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
69150+extern void reiser4_tap_done(tap_t * tap);
69151+extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
69152+extern int tap_to_coord(tap_t * tap, coord_t * target);
69153+
69154+extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
69155+extern int go_next_unit(tap_t * tap);
69156+extern int go_prev_unit(tap_t * tap);
69157+extern int rewind_right(tap_t * tap, int shift);
69158+extern int rewind_left(tap_t * tap, int shift);
69159+
69160+extern struct list_head *reiser4_taps_list(void);
69161+
69162+#define for_all_taps(tap) \
69163+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
69164+ reiser4_taps_list() != &tap->linkage; \
69165+ tap = list_entry(tap->linkage.next, tap_t, linkage))
69166+
69167+/* __REISER4_TAP_H__ */
69168+#endif
69169+/* Make Linus happy.
69170+ Local variables:
69171+ c-indentation-style: "K&R"
69172+ mode-name: "LC"
69173+ c-basic-offset: 8
69174+ tab-width: 8
69175+ fill-column: 120
69176+ scroll-step: 1
69177+ End:
69178+*/
69179diff --git a/fs/reiser4/tree.c b/fs/reiser4/tree.c
69180new file mode 100644
69181index 0000000..32548d2
69182--- /dev/null
69183+++ b/fs/reiser4/tree.c
69184@@ -0,0 +1,1876 @@
69185+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69186+ * reiser4/README */
69187+
69188+/*
69189+ * KEYS IN A TREE.
69190+ *
69191+ * The tree consists of nodes located on the disk. Node in the tree is either
69192+ * formatted or unformatted. Formatted node is one that has structure
69193+ * understood by the tree balancing and traversal code. Formatted nodes are
69194+ * further classified into leaf and internal nodes. Latter distinctions is
69195+ * (almost) of only historical importance: general structure of leaves and
69196+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
69197+ * that are part of bodies of ordinary files and attributes.
69198+ *
69199+ * Each node in the tree spawns some interval in the key space. Key ranges for
69200+ * all nodes in the tree are disjoint. Actually, this only holds in some weak
69201+ * sense, because of the non-unique keys: intersection of key ranges for
69202+ * different nodes is either empty, or consists of exactly one key.
69203+ *
69204+ * Formatted node consists of a sequence of items. Each item spawns some
69205+ * interval in key space. Key ranges for all items in a tree are disjoint,
69206+ * modulo non-unique keys again. Items within nodes are ordered in the key
69207+ * order of the smallest key in a item.
69208+ *
69209+ * Particular type of item can be further split into units. Unit is piece of
69210+ * item that can be cut from item and moved into another item of the same
69211+ * time. Units are used by balancing code to repack data during balancing.
69212+ *
69213+ * Unit can be further split into smaller entities (for example, extent unit
69214+ * represents several pages, and it is natural for extent code to operate on
69215+ * particular pages and even bytes within one unit), but this is of no
69216+ * relevance to the generic balancing and lookup code.
69217+ *
69218+ * Although item is said to "spawn" range or interval of keys, it is not
69219+ * necessary that item contains piece of data addressable by each and every
69220+ * key in this range. For example, compound directory item, consisting of
69221+ * units corresponding to directory entries and keyed by hashes of file names,
69222+ * looks more as having "discrete spectrum": only some disjoint keys inside
69223+ * range occupied by this item really address data.
69224+ *
69225+ * No than less, each item always has well-defined least (minimal) key, that
69226+ * is recorded in item header, stored in the node this item is in. Also, item
69227+ * plugin can optionally define method ->max_key_inside() returning maximal
69228+ * key that can _possibly_ be located within this item. This method is used
69229+ * (mainly) to determine when given piece of data should be merged into
69230+ * existing item, in stead of creating new one. Because of this, even though
69231+ * ->max_key_inside() can be larger that any key actually located in the item,
69232+ * intervals
69233+ *
69234+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
69235+ *
69236+ * are still disjoint for all items within the _same_ node.
69237+ *
69238+ * In memory node is represented by znode. It plays several roles:
69239+ *
69240+ * . something locks are taken on
69241+ *
69242+ * . something tracked by transaction manager (this is going to change)
69243+ *
69244+ * . something used to access node data
69245+ *
69246+ * . something used to maintain tree structure in memory: sibling and
69247+ * parental linkage.
69248+ *
69249+ * . something used to organize nodes into "slums"
69250+ *
69251+ * More on znodes see in znode.[ch]
69252+ *
69253+ * DELIMITING KEYS
69254+ *
69255+ * To simplify balancing, allow some flexibility in locking and speed up
69256+ * important coord cache optimization, we keep delimiting keys of nodes in
69257+ * memory. Depending on disk format (implemented by appropriate node plugin)
69258+ * node on disk can record both left and right delimiting key, only one of
69259+ * them, or none. Still, our balancing and tree traversal code keep both
69260+ * delimiting keys for a node that is in memory stored in the znode. When
69261+ * node is first brought into memory during tree traversal, its left
69262+ * delimiting key is taken from its parent, and its right delimiting key is
69263+ * either next key in its parent, or is right delimiting key of parent if
69264+ * node is the rightmost child of parent.
69265+ *
69266+ * Physical consistency of delimiting key is protected by special dk
69267+ * read-write lock. That is, delimiting keys can only be inspected or
69268+ * modified under this lock. But dk lock is only sufficient for fast
69269+ * "pessimistic" check, because to simplify code and to decrease lock
69270+ * contention, balancing (carry) only updates delimiting keys right before
69271+ * unlocking all locked nodes on the given tree level. For example,
69272+ * coord-by-key cache scans LRU list of recently accessed znodes. For each
69273+ * node it first does fast check under dk spin lock. If key looked for is
69274+ * not between delimiting keys for this node, next node is inspected and so
69275+ * on. If key is inside of the key range, long term lock is taken on node
69276+ * and key range is rechecked.
69277+ *
69278+ * COORDINATES
69279+ *
69280+ * To find something in the tree, you supply a key, and the key is resolved
69281+ * by coord_by_key() into a coord (coordinate) that is valid as long as the
69282+ * node the coord points to remains locked. As mentioned above trees
69283+ * consist of nodes that consist of items that consist of units. A unit is
69284+ * the smallest and indivisible piece of tree as far as balancing and tree
69285+ * search are concerned. Each node, item, and unit can be addressed by
69286+ * giving its level in the tree and the key occupied by this entity. A node
69287+ * knows what the key ranges are of the items within it, and how to find its
69288+ * items and invoke their item handlers, but it does not know how to access
69289+ * individual units within its items except through the item handlers.
69290+ * coord is a structure containing a pointer to the node, the ordinal number
69291+ * of the item within this node (a sort of item offset), and the ordinal
69292+ * number of the unit within this item.
69293+ *
69294+ * TREE LOOKUP
69295+ *
69296+ * There are two types of access to the tree: lookup and modification.
69297+ *
69298+ * Lookup is a search for the key in the tree. Search can look for either
69299+ * exactly the key given to it, or for the largest key that is not greater
69300+ * than the key given to it. This distinction is determined by "bias"
69301+ * parameter of search routine (coord_by_key()). coord_by_key() either
69302+ * returns error (key is not in the tree, or some kind of external error
69303+ * occurred), or successfully resolves key into coord.
69304+ *
69305+ * This resolution is done by traversing tree top-to-bottom from root level
69306+ * to the desired level. On levels above twig level (level one above the
69307+ * leaf level) nodes consist exclusively of internal items. Internal item is
69308+ * nothing more than pointer to the tree node on the child level. On twig
69309+ * level nodes consist of internal items intermixed with extent
69310+ * items. Internal items form normal search tree structure used by traversal
69311+ * to descent through the tree.
69312+ *
69313+ * TREE LOOKUP OPTIMIZATIONS
69314+ *
69315+ * Tree lookup described above is expensive even if all nodes traversed are
69316+ * already in the memory: for each node binary search within it has to be
69317+ * performed and binary searches are CPU consuming and tend to destroy CPU
69318+ * caches.
69319+ *
69320+ * Several optimizations are used to work around this:
69321+ *
69322+ * . cbk_cache (look-aside cache for tree traversals, see search.c for
69323+ * details)
69324+ *
69325+ * . seals (see seal.[ch])
69326+ *
69327+ * . vroot (see search.c)
69328+ *
69329+ * General search-by-key is layered thusly:
69330+ *
69331+ * [check seal, if any] --ok--> done
69332+ * |
69333+ * failed
69334+ * |
69335+ * V
69336+ * [vroot defined] --no--> node = tree_root
69337+ * | |
69338+ * yes |
69339+ * | |
69340+ * V |
69341+ * node = vroot |
69342+ * | |
69343+ * | |
69344+ * | |
69345+ * V V
69346+ * [check cbk_cache for key] --ok--> done
69347+ * |
69348+ * failed
69349+ * |
69350+ * V
69351+ * [start tree traversal from node]
69352+ *
69353+ */
69354+
69355+#include "forward.h"
69356+#include "debug.h"
69357+#include "dformat.h"
69358+#include "key.h"
69359+#include "coord.h"
69360+#include "plugin/item/static_stat.h"
69361+#include "plugin/item/item.h"
69362+#include "plugin/node/node.h"
69363+#include "plugin/plugin.h"
69364+#include "txnmgr.h"
69365+#include "jnode.h"
69366+#include "znode.h"
69367+#include "block_alloc.h"
69368+#include "tree_walk.h"
69369+#include "carry.h"
69370+#include "carry_ops.h"
69371+#include "tap.h"
69372+#include "tree.h"
69373+#include "vfs_ops.h"
69374+#include "page_cache.h"
69375+#include "super.h"
69376+#include "reiser4.h"
69377+#include "inode.h"
69378+
69379+#include <linux/fs.h> /* for struct super_block */
69380+#include <linux/spinlock.h>
69381+
69382+/* Disk address (block number) never ever used for any real tree node. This is
69383+ used as block number of "uber" znode.
69384+
69385+ Invalid block addresses are 0 by tradition.
69386+
69387+*/
69388+const reiser4_block_nr UBER_TREE_ADDR = 0ull;
69389+
69390+#define CUT_TREE_MIN_ITERATIONS 64
69391+
69392+static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
69393+
69394+/* return node plugin of coord->node */
69395+node_plugin *node_plugin_by_coord(const coord_t * coord)
69396+{
69397+ assert("vs-1", coord != NULL);
69398+ assert("vs-2", coord->node != NULL);
69399+
69400+ return coord->node->nplug;
69401+}
69402+
69403+/* insert item into tree. Fields of @coord are updated so that they can be
69404+ * used by consequent insert operation. */
69405+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
69406+ * into */ ,
69407+ const reiser4_key * key /* key of new item */ ,
69408+ reiser4_item_data * data /* parameters for item
69409+ * creation */ ,
69410+ coord_t * coord /* resulting insertion coord */ ,
69411+ lock_handle * lh /* resulting lock
69412+ * handle */ ,
69413+ tree_level stop_level /** level where to insert */ ,
69414+ __u32 flags /* insertion flags */ )
69415+{
69416+ int result;
69417+
69418+ assert("nikita-358", tree != NULL);
69419+ assert("nikita-360", coord != NULL);
69420+
69421+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
69422+ FIND_EXACT, stop_level, stop_level,
69423+ flags | CBK_FOR_INSERT, NULL /*ra_info */ );
69424+ switch (result) {
69425+ default:
69426+ break;
69427+ case CBK_COORD_FOUND:
69428+ result = IBK_ALREADY_EXISTS;
69429+ break;
69430+ case CBK_COORD_NOTFOUND:
69431+ assert("nikita-2017", coord->node != NULL);
69432+ result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
69433+ break;
69434+ }
69435+ return result;
69436+}
69437+
69438+/* insert item by calling carry. Helper function called if short-cut
69439+ insertion failed */
69440+static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
69441+ lock_handle * lh, /* lock handle of insertion
69442+ * node */
69443+ reiser4_item_data * data, /* parameters of new
69444+ * item */
69445+ const reiser4_key * key, /* key of new item */
69446+ carry_opcode cop, /* carry operation to perform */
69447+ cop_insert_flag flags
69448+ /* carry flags */ )
69449+{
69450+ int result;
69451+ carry_pool *pool;
69452+ carry_level *lowest_level;
69453+ carry_insert_data *cdata;
69454+ carry_op *op;
69455+
69456+ assert("umka-314", coord != NULL);
69457+
69458+ /* allocate carry_pool and 3 carry_level-s */
69459+ pool =
69460+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69461+ sizeof(*cdata));
69462+ if (IS_ERR(pool))
69463+ return PTR_ERR(pool);
69464+ lowest_level = (carry_level *) (pool + 1);
69465+ init_carry_level(lowest_level, pool);
69466+
69467+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
69468+ if (IS_ERR(op) || (op == NULL)) {
69469+ done_carry_pool(pool);
69470+ return RETERR(op ? PTR_ERR(op) : -EIO);
69471+ }
69472+ cdata = (carry_insert_data *) (lowest_level + 3);
69473+ cdata->coord = coord;
69474+ cdata->data = data;
69475+ cdata->key = key;
69476+ op->u.insert.d = cdata;
69477+ if (flags == 0)
69478+ flags = znode_get_tree(coord->node)->carry.insert_flags;
69479+ op->u.insert.flags = flags;
69480+ op->u.insert.type = COPT_ITEM_DATA;
69481+ op->u.insert.child = NULL;
69482+ if (lh != NULL) {
69483+ assert("nikita-3245", lh->node == coord->node);
69484+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69485+ lowest_level->tracked = lh;
69486+ }
69487+
69488+ result = reiser4_carry(lowest_level, NULL);
69489+ done_carry_pool(pool);
69490+
69491+ return result;
69492+}
69493+
69494+/* form carry queue to perform paste of @data with @key at @coord, and launch
69495+ its execution by calling carry().
69496+
69497+ Instruct carry to update @lh it after balancing insertion coord moves into
69498+ different block.
69499+
69500+*/
69501+static int paste_with_carry(coord_t * coord, /* coord of paste */
69502+ lock_handle * lh, /* lock handle of node
69503+ * where item is
69504+ * pasted */
69505+ reiser4_item_data * data, /* parameters of new
69506+ * item */
69507+ const reiser4_key * key, /* key of new item */
69508+ unsigned flags /* paste flags */ )
69509+{
69510+ int result;
69511+ carry_pool *pool;
69512+ carry_level *lowest_level;
69513+ carry_insert_data *cdata;
69514+ carry_op *op;
69515+
69516+ assert("umka-315", coord != NULL);
69517+ assert("umka-316", key != NULL);
69518+
69519+ pool =
69520+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69521+ sizeof(*cdata));
69522+ if (IS_ERR(pool))
69523+ return PTR_ERR(pool);
69524+ lowest_level = (carry_level *) (pool + 1);
69525+ init_carry_level(lowest_level, pool);
69526+
69527+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
69528+ if (IS_ERR(op) || (op == NULL)) {
69529+ done_carry_pool(pool);
69530+ return RETERR(op ? PTR_ERR(op) : -EIO);
69531+ }
69532+ cdata = (carry_insert_data *) (lowest_level + 3);
69533+ cdata->coord = coord;
69534+ cdata->data = data;
69535+ cdata->key = key;
69536+ op->u.paste.d = cdata;
69537+ if (flags == 0)
69538+ flags = znode_get_tree(coord->node)->carry.paste_flags;
69539+ op->u.paste.flags = flags;
69540+ op->u.paste.type = COPT_ITEM_DATA;
69541+ if (lh != NULL) {
69542+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69543+ lowest_level->tracked = lh;
69544+ }
69545+
69546+ result = reiser4_carry(lowest_level, NULL);
69547+ done_carry_pool(pool);
69548+
69549+ return result;
69550+}
69551+
69552+/* insert item at the given coord.
69553+
69554+ First try to skip carry by directly calling ->create_item() method of node
69555+ plugin. If this is impossible (there is not enough free space in the node,
69556+ or leftmost item in the node is created), call insert_with_carry_by_coord()
69557+ that will do full carry().
69558+
69559+*/
69560+insert_result insert_by_coord(coord_t * coord /* coord where to
69561+ * insert. coord->node has
69562+ * to be write locked by
69563+ * caller */ ,
69564+ reiser4_item_data * data /* data to be
69565+ * inserted */ ,
69566+ const reiser4_key * key /* key of new item */ ,
69567+ lock_handle * lh /* lock handle of write
69568+ * lock on node */ ,
69569+ __u32 flags /* insertion flags */ )
69570+{
69571+ unsigned item_size;
69572+ int result;
69573+ znode *node;
69574+
69575+ assert("vs-247", coord != NULL);
69576+ assert("vs-248", data != NULL);
69577+ assert("vs-249", data->length >= 0);
69578+ assert("nikita-1191", znode_is_write_locked(coord->node));
69579+
69580+ node = coord->node;
69581+ coord_clear_iplug(coord);
69582+ result = zload(node);
69583+ if (result != 0)
69584+ return result;
69585+
69586+ item_size = space_needed(node, NULL, data, 1);
69587+ if (item_size > znode_free_space(node) &&
69588+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69589+ && (flags & COPI_DONT_ALLOCATE)) {
69590+ /* we are forced to use free space of coord->node and new item
69591+ does not fit into it.
69592+
69593+ Currently we get here only when we allocate and copy units
69594+ of extent item from a node to its left neighbor during
69595+ "squalloc"-ing. If @node (this is left neighbor) does not
69596+ have enough free space - we do not want to attempt any
69597+ shifting and allocations because we are in squeezing and
69598+ everything to the left of @node is tightly packed.
69599+ */
69600+ result = -E_NODE_FULL;
69601+ } else if ((item_size <= znode_free_space(node)) &&
69602+ !coord_is_before_leftmost(coord) &&
69603+ (node_plugin_by_node(node)->fast_insert != NULL)
69604+ && node_plugin_by_node(node)->fast_insert(coord)) {
69605+ /* shortcut insertion without carry() overhead.
69606+
69607+ Only possible if:
69608+
69609+ - there is enough free space
69610+
69611+ - insertion is not into the leftmost position in a node
69612+ (otherwise it would require updating of delimiting key in a
69613+ parent)
69614+
69615+ - node plugin agrees with this
69616+
69617+ */
69618+ result =
69619+ node_plugin_by_node(node)->create_item(coord, key, data,
69620+ NULL);
69621+ znode_make_dirty(node);
69622+ } else {
69623+ /* otherwise do full-fledged carry(). */
69624+ result =
69625+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
69626+ flags);
69627+ }
69628+ zrelse(node);
69629+ return result;
69630+}
69631+
69632+/* @coord is set to leaf level and @data is to be inserted to twig level */
69633+insert_result
69634+insert_extent_by_coord(coord_t *
69635+ coord
69636+ /* coord where to insert. coord->node * has to be write * locked by caller */
69637+ ,
69638+ reiser4_item_data * data /* data to be inserted */ ,
69639+ const reiser4_key * key /* key of new item */ ,
69640+ lock_handle *
69641+ lh /* lock handle of write lock on * node */ )
69642+{
69643+ assert("vs-405", coord != NULL);
69644+ assert("vs-406", data != NULL);
69645+ assert("vs-407", data->length > 0);
69646+ assert("vs-408", znode_is_write_locked(coord->node));
69647+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
69648+
69649+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
69650+ 0 /*flags */ );
69651+}
69652+
69653+/* Insert into the item at the given coord.
69654+
69655+ First try to skip carry by directly calling ->paste() method of item
69656+ plugin. If this is impossible (there is not enough free space in the node,
69657+ or we are pasting into leftmost position in the node), call
69658+ paste_with_carry() that will do full carry().
69659+
69660+*/
69661+/* paste_into_item */
69662+int insert_into_item(coord_t * coord /* coord of pasting */ ,
69663+ lock_handle * lh /* lock handle on node involved */ ,
69664+ const reiser4_key * key /* key of unit being pasted */ ,
69665+ reiser4_item_data * data /* parameters for new unit */ ,
69666+ unsigned flags /* insert/paste flags */ )
69667+{
69668+ int result;
69669+ int size_change;
69670+ node_plugin *nplug;
69671+ item_plugin *iplug;
69672+
69673+ assert("umka-317", coord != NULL);
69674+ assert("umka-318", key != NULL);
69675+
69676+ iplug = item_plugin_by_coord(coord);
69677+ nplug = node_plugin_by_coord(coord);
69678+
69679+ assert("nikita-1480", iplug == data->iplug);
69680+
69681+ size_change = space_needed(coord->node, coord, data, 0);
69682+ if (size_change > (int)znode_free_space(coord->node) &&
69683+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69684+ && (flags & COPI_DONT_ALLOCATE)) {
69685+ /* we are forced to use free space of coord->node and new data
69686+ does not fit into it. */
69687+ return -E_NODE_FULL;
69688+ }
69689+
69690+ /* shortcut paste without carry() overhead.
69691+
69692+ Only possible if:
69693+
69694+ - there is enough free space
69695+
69696+ - paste is not into the leftmost unit in a node (otherwise
69697+ it would require updating of delimiting key in a parent)
69698+
69699+ - node plugin agrees with this
69700+
69701+ - item plugin agrees with us
69702+ */
69703+ if (size_change <= (int)znode_free_space(coord->node) &&
69704+ (coord->item_pos != 0 ||
69705+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
69706+ coord->unit_pos != 0 && nplug->fast_paste != NULL &&
69707+ nplug->fast_paste(coord) &&
69708+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
69709+ if (size_change > 0)
69710+ nplug->change_item_size(coord, size_change);
69711+ /* NOTE-NIKITA: huh? where @key is used? */
69712+ result = iplug->b.paste(coord, data, NULL);
69713+ if (size_change < 0)
69714+ nplug->change_item_size(coord, size_change);
69715+ znode_make_dirty(coord->node);
69716+ } else
69717+ /* otherwise do full-fledged carry(). */
69718+ result = paste_with_carry(coord, lh, data, key, flags);
69719+ return result;
69720+}
69721+
69722+/* this either appends or truncates item @coord */
69723+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
69724+ reiser4_item_data * data /* parameters of resize */ ,
69725+ reiser4_key * key /* key of new unit */ ,
69726+ lock_handle * lh /* lock handle of node
69727+ * being modified */ ,
69728+ cop_insert_flag flags /* carry flags */ )
69729+{
69730+ int result;
69731+ znode *node;
69732+
69733+ assert("nikita-362", coord != NULL);
69734+ assert("nikita-363", data != NULL);
69735+ assert("vs-245", data->length != 0);
69736+
69737+ node = coord->node;
69738+ coord_clear_iplug(coord);
69739+ result = zload(node);
69740+ if (result != 0)
69741+ return result;
69742+
69743+ if (data->length < 0)
69744+ result = node_plugin_by_coord(coord)->shrink_item(coord,
69745+ -data->length);
69746+ else
69747+ result = insert_into_item(coord, lh, key, data, flags);
69748+
69749+ zrelse(node);
69750+ return result;
69751+}
69752+
69753+/* insert flow @f */
69754+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
69755+{
69756+ int result;
69757+ carry_pool *pool;
69758+ carry_level *lowest_level;
69759+ reiser4_item_data *data;
69760+ carry_op *op;
69761+
69762+ pool =
69763+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69764+ sizeof(*data));
69765+ if (IS_ERR(pool))
69766+ return PTR_ERR(pool);
69767+ lowest_level = (carry_level *) (pool + 1);
69768+ init_carry_level(lowest_level, pool);
69769+
69770+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
69771+ 0 /* operate directly on coord -> node */ );
69772+ if (IS_ERR(op) || (op == NULL)) {
69773+ done_carry_pool(pool);
69774+ return RETERR(op ? PTR_ERR(op) : -EIO);
69775+ }
69776+
69777+ /* these are permanent during insert_flow */
69778+ data = (reiser4_item_data *) (lowest_level + 3);
69779+ data->user = 1;
69780+ data->iplug = item_plugin_by_id(FORMATTING_ID);
69781+ data->arg = NULL;
69782+ /* data.length and data.data will be set before calling paste or
69783+ insert */
69784+ data->length = 0;
69785+ data->data = NULL;
69786+
69787+ op->u.insert_flow.flags = 0;
69788+ op->u.insert_flow.insert_point = coord;
69789+ op->u.insert_flow.flow = f;
69790+ op->u.insert_flow.data = data;
69791+ op->u.insert_flow.new_nodes = 0;
69792+
69793+ lowest_level->track_type = CARRY_TRACK_CHANGE;
69794+ lowest_level->tracked = lh;
69795+
69796+ result = reiser4_carry(lowest_level, NULL);
69797+ done_carry_pool(pool);
69798+
69799+ return result;
69800+}
69801+
69802+/* Given a coord in parent node, obtain a znode for the corresponding child */
69803+znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69804+ * child */ ,
69805+ znode * parent /* parent of child */ ,
69806+ int incore_p /* if !0 only return child if already in
69807+ * memory */ ,
69808+ int setup_dkeys_p /* if !0 update delimiting keys of
69809+ * child */ )
69810+{
69811+ znode *child;
69812+
69813+ assert("nikita-1374", parent_coord != NULL);
69814+ assert("nikita-1482", parent != NULL);
69815+#if REISER4_DEBUG
69816+ if (setup_dkeys_p)
69817+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69818+#endif
69819+ assert("nikita-2947", znode_is_any_locked(parent));
69820+
69821+ if (znode_get_level(parent) <= LEAF_LEVEL) {
69822+ /* trying to get child of leaf node */
69823+ warning("nikita-1217", "Child of maize?");
69824+ return ERR_PTR(RETERR(-EIO));
69825+ }
69826+ if (item_is_internal(parent_coord)) {
69827+ reiser4_block_nr addr;
69828+ item_plugin *iplug;
69829+ reiser4_tree *tree;
69830+
69831+ iplug = item_plugin_by_coord(parent_coord);
69832+ assert("vs-512", iplug->s.internal.down_link);
69833+ iplug->s.internal.down_link(parent_coord, NULL, &addr);
69834+
69835+ tree = znode_get_tree(parent);
69836+ if (incore_p)
69837+ child = zlook(tree, &addr);
69838+ else
69839+ child =
69840+ zget(tree, &addr, parent,
69841+ znode_get_level(parent) - 1,
69842+ reiser4_ctx_gfp_mask_get());
69843+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69844+ set_child_delimiting_keys(parent, parent_coord, child);
69845+ } else {
69846+ warning("nikita-1483", "Internal item expected");
69847+ child = ERR_PTR(RETERR(-EIO));
69848+ }
69849+ return child;
69850+}
69851+
69852+/* remove znode from transaction */
69853+static void uncapture_znode(znode * node)
69854+{
69855+ struct page *page;
69856+
69857+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69858+
69859+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
69860+ int ret;
69861+
69862+ /* An already allocated block goes right to the atom's delete set. */
69863+ ret =
69864+ reiser4_dealloc_block(znode_get_block(node), 0,
69865+ BA_DEFER | BA_FORMATTED);
69866+ if (ret)
69867+ warning("zam-942",
69868+ "can\'t add a block (%llu) number to atom's delete set\n",
69869+ (unsigned long long)(*znode_get_block(node)));
69870+
69871+ spin_lock_znode(node);
69872+ /* Here we return flush reserved block which was reserved at the
69873+ * moment when this allocated node was marked dirty and still
69874+ * not used by flush in node relocation procedure. */
69875+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69876+ txn_atom *atom;
69877+
69878+ atom = jnode_get_atom(ZJNODE(node));
69879+ assert("zam-939", atom != NULL);
69880+ spin_unlock_znode(node);
69881+ flush_reserved2grabbed(atom, (__u64) 1);
69882+ spin_unlock_atom(atom);
69883+ } else
69884+ spin_unlock_znode(node);
69885+ } else {
69886+ /* znode has assigned block which is counted as "fake
69887+ allocated". Return it back to "free blocks") */
69888+ fake_allocated2free((__u64) 1, BA_FORMATTED);
69889+ }
69890+
69891+ /*
69892+ * uncapture page from transaction. There is a possibility of a race
69893+ * with ->releasepage(): reiser4_releasepage() detaches page from this
69894+ * jnode and we have nothing to uncapture. To avoid this, get
69895+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
69896+ * will deal with released page itself.
69897+ */
69898+ spin_lock_znode(node);
69899+ page = znode_page(node);
69900+ if (likely(page != NULL)) {
69901+ /*
69902+ * reiser4_uncapture_page() can only be called when we are sure
69903+ * that znode is pinned in memory, which we are, because
69904+ * forget_znode() is only called from longterm_unlock_znode().
69905+ */
69906+ page_cache_get(page);
69907+ spin_unlock_znode(node);
69908+ lock_page(page);
69909+ reiser4_uncapture_page(page);
69910+ unlock_page(page);
69911+ page_cache_release(page);
69912+ } else {
69913+ txn_atom *atom;
69914+
69915+ /* handle "flush queued" znodes */
69916+ while (1) {
69917+ atom = jnode_get_atom(ZJNODE(node));
69918+ assert("zam-943", atom != NULL);
69919+
69920+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69921+ || !atom->nr_running_queues)
69922+ break;
69923+
69924+ spin_unlock_znode(node);
69925+ reiser4_atom_wait_event(atom);
69926+ spin_lock_znode(node);
69927+ }
69928+
69929+ reiser4_uncapture_block(ZJNODE(node));
69930+ spin_unlock_atom(atom);
69931+ zput(node);
69932+ }
69933+}
69934+
69935+/* This is called from longterm_unlock_znode() when last lock is released from
69936+ the node that has been removed from the tree. At this point node is removed
69937+ from sibling list and its lock is invalidated. */
69938+void forget_znode(lock_handle * handle)
69939+{
69940+ znode *node;
69941+ reiser4_tree *tree;
69942+
69943+ assert("umka-319", handle != NULL);
69944+
69945+ node = handle->node;
69946+ tree = znode_get_tree(node);
69947+
69948+ assert("vs-164", znode_is_write_locked(node));
69949+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69950+ assert_rw_locked(&(node->lock.guard));
69951+
69952+ /* We assume that this node was detached from its parent before
69953+ * unlocking, it gives no way to reach this node from parent through a
69954+ * down link. The node should have no children and, thereby, can't be
69955+ * reached from them by their parent pointers. The only way to obtain a
69956+ * reference to the node is to use sibling pointers from its left and
69957+ * right neighbors. In the next several lines we remove the node from
69958+ * the sibling list. */
69959+
69960+ write_lock_tree(tree);
69961+ sibling_list_remove(node);
69962+ znode_remove(node, tree);
69963+ write_unlock_tree(tree);
69964+
69965+ /* Here we set JNODE_DYING and cancel all pending lock requests. It
69966+ * forces all lock requestor threads to repeat iterations of getting
69967+ * lock on a child, neighbor or parent node. But, those threads can't
69968+ * come to this node again, because this node is no longer a child,
69969+ * neighbor or parent of any other node. This order of znode
69970+ * invalidation does not allow other threads to waste cpu time is a busy
69971+ * loop, trying to lock dying object. The exception is in the flush
69972+ * code when we take node directly from atom's capture list.*/
69973+ reiser4_invalidate_lock(handle);
69974+ uncapture_znode(node);
69975+}
69976+
69977+/* Check that internal item at @pointer really contains pointer to @child. */
69978+int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69979+ * @child */ ,
69980+ const znode * child /* child znode */ )
69981+{
69982+ assert("nikita-1016", pointer != NULL);
69983+ assert("nikita-1017", child != NULL);
69984+ assert("nikita-1018", pointer->node != NULL);
69985+
69986+ assert("nikita-1325", znode_is_any_locked(pointer->node));
69987+
69988+ assert("nikita-2985",
69989+ znode_get_level(pointer->node) == znode_get_level(child) + 1);
69990+
69991+ coord_clear_iplug((coord_t *) pointer);
69992+
69993+ if (coord_is_existing_unit(pointer)) {
69994+ item_plugin *iplug;
69995+ reiser4_block_nr addr;
69996+
69997+ if (item_is_internal(pointer)) {
69998+ iplug = item_plugin_by_coord(pointer);
69999+ assert("vs-513", iplug->s.internal.down_link);
70000+ iplug->s.internal.down_link(pointer, NULL, &addr);
70001+ /* check that cached value is correct */
70002+ if (disk_addr_eq(&addr, znode_get_block(child))) {
70003+ return NS_FOUND;
70004+ }
70005+ }
70006+ }
70007+ /* warning ("jmacd-1002", "tree pointer incorrect"); */
70008+ return NS_NOT_FOUND;
70009+}
70010+
70011+/* find coord of pointer to new @child in @parent.
70012+
70013+ Find the &coord_t in the @parent where pointer to a given @child will
70014+ be in.
70015+
70016+*/
70017+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
70018+ znode *
70019+ child UNUSED_ARG /* child znode, passed locked */ ,
70020+ znode * left /* left brother of new node */ ,
70021+ coord_t * result /* where result is stored in */ )
70022+{
70023+ int ret;
70024+
70025+ assert("nikita-1486", parent != NULL);
70026+ assert("nikita-1487", child != NULL);
70027+ assert("nikita-1488", result != NULL);
70028+
70029+ ret = find_child_ptr(parent, left, result);
70030+ if (ret != NS_FOUND) {
70031+ warning("nikita-1489", "Cannot find brother position: %i", ret);
70032+ return RETERR(-EIO);
70033+ } else {
70034+ result->between = AFTER_UNIT;
70035+ return RETERR(NS_NOT_FOUND);
70036+ }
70037+}
70038+
70039+/* find coord of pointer to @child in @parent.
70040+
70041+ Find the &coord_t in the @parent where pointer to a given @child is in.
70042+
70043+*/
70044+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
70045+ znode * child /* child znode, passed locked */ ,
70046+ coord_t * result /* where result is stored in */ )
70047+{
70048+ int lookup_res;
70049+ node_plugin *nplug;
70050+ /* left delimiting key of a child */
70051+ reiser4_key ld;
70052+ reiser4_tree *tree;
70053+
70054+ assert("nikita-934", parent != NULL);
70055+ assert("nikita-935", child != NULL);
70056+ assert("nikita-936", result != NULL);
70057+ assert("zam-356", znode_is_loaded(parent));
70058+
70059+ coord_init_zero(result);
70060+ result->node = parent;
70061+
70062+ nplug = parent->nplug;
70063+ assert("nikita-939", nplug != NULL);
70064+
70065+ tree = znode_get_tree(parent);
70066+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
70067+ * not aliased to ->in_parent of some znode. Otherwise,
70068+ * parent_coord_to_coord() below would modify data protected by tree
70069+ * lock. */
70070+ read_lock_tree(tree);
70071+ /* fast path. Try to use cached value. Lock tree to keep
70072+ node->pos_in_parent and pos->*_blocknr consistent. */
70073+ if (child->in_parent.item_pos + 1 != 0) {
70074+ parent_coord_to_coord(&child->in_parent, result);
70075+ if (check_tree_pointer(result, child) == NS_FOUND) {
70076+ read_unlock_tree(tree);
70077+ return NS_FOUND;
70078+ }
70079+
70080+ child->in_parent.item_pos = (unsigned short)~0;
70081+ }
70082+ read_unlock_tree(tree);
70083+
70084+ /* is above failed, find some key from @child. We are looking for the
70085+ least key in a child. */
70086+ read_lock_dk(tree);
70087+ ld = *znode_get_ld_key(child);
70088+ read_unlock_dk(tree);
70089+ /*
70090+ * now, lookup parent with key just found. Note, that left delimiting
70091+ * key doesn't identify node uniquely, because (in extremely rare
70092+ * case) two nodes can have equal left delimiting keys, if one of them
70093+ * is completely filled with directory entries that all happened to be
70094+ * hash collision. But, we check block number in check_tree_pointer()
70095+ * and, so, are safe.
70096+ */
70097+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
70098+ /* update cached pos_in_node */
70099+ if (lookup_res == NS_FOUND) {
70100+ write_lock_tree(tree);
70101+ coord_to_parent_coord(result, &child->in_parent);
70102+ write_unlock_tree(tree);
70103+ lookup_res = check_tree_pointer(result, child);
70104+ }
70105+ if (lookup_res == NS_NOT_FOUND)
70106+ lookup_res = find_child_by_addr(parent, child, result);
70107+ return lookup_res;
70108+}
70109+
70110+/* find coord of pointer to @child in @parent by scanning
70111+
70112+ Find the &coord_t in the @parent where pointer to a given @child
70113+ is in by scanning all internal items in @parent and comparing block
70114+ numbers in them with that of @child.
70115+
70116+*/
70117+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
70118+ znode * child /* child znode, passed locked */ ,
70119+ coord_t * result /* where result is stored in */ )
70120+{
70121+ int ret;
70122+
70123+ assert("nikita-1320", parent != NULL);
70124+ assert("nikita-1321", child != NULL);
70125+ assert("nikita-1322", result != NULL);
70126+
70127+ ret = NS_NOT_FOUND;
70128+
70129+ for_all_units(result, parent) {
70130+ if (check_tree_pointer(result, child) == NS_FOUND) {
70131+ write_lock_tree(znode_get_tree(parent));
70132+ coord_to_parent_coord(result, &child->in_parent);
70133+ write_unlock_tree(znode_get_tree(parent));
70134+ ret = NS_FOUND;
70135+ break;
70136+ }
70137+ }
70138+ return ret;
70139+}
70140+
70141+/* true, if @addr is "unallocated block number", which is just address, with
70142+ highest bit set. */
70143+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
70144+ * check */ )
70145+{
70146+ assert("nikita-1766", addr != NULL);
70147+ cassert(sizeof(reiser4_block_nr) == 8);
70148+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
70149+ REISER4_UNALLOCATED_STATUS_VALUE;
70150+}
70151+
70152+/* returns true if removing bytes of given range of key [from_key, to_key]
70153+ causes removing of whole item @from */
70154+static int
70155+item_removed_completely(coord_t * from, const reiser4_key * from_key,
70156+ const reiser4_key * to_key)
70157+{
70158+ item_plugin *iplug;
70159+ reiser4_key key_in_item;
70160+
70161+ assert("umka-325", from != NULL);
70162+ assert("", item_is_extent(from));
70163+
70164+ /* check first key just for case */
70165+ item_key_by_coord(from, &key_in_item);
70166+ if (keygt(from_key, &key_in_item))
70167+ return 0;
70168+
70169+ /* check last key */
70170+ iplug = item_plugin_by_coord(from);
70171+ assert("vs-611", iplug && iplug->s.file.append_key);
70172+
70173+ iplug->s.file.append_key(from, &key_in_item);
70174+ set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
70175+
70176+ if (keylt(to_key, &key_in_item))
70177+ /* last byte is not removed */
70178+ return 0;
70179+ return 1;
70180+}
70181+
70182+/* helper function for prepare_twig_kill(): @left and @right are formatted
70183+ * neighbors of extent item being completely removed. Load and lock neighbors
70184+ * and store lock handles into @cdata for later use by kill_hook_extent() */
70185+static int
70186+prepare_children(znode * left, znode * right, carry_kill_data * kdata)
70187+{
70188+ int result;
70189+ int left_loaded;
70190+ int right_loaded;
70191+
70192+ result = 0;
70193+ left_loaded = right_loaded = 0;
70194+
70195+ if (left != NULL) {
70196+ result = zload(left);
70197+ if (result == 0) {
70198+ left_loaded = 1;
70199+ result = longterm_lock_znode(kdata->left, left,
70200+ ZNODE_READ_LOCK,
70201+ ZNODE_LOCK_LOPRI);
70202+ }
70203+ }
70204+ if (result == 0 && right != NULL) {
70205+ result = zload(right);
70206+ if (result == 0) {
70207+ right_loaded = 1;
70208+ result = longterm_lock_znode(kdata->right, right,
70209+ ZNODE_READ_LOCK,
70210+ ZNODE_LOCK_HIPRI |
70211+ ZNODE_LOCK_NONBLOCK);
70212+ }
70213+ }
70214+ if (result != 0) {
70215+ done_lh(kdata->left);
70216+ done_lh(kdata->right);
70217+ if (left_loaded != 0)
70218+ zrelse(left);
70219+ if (right_loaded != 0)
70220+ zrelse(right);
70221+ }
70222+ return result;
70223+}
70224+
70225+static void done_children(carry_kill_data * kdata)
70226+{
70227+ if (kdata->left != NULL && kdata->left->node != NULL) {
70228+ zrelse(kdata->left->node);
70229+ done_lh(kdata->left);
70230+ }
70231+ if (kdata->right != NULL && kdata->right->node != NULL) {
70232+ zrelse(kdata->right->node);
70233+ done_lh(kdata->right);
70234+ }
70235+}
70236+
70237+/* part of cut_node. It is called when cut_node is called to remove or cut part
70238+ of extent item. When head of that item is removed - we have to update right
70239+ delimiting of left neighbor of extent. When item is removed completely - we
70240+ have to set sibling link between left and right neighbor of removed
70241+ extent. This may return -E_DEADLOCK because of trying to get left neighbor
70242+ locked. So, caller should repeat an attempt
70243+*/
70244+/* Audited by: umka (2002.06.16) */
70245+static int
70246+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
70247+{
70248+ int result;
70249+ reiser4_key key;
70250+ lock_handle left_lh;
70251+ lock_handle right_lh;
70252+ coord_t left_coord;
70253+ coord_t *from;
70254+ znode *left_child;
70255+ znode *right_child;
70256+ reiser4_tree *tree;
70257+ int left_zloaded_here, right_zloaded_here;
70258+
70259+ from = kdata->params.from;
70260+ assert("umka-326", from != NULL);
70261+ assert("umka-327", kdata->params.to != NULL);
70262+
70263+ /* for one extent item only yet */
70264+ assert("vs-591", item_is_extent(from));
70265+ assert("vs-592", from->item_pos == kdata->params.to->item_pos);
70266+
70267+ if ((kdata->params.from_key
70268+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
70269+ || from->unit_pos != 0) {
70270+ /* head of item @from is not removed, there is nothing to
70271+ worry about */
70272+ return 0;
70273+ }
70274+
70275+ result = 0;
70276+ left_zloaded_here = 0;
70277+ right_zloaded_here = 0;
70278+
70279+ left_child = right_child = NULL;
70280+
70281+ coord_dup(&left_coord, from);
70282+ init_lh(&left_lh);
70283+ init_lh(&right_lh);
70284+ if (coord_prev_unit(&left_coord)) {
70285+ /* @from is leftmost item in its node */
70286+ if (!locked_left_neighbor) {
70287+ result =
70288+ reiser4_get_left_neighbor(&left_lh, from->node,
70289+ ZNODE_READ_LOCK,
70290+ GN_CAN_USE_UPPER_LEVELS);
70291+ switch (result) {
70292+ case 0:
70293+ break;
70294+ case -E_NO_NEIGHBOR:
70295+ /* there is no formatted node to the left of
70296+ from->node */
70297+ warning("vs-605",
70298+ "extent item has smallest key in "
70299+ "the tree and it is about to be removed");
70300+ return 0;
70301+ case -E_DEADLOCK:
70302+ /* need to restart */
70303+ default:
70304+ return result;
70305+ }
70306+
70307+ /* we have acquired left neighbor of from->node */
70308+ result = zload(left_lh.node);
70309+ if (result)
70310+ goto done;
70311+
70312+ locked_left_neighbor = left_lh.node;
70313+ } else {
70314+ /* squalloc_right_twig_cut should have supplied locked
70315+ * left neighbor */
70316+ assert("vs-834",
70317+ znode_is_write_locked(locked_left_neighbor));
70318+ result = zload(locked_left_neighbor);
70319+ if (result)
70320+ return result;
70321+ }
70322+
70323+ left_zloaded_here = 1;
70324+ coord_init_last_unit(&left_coord, locked_left_neighbor);
70325+ }
70326+
70327+ if (!item_is_internal(&left_coord)) {
70328+ /* what else but extent can be on twig level */
70329+ assert("vs-606", item_is_extent(&left_coord));
70330+
70331+ /* there is no left formatted child */
70332+ if (left_zloaded_here)
70333+ zrelse(locked_left_neighbor);
70334+ done_lh(&left_lh);
70335+ return 0;
70336+ }
70337+
70338+ tree = znode_get_tree(left_coord.node);
70339+ left_child = child_znode(&left_coord, left_coord.node, 1, 0);
70340+
70341+ if (IS_ERR(left_child)) {
70342+ result = PTR_ERR(left_child);
70343+ goto done;
70344+ }
70345+
70346+ /* left child is acquired, calculate new right delimiting key for it
70347+ and get right child if it is necessary */
70348+ if (item_removed_completely
70349+ (from, kdata->params.from_key, kdata->params.to_key)) {
70350+ /* try to get right child of removed item */
70351+ coord_t right_coord;
70352+
70353+ assert("vs-607",
70354+ kdata->params.to->unit_pos ==
70355+ coord_last_unit_pos(kdata->params.to));
70356+ coord_dup(&right_coord, kdata->params.to);
70357+ if (coord_next_unit(&right_coord)) {
70358+ /* @to is rightmost unit in the node */
70359+ result =
70360+ reiser4_get_right_neighbor(&right_lh, from->node,
70361+ ZNODE_READ_LOCK,
70362+ GN_CAN_USE_UPPER_LEVELS);
70363+ switch (result) {
70364+ case 0:
70365+ result = zload(right_lh.node);
70366+ if (result)
70367+ goto done;
70368+
70369+ right_zloaded_here = 1;
70370+ coord_init_first_unit(&right_coord,
70371+ right_lh.node);
70372+ item_key_by_coord(&right_coord, &key);
70373+ break;
70374+
70375+ case -E_NO_NEIGHBOR:
70376+ /* there is no formatted node to the right of
70377+ from->node */
70378+ read_lock_dk(tree);
70379+ key = *znode_get_rd_key(from->node);
70380+ read_unlock_dk(tree);
70381+ right_coord.node = NULL;
70382+ result = 0;
70383+ break;
70384+ default:
70385+ /* real error */
70386+ goto done;
70387+ }
70388+ } else {
70389+ /* there is an item to the right of @from - take its key */
70390+ item_key_by_coord(&right_coord, &key);
70391+ }
70392+
70393+ /* try to get right child of @from */
70394+ if (right_coord.node && /* there is right neighbor of @from */
70395+ item_is_internal(&right_coord)) { /* it is internal item */
70396+ right_child = child_znode(&right_coord,
70397+ right_coord.node, 1, 0);
70398+
70399+ if (IS_ERR(right_child)) {
70400+ result = PTR_ERR(right_child);
70401+ goto done;
70402+ }
70403+
70404+ }
70405+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
70406+ update of right delimiting key of left_child */
70407+ result = prepare_children(left_child, right_child, kdata);
70408+ } else {
70409+ /* head of item @to is removed. left_child has to get right delimting key update. Prepare it for that */
70410+ result = prepare_children(left_child, NULL, kdata);
70411+ }
70412+
70413+ done:
70414+ if (right_child)
70415+ zput(right_child);
70416+ if (right_zloaded_here)
70417+ zrelse(right_lh.node);
70418+ done_lh(&right_lh);
70419+
70420+ if (left_child)
70421+ zput(left_child);
70422+ if (left_zloaded_here)
70423+ zrelse(locked_left_neighbor);
70424+ done_lh(&left_lh);
70425+ return result;
70426+}
70427+
70428+/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
70429+ are to be cut completely */
70430+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
70431+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
70432+ const reiser4_key * to_key, /* last key to be removed */
70433+ reiser4_key *
70434+ smallest_removed /* smallest key actually removed */ )
70435+{
70436+ int result;
70437+ carry_pool *pool;
70438+ carry_level *lowest_level;
70439+ carry_cut_data *cut_data;
70440+ carry_op *op;
70441+
70442+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
70443+
70444+ pool =
70445+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70446+ sizeof(*cut_data));
70447+ if (IS_ERR(pool))
70448+ return PTR_ERR(pool);
70449+ lowest_level = (carry_level *) (pool + 1);
70450+ init_carry_level(lowest_level, pool);
70451+
70452+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
70453+ assert("vs-1509", op != 0);
70454+ if (IS_ERR(op)) {
70455+ done_carry_pool(pool);
70456+ return PTR_ERR(op);
70457+ }
70458+
70459+ cut_data = (carry_cut_data *) (lowest_level + 3);
70460+ cut_data->params.from = from;
70461+ cut_data->params.to = to;
70462+ cut_data->params.from_key = from_key;
70463+ cut_data->params.to_key = to_key;
70464+ cut_data->params.smallest_removed = smallest_removed;
70465+
70466+ op->u.cut_or_kill.is_cut = 1;
70467+ op->u.cut_or_kill.u.cut = cut_data;
70468+
70469+ result = reiser4_carry(lowest_level, NULL);
70470+ done_carry_pool(pool);
70471+
70472+ return result;
70473+}
70474+
70475+/* cut part of the node
70476+
70477+ Cut part or whole content of node.
70478+
70479+ cut data between @from and @to of @from->node and call carry() to make
70480+ corresponding changes in the tree. @from->node may become empty. If so -
70481+ pointer to it will be removed. Neighboring nodes are not changed. Smallest
70482+ removed key is stored in @smallest_removed
70483+
70484+*/
70485+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
70486+ coord_t * to, /* coord of the last unit/item that will be eliminated */
70487+ const reiser4_key * from_key, /* first key to be removed */
70488+ const reiser4_key * to_key, /* last key to be removed */
70489+ reiser4_key * smallest_removed, /* smallest key actually removed */
70490+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
70491+ * locked (in squalloc_right_twig_cut, namely) */
70492+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
70493+ invalidate pages together with item pointing to them */
70494+ int truncate)
70495+{ /* this call is made for file truncate) */
70496+ int result;
70497+ carry_pool *pool;
70498+ carry_level *lowest_level;
70499+ carry_kill_data *kdata;
70500+ lock_handle *left_child;
70501+ lock_handle *right_child;
70502+ carry_op *op;
70503+
70504+ assert("umka-328", from != NULL);
70505+ assert("vs-316", !node_is_empty(from->node));
70506+ assert("nikita-1812", coord_is_existing_unit(from)
70507+ && coord_is_existing_unit(to));
70508+
70509+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
70510+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70511+ sizeof(carry_kill_data) +
70512+ 2 * sizeof(lock_handle) +
70513+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
70514+ if (IS_ERR(pool))
70515+ return PTR_ERR(pool);
70516+
70517+ lowest_level = (carry_level *) (pool + 1);
70518+ init_carry_level(lowest_level, pool);
70519+
70520+ kdata = (carry_kill_data *) (lowest_level + 3);
70521+ left_child = (lock_handle *) (kdata + 1);
70522+ right_child = left_child + 1;
70523+
70524+ init_lh(left_child);
70525+ init_lh(right_child);
70526+
70527+ kdata->params.from = from;
70528+ kdata->params.to = to;
70529+ kdata->params.from_key = from_key;
70530+ kdata->params.to_key = to_key;
70531+ kdata->params.smallest_removed = smallest_removed;
70532+ kdata->params.truncate = truncate;
70533+ kdata->flags = 0;
70534+ kdata->inode = inode;
70535+ kdata->left = left_child;
70536+ kdata->right = right_child;
70537+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
70538+ kdata->buf = (char *)(right_child + 1);
70539+
70540+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
70541+ /* left child of extent item may have to get updated right
70542+ delimiting key and to get linked with right child of extent
70543+ @from if it will be removed completely */
70544+ result = prepare_twig_kill(kdata, locked_left_neighbor);
70545+ if (result) {
70546+ done_children(kdata);
70547+ done_carry_pool(pool);
70548+ return result;
70549+ }
70550+ }
70551+
70552+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
70553+ if (IS_ERR(op) || (op == NULL)) {
70554+ done_children(kdata);
70555+ done_carry_pool(pool);
70556+ return RETERR(op ? PTR_ERR(op) : -EIO);
70557+ }
70558+
70559+ op->u.cut_or_kill.is_cut = 0;
70560+ op->u.cut_or_kill.u.kill = kdata;
70561+
70562+ result = reiser4_carry(lowest_level, NULL);
70563+
70564+ done_children(kdata);
70565+ done_carry_pool(pool);
70566+ return result;
70567+}
70568+
70569+void
70570+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
70571+{
70572+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
70573+ pgoff_t start_pg, end_pg;
70574+
70575+ start_pg = start >> PAGE_CACHE_SHIFT;
70576+ end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
70577+
70578+ if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
70579+ /*
70580+ * kill up to the page boundary.
70581+ */
70582+ assert("vs-123456", start_pg == end_pg);
70583+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
70584+ truncate);
70585+ } else if (start_pg != end_pg) {
70586+ /*
70587+ * page boundary is within killed portion of node.
70588+ */
70589+ assert("vs-654321", end_pg - start_pg == 1);
70590+ reiser4_invalidate_pages(inode->i_mapping, end_pg,
70591+ end_pg - start_pg, 1);
70592+ }
70593+ }
70594+ inode_sub_bytes(inode, end - start);
70595+}
70596+
70597+/**
70598+ * Delete whole @node from the reiser4 tree without loading it.
70599+ *
70600+ * @left: locked left neighbor,
70601+ * @node: node to be deleted,
70602+ * @smallest_removed: leftmost key of deleted node,
70603+ * @object: inode pointer, if we truncate a file body.
70604+ * @truncate: true if called for file truncate.
70605+ *
70606+ * @return: 0 if success, error code otherwise.
70607+ *
70608+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
70609+ * contains the right value of the smallest removed key from the previous
70610+ * cut_worker() iteration. This is needed for proper accounting of
70611+ * "i_blocks" and "i_bytes" fields of the @object.
70612+ */
70613+int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
70614+ struct inode *object, int truncate)
70615+{
70616+ lock_handle parent_lock;
70617+ coord_t cut_from;
70618+ coord_t cut_to;
70619+ reiser4_tree *tree;
70620+ int ret;
70621+
70622+ assert("zam-937", node != NULL);
70623+ assert("zam-933", znode_is_write_locked(node));
70624+ assert("zam-999", smallest_removed != NULL);
70625+
70626+ init_lh(&parent_lock);
70627+
70628+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
70629+ if (ret)
70630+ return ret;
70631+
70632+ assert("zam-934", !znode_above_root(parent_lock.node));
70633+
70634+ ret = zload(parent_lock.node);
70635+ if (ret)
70636+ goto failed_nozrelse;
70637+
70638+ ret = find_child_ptr(parent_lock.node, node, &cut_from);
70639+ if (ret)
70640+ goto failed;
70641+
70642+ /* decrement child counter and set parent pointer to NULL before
70643+ deleting the list from parent node because of checks in
70644+ internal_kill_item_hook (we can delete the last item from the parent
70645+ node, the parent node is going to be deleted and its c_count should
70646+ be zero). */
70647+
70648+ tree = znode_get_tree(node);
70649+ write_lock_tree(tree);
70650+ init_parent_coord(&node->in_parent, NULL);
70651+ --parent_lock.node->c_count;
70652+ write_unlock_tree(tree);
70653+
70654+ assert("zam-989", item_is_internal(&cut_from));
70655+
70656+ /* @node should be deleted after unlocking. */
70657+ ZF_SET(node, JNODE_HEARD_BANSHEE);
70658+
70659+ /* remove a pointer from the parent node to the node being deleted. */
70660+ coord_dup(&cut_to, &cut_from);
70661+ /* FIXME: shouldn't this be kill_node_content */
70662+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
70663+ if (ret)
70664+ /* FIXME(Zam): Should we re-connect the node to its parent if
70665+ * cut_node fails? */
70666+ goto failed;
70667+
70668+ {
70669+ reiser4_tree *tree = current_tree;
70670+ __u64 start_offset = 0, end_offset = 0;
70671+
70672+ read_lock_tree(tree);
70673+ write_lock_dk(tree);
70674+ if (object) {
70675+ /* We use @smallest_removed and the left delimiting of
70676+ * the current node for @object->i_blocks, i_bytes
70677+ * calculation. We assume that the items after the
70678+ * *@smallest_removed key have been deleted from the
70679+ * file body. */
70680+ start_offset = get_key_offset(znode_get_ld_key(node));
70681+ end_offset = get_key_offset(smallest_removed);
70682+ }
70683+
70684+ assert("zam-1021", znode_is_connected(node));
70685+ if (node->left)
70686+ znode_set_rd_key(node->left, znode_get_rd_key(node));
70687+
70688+ *smallest_removed = *znode_get_ld_key(node);
70689+
70690+ write_unlock_dk(tree);
70691+ read_unlock_tree(tree);
70692+
70693+ if (object) {
70694+ /* we used to perform actions which are to be performed on items on their removal from tree in
70695+ special item method - kill_hook. Here for optimization reasons we avoid reading node
70696+ containing item we remove and can not call item's kill hook. Instead we call function which
70697+ does exactly the same things as tail kill hook in assumption that node we avoid reading
70698+ contains only one item and that item is a tail one. */
70699+ fake_kill_hook_tail(object, start_offset, end_offset,
70700+ truncate);
70701+ }
70702+ }
70703+ failed:
70704+ zrelse(parent_lock.node);
70705+ failed_nozrelse:
70706+ done_lh(&parent_lock);
70707+
70708+ return ret;
70709+}
70710+
70711+static int can_delete(const reiser4_key *key, znode *node)
70712+{
70713+ int result;
70714+
70715+ read_lock_dk(current_tree);
70716+ result = keyle(key, znode_get_ld_key(node));
70717+ read_unlock_dk(current_tree);
70718+ return result;
70719+}
70720+
70721+/**
70722+ * This subroutine is not optimal but implementation seems to
70723+ * be easier).
70724+ *
70725+ * @tap: the point deletion process begins from,
70726+ * @from_key: the beginning of the deleted key range,
70727+ * @to_key: the end of the deleted key range,
70728+ * @smallest_removed: the smallest removed key,
70729+ * @truncate: true if called for file truncate.
70730+ * @progress: return true if a progress in file items deletions was made,
70731+ * @smallest_removed value is actual in that case.
70732+ *
70733+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long
70734+ * reiser4_cut_tree operation was interrupted for allowing atom commit.
70735+ */
70736+int
70737+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
70738+ const reiser4_key * to_key,
70739+ reiser4_key * smallest_removed, struct inode *object,
70740+ int truncate, int *progress)
70741+{
70742+ lock_handle next_node_lock;
70743+ coord_t left_coord;
70744+ int result;
70745+
70746+ assert("zam-931", tap->coord->node != NULL);
70747+ assert("zam-932", znode_is_write_locked(tap->coord->node));
70748+
70749+ *progress = 0;
70750+ init_lh(&next_node_lock);
70751+
70752+ while (1) {
70753+ znode *node; /* node from which items are cut */
70754+ node_plugin *nplug; /* node plugin for @node */
70755+
70756+ node = tap->coord->node;
70757+
70758+ /* Move next_node_lock to the next node on the left. */
70759+ result =
70760+ reiser4_get_left_neighbor(&next_node_lock, node,
70761+ ZNODE_WRITE_LOCK,
70762+ GN_CAN_USE_UPPER_LEVELS);
70763+ if (result != 0 && result != -E_NO_NEIGHBOR)
70764+ break;
70765+ /* Check can we delete the node as a whole. */
70766+ if (*progress && znode_get_level(node) == LEAF_LEVEL &&
70767+ can_delete(from_key, node)) {
70768+ result = reiser4_delete_node(node, smallest_removed,
70769+ object, truncate);
70770+ } else {
70771+ result = reiser4_tap_load(tap);
70772+ if (result)
70773+ return result;
70774+
70775+ /* Prepare the second (right) point for cut_node() */
70776+ if (*progress)
70777+ coord_init_last_unit(tap->coord, node);
70778+
70779+ else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70780+ NULL)
70781+ /* set rightmost unit for the items without lookup method */
70782+ tap->coord->unit_pos =
70783+ coord_last_unit_pos(tap->coord);
70784+
70785+ nplug = node->nplug;
70786+
70787+ assert("vs-686", nplug);
70788+ assert("vs-687", nplug->lookup);
70789+
70790+ /* left_coord is leftmost unit cut from @node */
70791+ result = nplug->lookup(node, from_key,
70792+ FIND_MAX_NOT_MORE_THAN,
70793+ &left_coord);
70794+
70795+ if (IS_CBKERR(result))
70796+ break;
70797+
70798+ /* adjust coordinates so that they are set to existing units */
70799+ if (coord_set_to_right(&left_coord)
70800+ || coord_set_to_left(tap->coord)) {
70801+ result = 0;
70802+ break;
70803+ }
70804+
70805+ if (coord_compare(&left_coord, tap->coord) ==
70806+ COORD_CMP_ON_RIGHT) {
70807+ /* keys from @from_key to @to_key are not in the tree */
70808+ result = 0;
70809+ break;
70810+ }
70811+
70812+ if (left_coord.item_pos != tap->coord->item_pos) {
70813+ /* do not allow to cut more than one item. It is added to solve problem of truncating
70814+ partially converted files. If file is partially converted there may exist a twig node
70815+ containing both internal item or items pointing to leaf nodes with formatting items
70816+ and extent item. We do not want to kill internal items being at twig node here
70817+ because cut_tree_worker assumes killing them from level level */
70818+ coord_dup(&left_coord, tap->coord);
70819+ assert("vs-1652",
70820+ coord_is_existing_unit(&left_coord));
70821+ left_coord.unit_pos = 0;
70822+ }
70823+
70824+ /* cut data from one node */
70825+ // *smallest_removed = *reiser4_min_key();
70826+ result =
70827+ kill_node_content(&left_coord, tap->coord, from_key,
70828+ to_key, smallest_removed,
70829+ next_node_lock.node, object,
70830+ truncate);
70831+ reiser4_tap_relse(tap);
70832+ }
70833+ if (result)
70834+ break;
70835+
70836+ ++(*progress);
70837+
70838+ /* Check whether all items with keys >= from_key were removed
70839+ * from the tree. */
70840+ if (keyle(smallest_removed, from_key))
70841+ /* result = 0; */
70842+ break;
70843+
70844+ if (next_node_lock.node == NULL)
70845+ break;
70846+
70847+ result = reiser4_tap_move(tap, &next_node_lock);
70848+ done_lh(&next_node_lock);
70849+ if (result)
70850+ break;
70851+
70852+ /* Break long reiser4_cut_tree operation (deletion of a large
70853+ file) if atom requires commit. */
70854+ if (*progress > CUT_TREE_MIN_ITERATIONS
70855+ && current_atom_should_commit()) {
70856+ result = -E_REPEAT;
70857+ break;
70858+ }
70859+ }
70860+ done_lh(&next_node_lock);
70861+ // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
70862+ return result;
70863+}
70864+
70865+/* there is a fundamental problem with optimizing deletes: VFS does it
70866+ one file at a time. Another problem is that if an item can be
70867+ anything, then deleting items must be done one at a time. It just
70868+ seems clean to writes this to specify a from and a to key, and cut
70869+ everything between them though. */
70870+
70871+/* use this function with care if deleting more than what is part of a single file. */
70872+/* do not use this when cutting a single item, it is suboptimal for that */
70873+
70874+/* You are encouraged to write plugin specific versions of this. It
70875+ cannot be optimal for all plugins because it works item at a time,
70876+ and some plugins could sometimes work node at a time. Regular files
70877+ however are not optimizable to work node at a time because of
70878+ extents needing to free the blocks they point to.
70879+
70880+ Optimizations compared to v3 code:
70881+
70882+ It does not balance (that task is left to memory pressure code).
70883+
70884+ Nodes are deleted only if empty.
70885+
70886+ Uses extents.
70887+
70888+ Performs read-ahead of formatted nodes whose contents are part of
70889+ the deletion.
70890+*/
70891+
70892+/**
70893+ * Delete everything from the reiser4 tree between two keys: @from_key and
70894+ * @to_key.
70895+ *
70896+ * @from_key: the beginning of the deleted key range,
70897+ * @to_key: the end of the deleted key range,
70898+ * @smallest_removed: the smallest removed key,
70899+ * @object: owner of cutting items.
70900+ * @truncate: true if called for file truncate.
70901+ * @progress: return true if a progress in file items deletions was made,
70902+ * @smallest_removed value is actual in that case.
70903+ *
70904+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
70905+ * operation was interrupted for allowing atom commit .
70906+ */
70907+
70908+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70909+ const reiser4_key * to_key,
70910+ reiser4_key * smallest_removed_p,
70911+ struct inode *object, int truncate, int *progress)
70912+{
70913+ lock_handle lock;
70914+ int result;
70915+ tap_t tap;
70916+ coord_t right_coord;
70917+ reiser4_key smallest_removed;
70918+ int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70919+ const reiser4_key *, reiser4_key *,
70920+ struct inode *, int, int *);
70921+ STORE_COUNTERS;
70922+
70923+ assert("umka-329", tree != NULL);
70924+ assert("umka-330", from_key != NULL);
70925+ assert("umka-331", to_key != NULL);
70926+ assert("zam-936", keyle(from_key, to_key));
70927+
70928+ if (smallest_removed_p == NULL)
70929+ smallest_removed_p = &smallest_removed;
70930+
70931+ init_lh(&lock);
70932+
70933+ do {
70934+ /* Find rightmost item to cut away from the tree. */
70935+ result = reiser4_object_lookup(object, to_key, &right_coord,
70936+ &lock, ZNODE_WRITE_LOCK,
70937+ FIND_MAX_NOT_MORE_THAN,
70938+ TWIG_LEVEL, LEAF_LEVEL,
70939+ CBK_UNIQUE, NULL /*ra_info */);
70940+ if (result != CBK_COORD_FOUND)
70941+ break;
70942+ if (object == NULL
70943+ || inode_file_plugin(object)->cut_tree_worker == NULL)
70944+ cut_tree_worker = cut_tree_worker_common;
70945+ else
70946+ cut_tree_worker =
70947+ inode_file_plugin(object)->cut_tree_worker;
70948+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70949+ result =
70950+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70951+ object, truncate, progress);
70952+ reiser4_tap_done(&tap);
70953+
70954+ reiser4_preempt_point();
70955+
70956+ } while (0);
70957+
70958+ done_lh(&lock);
70959+
70960+ if (result) {
70961+ switch (result) {
70962+ case -E_NO_NEIGHBOR:
70963+ result = 0;
70964+ break;
70965+ case -E_DEADLOCK:
70966+ result = -E_REPEAT;
70967+ case -E_REPEAT:
70968+ case -ENOMEM:
70969+ case -ENOENT:
70970+ break;
70971+ default:
70972+ warning("nikita-2861", "failure: %i", result);
70973+ }
70974+ }
70975+
70976+ CHECK_COUNTERS;
70977+ return result;
70978+}
70979+
70980+/* repeat reiser4_cut_tree_object until everything is deleted.
70981+ * unlike cut_file_items, it does not end current transaction if -E_REPEAT
70982+ * is returned by cut_tree_object. */
70983+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70984+ const reiser4_key * to, struct inode *inode, int truncate)
70985+{
70986+ int result;
70987+ int progress;
70988+
70989+ do {
70990+ result = reiser4_cut_tree_object(tree, from, to, NULL,
70991+ inode, truncate, &progress);
70992+ } while (result == -E_REPEAT);
70993+
70994+ return result;
70995+}
70996+
70997+/* finishing reiser4 initialization */
70998+int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
70999+ * initialized */ ,
71000+ const reiser4_block_nr * root_block /* address of a root block
71001+ * on a disk */ ,
71002+ tree_level height /* height of a tree */ ,
71003+ node_plugin * nplug /* default node plugin */ )
71004+{
71005+ int result;
71006+
71007+ assert("nikita-306", tree != NULL);
71008+ assert("nikita-307", root_block != NULL);
71009+ assert("nikita-308", height > 0);
71010+ assert("nikita-309", nplug != NULL);
71011+ assert("zam-587", tree->super != NULL);
71012+
71013+ tree->root_block = *root_block;
71014+ tree->height = height;
71015+ tree->estimate_one_insert = calc_estimate_one_insert(height);
71016+ tree->nplug = nplug;
71017+
71018+ tree->znode_epoch = 1ull;
71019+
71020+ cbk_cache_init(&tree->cbk_cache);
71021+
71022+ result = znodes_tree_init(tree);
71023+ if (result == 0)
71024+ result = jnodes_tree_init(tree);
71025+ if (result == 0) {
71026+ tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
71027+ reiser4_ctx_gfp_mask_get());
71028+ if (IS_ERR(tree->uber)) {
71029+ result = PTR_ERR(tree->uber);
71030+ tree->uber = NULL;
71031+ }
71032+ }
71033+ return result;
71034+}
71035+
71036+/* release resources associated with @tree */
71037+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
71038+{
71039+ if (tree == NULL)
71040+ return;
71041+
71042+ if (tree->uber != NULL) {
71043+ zput(tree->uber);
71044+ tree->uber = NULL;
71045+ }
71046+ znodes_tree_done(tree);
71047+ jnodes_tree_done(tree);
71048+ cbk_cache_done(&tree->cbk_cache);
71049+}
71050+
71051+/* Make Linus happy.
71052+ Local variables:
71053+ c-indentation-style: "K&R"
71054+ mode-name: "LC"
71055+ c-basic-offset: 8
71056+ tab-width: 8
71057+ fill-column: 120
71058+ scroll-step: 1
71059+ End:
71060+*/
71061diff --git a/fs/reiser4/tree.h b/fs/reiser4/tree.h
71062new file mode 100644
71063index 0000000..73aa70a
71064--- /dev/null
71065+++ b/fs/reiser4/tree.h
71066@@ -0,0 +1,577 @@
71067+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71068+ * reiser4/README */
71069+
71070+/* Tree operations. See fs/reiser4/tree.c for comments */
71071+
71072+#if !defined( __REISER4_TREE_H__ )
71073+#define __REISER4_TREE_H__
71074+
71075+#include "forward.h"
71076+#include "debug.h"
71077+#include "dformat.h"
71078+#include "plugin/node/node.h"
71079+#include "plugin/plugin.h"
71080+#include "znode.h"
71081+#include "tap.h"
71082+
71083+#include <linux/types.h> /* for __u?? */
71084+#include <linux/fs.h> /* for struct super_block */
71085+#include <linux/spinlock.h>
71086+#include <linux/sched.h> /* for struct task_struct */
71087+
71088+/* fictive block number never actually used */
71089+extern const reiser4_block_nr UBER_TREE_ADDR;
71090+
71091+/* &cbk_cache_slot - entry in a coord cache.
71092+
71093+ This is entry in a coord_by_key (cbk) cache, represented by
71094+ &cbk_cache.
71095+
71096+*/
71097+typedef struct cbk_cache_slot {
71098+ /* cached node */
71099+ znode *node;
71100+ /* linkage to the next cbk cache slot in a LRU order */
71101+ struct list_head lru;
71102+} cbk_cache_slot;
71103+
/* &cbk_cache - coord cache. This is part of reiser4_tree.

   cbk_cache is supposed to speed up tree lookups by caching results of recent
   successful lookups (we don't cache negative results as dentry cache
   does). Cache consists of relatively small number of entries kept in a LRU
   order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
   which we can obtain a range of keys that covered by this znode. Before
   embarking into real tree traversal we scan cbk_cache slot by slot and for
   each slot check whether key we are looking for is between minimal and
   maximal keys for node pointed to by this slot. If no match is found, real
   tree traversal is performed and if result is successful, appropriate entry
   is inserted into cache, possibly pulling least recently used entry out of
   it.

   Tree spin lock is used to protect coord cache. If contention for this
   lock proves to be too high, more finer grained locking can be added.

   Invariants involving parts of this data-type:

      [cbk-cache-invariant]
*/
typedef struct cbk_cache {
	/* serializator: rwlock guarding the LRU list and slot contents */
	rwlock_t guard;
	/* number of entries in the slot array below */
	int nr_slots;
	/* head of LRU list of cache slots */
	struct list_head lru;
	/* actual array of slots (nr_slots entries) */
	cbk_cache_slot *slot;
} cbk_cache;
71134+
/* level_lookup_result - possible outcome of looking up key at some level.
   This is used by coord_by_key when traversing tree downward. */
typedef enum {
	/* continue to the next level */
	LOOKUP_CONT,
	/* done. Either required item was found, or we can prove it
	   doesn't exist, or some error occurred. */
	LOOKUP_DONE,
	/* restart traversal from the root. Infamous "repetition". */
	LOOKUP_REST
} level_lookup_result;
71146+
/* This is representation of internal reiser4 tree where all file-system
   data and meta-data are stored. This structure is passed to all tree
   manipulation functions. It's different from the super block because:
   we don't want to limit ourselves to strictly one to one mapping
   between super blocks and trees, and, because they are logically
   different: there are things in a super block that have no relation to
   the tree (bitmaps, journalling area, mount options, etc.) and there
   are things in a tree that bear no relation to the super block, like
   tree of znodes.

   At this time, there is only one tree
   per filesystem, and this struct is part of the super block. We only
   call the super block the super block for historical reasons (most
   other filesystems call the per filesystem metadata the super block).
*/

struct reiser4_tree {
	/* block_nr == 0 is fake znode. Write lock it, while changing
	   tree height. */
	/* disk address of root node of a tree */
	reiser4_block_nr root_block;

	/* level of the root node. If this is 1, tree consists of root
	   node only */
	tree_level height;

	/*
	 * this is cached here avoid calling plugins through function
	 * dereference all the time.
	 */
	__u64 estimate_one_insert;

	/* cache of recent tree lookup results (see cbk_cache above) */
	cbk_cache cbk_cache;

	/* hash table to look up znodes by block number. */
	z_hash_table zhash_table;
	/* separate hash table for znodes with fake (not yet allocated)
	   block numbers */
	z_hash_table zfake_table;
	/* hash table to look up jnodes by inode and offset. */
	j_hash_table jhash_table;

	/* lock protecting:
	   - parent pointers,
	   - sibling pointers,
	   - znode hash table
	   - coord cache
	 */
	/* NOTE: The "giant" tree lock can be replaced by more spin locks,
	   hoping they will be less contented. We can use one spin lock per one
	   znode hash bucket.  With adding of some code complexity, sibling
	   pointers can be protected by both znode spin locks.  However it looks
	   more SMP scalable we should test this locking change on n-ways (n >
	   4) SMP machines.  Current 4-ways machine test does not show that tree
	   lock is contented and it is a bottleneck (2003.07.25). */

	rwlock_t tree_lock;

	/* lock protecting delimiting keys */
	rwlock_t dk_lock;

	/* spin lock protecting znode_epoch */
	spinlock_t epoch_lock;
	/* version stamp used to mark znode updates. See seal.[ch] for more
	 * information. */
	__u64 znode_epoch;

	/* "fake" znode hanging above the root; see reiser4_add_tree_root() */
	znode *uber;
	/* default node plugin used to format new nodes of this tree */
	node_plugin *nplug;
	/* super block this tree belongs to */
	struct super_block *super;
	struct {
		/* carry flags used for insertion of new nodes */
		__u32 new_node_flags;
		/* carry flags used for insertion of new extents */
		__u32 new_extent_flags;
		/* carry flags used for paste operations */
		__u32 paste_flags;
		/* carry flags used for insert operations */
		__u32 insert_flags;
	} carry;
};
71227+
71228+extern int reiser4_init_tree(reiser4_tree * tree,
71229+ const reiser4_block_nr * root_block,
71230+ tree_level height, node_plugin * default_plugin);
71231+extern void reiser4_done_tree(reiser4_tree * tree);
71232+
/* cbk flags: options for coord_by_key() */
typedef enum {
	/* coord_by_key() is called for insertion. This is necessary because
	   of extents being located at the twig level. For explanation, see
	   comment just above is_next_item_internal().
	 */
	CBK_FOR_INSERT = (1 << 0),
	/* coord_by_key() is called with key that is known to be unique */
	CBK_UNIQUE = (1 << 1),
	/* coord_by_key() can trust delimiting keys. This options is not user
	   accessible. coord_by_key() will set it automatically. It will be
	   only cleared by special-case in extents-on-the-twig-level handling
	   where it is necessary to insert item with a key smaller than
	   leftmost key in a node. This is necessary because of extents being
	   located at the twig level. For explanation, see comment just above
	   is_next_item_internal().
	 */
	CBK_TRUST_DK = (1 << 2),
	CBK_READA = (1 << 3),	/* original: readahead leaves which contain items of certain file */
	CBK_READDIR_RA = (1 << 4),	/* readdir: readahead whole directory and all its stat datas */
	CBK_DKSET = (1 << 5),
	CBK_EXTENDED_COORD = (1 << 6),	/* coord_t is actually an extended
					 * coord -- upstream comment is
					 * truncated here; TODO confirm */
	CBK_IN_CACHE = (1 << 7),	/* node is already in cache */
	CBK_USE_CRABLOCK = (1 << 8)	/* use crab_lock instead of long term
					 * lock */
} cbk_flags;
71259+
/* insertion outcome. IBK = insert by key. Values double as error codes
   (0 on success, negative errno otherwise). */
typedef enum {
	IBK_INSERT_OK = 0,
	IBK_ALREADY_EXISTS = -EEXIST,
	IBK_IO_ERROR = -EIO,
	IBK_NO_SPACE = -E_NODE_FULL,
	IBK_OOM = -ENOMEM
} insert_result;
71268+
71269+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
71270+
71271+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
71272+ lock_handle * lh, void *arg);
71273+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
71274+ lock_handle * lh,
71275+ tree_iterate_actor_t actor, void *arg,
71276+ znode_lock_mode mode, int through_units_p);
71277+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
71278+ znode_lock_request pri, lock_handle * lh);
71279+
/* return node plugin of @node. @node must be loaded (znode_is_loaded). */
static inline node_plugin *node_plugin_by_node(const znode *
					       node /* node to query */ )
{
	assert("vs-213", node != NULL);
	assert("vs-214", znode_is_loaded(node));

	return node->nplug;
}
71289+
/* number of items in @node. Returns the cached counter; the debug-only
   assert cross-checks it against the node plugin's authoritative count. */
static inline pos_in_node_t node_num_items(const znode * node)
{
	assert("nikita-2754", znode_is_loaded(node));
	assert("nikita-2468",
	       node_plugin_by_node(node)->num_of_items(node) == node->nr_items);

	return node->nr_items;
}
71299+
/* Return the number of items at the present node. Asserts coord->node !=
   NULL. */
static inline unsigned coord_num_items(const coord_t * coord)
{
	assert("jmacd-9805", coord->node != NULL);

	return node_num_items(coord->node);
}
71308+
71309+/* true if @node is empty */
71310+static inline int node_is_empty(const znode * node)
71311+{
71312+ return node_num_items(node) == 0;
71313+}
71314+
/* outcome of a node shift operation; 0 on success, negative errno on
   failure (mirrors insert_result above). */
typedef enum {
	SHIFTED_SOMETHING = 0,
	SHIFT_NO_SPACE = -E_NODE_FULL,
	SHIFT_IO_ERROR = -EIO,
	SHIFT_OOM = -ENOMEM,
} shift_result;
71321+
71322+extern node_plugin *node_plugin_by_coord(const coord_t * coord);
71323+extern int is_coord_in_node(const coord_t * coord);
71324+extern int key_in_node(const reiser4_key *, const coord_t *);
71325+extern void coord_item_move_to(coord_t * coord, int items);
71326+extern void coord_unit_move_to(coord_t * coord, int units);
71327+
71328+/* there are two types of repetitive accesses (ra): intra-syscall
71329+ (local) and inter-syscall (global). Local ra is used when
71330+ during single syscall we add/delete several items and units in the
71331+ same place in a tree. Note that plan-A fragments local ra by
71332+ separating stat-data and file body in key-space. Global ra is
71333+ used when user does repetitive modifications in the same place in a
71334+ tree.
71335+
71336+ Our ra implementation serves following purposes:
71337+ 1 it affects balancing decisions so that next operation in a row
71338+ can be performed faster;
71339+ 2 it affects lower-level read-ahead in page-cache;
71340+ 3 it allows to avoid unnecessary lookups by maintaining some state
71341+ across several operations (this is only for local ra);
71342+ 4 it leaves room for lazy-micro-balancing: when we start a sequence of
71343+ operations they are performed without actually doing any intra-node
71344+ shifts, until we finish sequence or scope of sequence leaves
71345+ current node, only then we really pack node (local ra only).
71346+*/
71347+
71348+/* another thing that can be useful is to keep per-tree and/or
71349+ per-process cache of recent lookups. This cache can be organised as a
71350+ list of block numbers of formatted nodes sorted by starting key in
71351+ this node. Balancings should invalidate appropriate parts of this
71352+ cache.
71353+*/
71354+
71355+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
71356+ coord_t * coord, lock_handle * handle,
71357+ znode_lock_mode lock, lookup_bias bias,
71358+ tree_level lock_level, tree_level stop_level,
71359+ __u32 flags, ra_info_t *);
71360+
71361+lookup_result reiser4_object_lookup(struct inode *object,
71362+ const reiser4_key * key,
71363+ coord_t * coord,
71364+ lock_handle * lh,
71365+ znode_lock_mode lock_mode,
71366+ lookup_bias bias,
71367+ tree_level lock_level,
71368+ tree_level stop_level,
71369+ __u32 flags, ra_info_t * info);
71370+
71371+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
71372+ reiser4_item_data * data, coord_t * coord,
71373+ lock_handle * lh,
71374+ tree_level stop_level, __u32 flags);
71375+insert_result insert_by_coord(coord_t * coord,
71376+ reiser4_item_data * data, const reiser4_key * key,
71377+ lock_handle * lh, __u32);
71378+insert_result insert_extent_by_coord(coord_t * coord,
71379+ reiser4_item_data * data,
71380+ const reiser4_key * key, lock_handle * lh);
71381+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
71382+ const reiser4_key * to_key,
71383+ reiser4_key * smallest_removed);
71384+int kill_node_content(coord_t * from, coord_t * to,
71385+ const reiser4_key * from_key, const reiser4_key * to_key,
71386+ reiser4_key * smallest_removed,
71387+ znode * locked_left_neighbor, struct inode *inode,
71388+ int truncate);
71389+
71390+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
71391+ reiser4_key * key, lock_handle * lh, cop_insert_flag);
71392+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
71393+ reiser4_item_data * data, unsigned);
71394+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
71395+int find_new_child_ptr(znode * parent, znode * child, znode * left,
71396+ coord_t * result);
71397+
71398+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
71399+int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
71400+
71401+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
71402+
71403+extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
71404+ const reiser4_key *, reiser4_key *,
71405+ struct inode *, int, int *);
71406+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
71407+ const reiser4_key *, reiser4_key *,
71408+ struct inode *, int, int *);
71409+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
71410+ const reiser4_key * to, struct inode *, int);
71411+
71412+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
71413+extern int check_tree_pointer(const coord_t * pointer, const znode * child);
71414+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
71415+ znode * left, coord_t * result);
71416+extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
71417+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
71418+ znode * child);
71419+extern znode *child_znode(const coord_t * in_parent, znode * parent,
71420+ int incore_p, int setup_dkeys_p);
71421+
71422+extern int cbk_cache_init(cbk_cache * cache);
71423+extern void cbk_cache_done(cbk_cache * cache);
71424+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
71425+
71426+extern char *sprint_address(const reiser4_block_nr * block);
71427+
71428+#if REISER4_DEBUG
71429+extern void print_coord_content(const char *prefix, coord_t * p);
71430+extern void reiser4_print_address(const char *prefix,
71431+ const reiser4_block_nr * block);
71432+extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
71433+ __u32 flags);
71434+extern void check_dkeys(znode *node);
71435+#else
71436+#define print_coord_content(p, c) noop
71437+#define reiser4_print_address(p, b) noop
71438+#endif
71439+
71440+extern void forget_znode(lock_handle * handle);
71441+extern int deallocate_znode(znode * node);
71442+
71443+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
71444+
/* struct used internally to pack all numerous arguments of tree lookup.
   Used to avoid passing a lot of arguments to helper functions. */
typedef struct cbk_handle {
	/* tree we are in */
	reiser4_tree *tree;
	/* key we are going after */
	const reiser4_key *key;
	/* coord we will store result in */
	coord_t *coord;
	/* type of lock to take on target node */
	znode_lock_mode lock_mode;
	/* lookup bias. See comments at the declaration of lookup_bias */
	lookup_bias bias;
	/* lock level: level starting from which tree traversal starts taking
	 * write locks. */
	tree_level lock_level;
	/* level where search will stop. Either item will be found between
	   lock_level and stop_level, or CBK_COORD_NOTFOUND will be
	   returned.
	 */
	tree_level stop_level;
	/* level we are currently at */
	tree_level level;
	/* block number of @active node. Tree traversal operates on two
	   nodes: active and parent.  */
	reiser4_block_nr block;
	/* put here error message to be printed by caller */
	const char *error;
	/* result passed back to caller */
	lookup_result result;
	/* lock handles for active and parent */
	lock_handle *parent_lh;
	lock_handle *active_lh;
	/* left delimiting key of the active node */
	reiser4_key ld_key;
	/* right delimiting key of the active node */
	reiser4_key rd_key;
	/* flags, passed to the cbk routine. Bits of this bitmask are defined
	   in tree.h:cbk_flags enum.  */
	__u32 flags;
	/* optional read-ahead hints; may be NULL */
	ra_info_t *ra_info;
	/* inode the lookup is performed on behalf of; may be NULL */
	struct inode *object;
} cbk_handle;
71486+
71487+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
71488+
71489+/* eottl.c */
71490+extern int handle_eottl(cbk_handle *h, int *outcome);
71491+
71492+int lookup_multikey(cbk_handle * handle, int nr_keys);
71493+int lookup_couple(reiser4_tree * tree,
71494+ const reiser4_key * key1, const reiser4_key * key2,
71495+ coord_t * coord1, coord_t * coord2,
71496+ lock_handle * lh1, lock_handle * lh2,
71497+ znode_lock_mode lock_mode, lookup_bias bias,
71498+ tree_level lock_level, tree_level stop_level, __u32 flags,
71499+ int *result1, int *result2);
71500+
/* Take the tree rwlock for reading.

   Debug builds enforce the lock-ordering discipline: the tree lock is not
   recursive, and it ranks above the txnh, dk and lock-stack spinlocks, so
   none of those may already be held by this thread. */
static inline void read_lock_tree(reiser4_tree *tree)
{
	/* check that tree is not locked */
	assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
		    LOCK_CNT_NIL(read_locked_tree) &&
		    LOCK_CNT_NIL(write_locked_tree)));
	/* check that spinlocks of lower priorities are not held */
	assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
		    LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(spin_locked_stack)));

	read_lock(&(tree->tree_lock));

	/* per-thread debug counters, updated only after the lock is held */
	LOCK_CNT_INC(read_locked_tree);
	LOCK_CNT_INC(rw_locked_tree);
	LOCK_CNT_INC(spin_locked);
}
71518+
/* Release the tree rwlock taken by read_lock_tree(). Counters are
   decremented before the unlock, mirroring the acquire path.
   NOTE(review): the label "nikita-1376" appears on two asserts here and
   the labels are shared with write_unlock_tree() -- looks like
   copy-paste; labels are normally unique. Left as-is (runtime strings). */
static inline void read_unlock_tree(reiser4_tree *tree)
{
	assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
	assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
	assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));

	LOCK_CNT_DEC(read_locked_tree);
	LOCK_CNT_DEC(rw_locked_tree);
	LOCK_CNT_DEC(spin_locked);

	read_unlock(&(tree->tree_lock));
}
71531+
/* Take the tree rwlock for writing. Same ordering discipline as
   read_lock_tree(): no tree lock already held, no lower-priority
   spinlocks (txnh, dk, lock stack) held. */
static inline void write_lock_tree(reiser4_tree *tree)
{
	/* check that tree is not locked */
	assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
		    LOCK_CNT_NIL(read_locked_tree) &&
		    LOCK_CNT_NIL(write_locked_tree)));
	/* check that spinlocks of lower priorities are not held */
	assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
		    LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(spin_locked_stack)));

	write_lock(&(tree->tree_lock));

	/* per-thread debug counters, updated only after the lock is held */
	LOCK_CNT_INC(write_locked_tree);
	LOCK_CNT_INC(rw_locked_tree);
	LOCK_CNT_INC(spin_locked);
}
71549+
/* Release the tree rwlock taken by write_lock_tree(). Counters are
   decremented before the unlock, mirroring the acquire path. */
static inline void write_unlock_tree(reiser4_tree *tree)
{
	assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
	assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
	assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));

	LOCK_CNT_DEC(write_locked_tree);
	LOCK_CNT_DEC(rw_locked_tree);
	LOCK_CNT_DEC(spin_locked);

	write_unlock(&(tree->tree_lock));
}
71562+
/* Take the delimiting-key rwlock for reading. The dk lock ranks below
   the tree lock and above the lock-stack spinlock: it must not be held
   recursively, and no lock-stack spinlock may be held. */
static inline void read_lock_dk(reiser4_tree *tree)
{
	/* check that dk is not locked */
	assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(read_locked_dk) &&
		    LOCK_CNT_NIL(write_locked_dk)));
	/* check that spinlocks of lower priorities are not held */
	assert("", LOCK_CNT_NIL(spin_locked_stack));

	read_lock(&((tree)->dk_lock));

	/* per-thread debug counters, updated only after the lock is held */
	LOCK_CNT_INC(read_locked_dk);
	LOCK_CNT_INC(rw_locked_dk);
	LOCK_CNT_INC(spin_locked);
}
71578+
/* Release the delimiting-key rwlock taken by read_lock_dk(). Counters
   are decremented before the unlock, mirroring the acquire path. */
static inline void read_unlock_dk(reiser4_tree *tree)
{
	assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
	assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
	assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));

	LOCK_CNT_DEC(read_locked_dk);
	LOCK_CNT_DEC(rw_locked_dk);
	LOCK_CNT_DEC(spin_locked);

	read_unlock(&(tree->dk_lock));
}
71591+
/* Take the delimiting-key rwlock for writing. Same ordering discipline
   as read_lock_dk(). */
static inline void write_lock_dk(reiser4_tree *tree)
{
	/* check that dk is not locked */
	assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
		    LOCK_CNT_NIL(read_locked_dk) &&
		    LOCK_CNT_NIL(write_locked_dk)));
	/* check that spinlocks of lower priorities are not held */
	assert("", LOCK_CNT_NIL(spin_locked_stack));

	write_lock(&((tree)->dk_lock));

	/* per-thread debug counters, updated only after the lock is held */
	LOCK_CNT_INC(write_locked_dk);
	LOCK_CNT_INC(rw_locked_dk);
	LOCK_CNT_INC(spin_locked);
}
71607+
/* Release the delimiting-key rwlock taken by write_lock_dk(). Counters
   are decremented before the unlock, mirroring the acquire path. */
static inline void write_unlock_dk(reiser4_tree *tree)
{
	assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
	assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
	assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));

	LOCK_CNT_DEC(write_locked_dk);
	LOCK_CNT_DEC(rw_locked_dk);
	LOCK_CNT_DEC(spin_locked);

	write_unlock(&(tree->dk_lock));
}
71620+
71621+/* estimate api. Implementation is in estimate.c */
71622+reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
71623+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
71624+reiser4_block_nr estimate_insert_flow(tree_level);
71625+reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
71626+reiser4_block_nr calc_estimate_one_insert(tree_level);
71627+reiser4_block_nr estimate_dirty_cluster(struct inode *);
71628+reiser4_block_nr estimate_insert_cluster(struct inode *);
71629+reiser4_block_nr estimate_update_cluster(struct inode *);
71630+
71631+/* __REISER4_TREE_H__ */
71632+#endif
71633+
71634+/* Make Linus happy.
71635+ Local variables:
71636+ c-indentation-style: "K&R"
71637+ mode-name: "LC"
71638+ c-basic-offset: 8
71639+ tab-width: 8
71640+ fill-column: 120
71641+ scroll-step: 1
71642+ End:
71643+*/
71644diff --git a/fs/reiser4/tree_mod.c b/fs/reiser4/tree_mod.c
71645new file mode 100644
71646index 0000000..bcc6548
71647--- /dev/null
71648+++ b/fs/reiser4/tree_mod.c
71649@@ -0,0 +1,386 @@
71650+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71651+ * reiser4/README */
71652+
71653+/*
71654+ * Functions to add/delete new nodes to/from the tree.
71655+ *
71656+ * Functions from this file are used by carry (see carry*) to handle:
71657+ *
71658+ * . insertion of new formatted node into tree
71659+ *
71660+ * . addition of new tree root, increasing tree height
71661+ *
71662+ * . removing tree root, decreasing tree height
71663+ *
71664+ */
71665+
71666+#include "forward.h"
71667+#include "debug.h"
71668+#include "dformat.h"
71669+#include "key.h"
71670+#include "coord.h"
71671+#include "plugin/plugin.h"
71672+#include "jnode.h"
71673+#include "znode.h"
71674+#include "tree_mod.h"
71675+#include "block_alloc.h"
71676+#include "tree_walk.h"
71677+#include "tree.h"
71678+#include "super.h"
71679+
71680+#include <linux/err.h>
71681+
71682+static int add_child_ptr(znode * parent, znode * child);
71683+/* warning only issued if error is not -E_REPEAT */
71684+#define ewarning( error, ... ) \
71685+ if( ( error ) != -E_REPEAT ) \
71686+ warning( __VA_ARGS__ )
71687+
71688+/* allocate new node on the @level and immediately on the right of @brother. */
71689+znode * reiser4_new_node(znode * brother /* existing left neighbor
71690+ * of new node */,
71691+ tree_level level /* tree level at which new node is to
71692+ * be allocated */)
71693+{
71694+ znode *result;
71695+ int retcode;
71696+ reiser4_block_nr blocknr;
71697+
71698+ assert("nikita-930", brother != NULL);
71699+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
71700+
71701+ retcode = assign_fake_blocknr_formatted(&blocknr);
71702+ if (retcode == 0) {
71703+ result =
71704+ zget(znode_get_tree(brother), &blocknr, NULL, level,
71705+ reiser4_ctx_gfp_mask_get());
71706+ if (IS_ERR(result)) {
71707+ ewarning(PTR_ERR(result), "nikita-929",
71708+ "Cannot allocate znode for carry: %li",
71709+ PTR_ERR(result));
71710+ return result;
71711+ }
71712+ /* cheap test, can be executed even when debugging is off */
71713+ if (!znode_just_created(result)) {
71714+ warning("nikita-2213",
71715+ "Allocated already existing block: %llu",
71716+ (unsigned long long)blocknr);
71717+ zput(result);
71718+ return ERR_PTR(RETERR(-EIO));
71719+ }
71720+
71721+ assert("nikita-931", result != NULL);
71722+ result->nplug = znode_get_tree(brother)->nplug;
71723+ assert("nikita-933", result->nplug != NULL);
71724+
71725+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
71726+ if (retcode == 0) {
71727+ ZF_SET(result, JNODE_CREATED);
71728+ zrelse(result);
71729+ } else {
71730+ zput(result);
71731+ result = ERR_PTR(retcode);
71732+ }
71733+ } else {
71734+ /* failure to allocate new node during balancing.
71735+ This should never happen. Ever. Returning -E_REPEAT
71736+ is not viable solution, because "out of disk space"
71737+ is not transient error that will go away by itself.
71738+ */
71739+ ewarning(retcode, "nikita-928",
71740+ "Cannot allocate block for carry: %i", retcode);
71741+ result = ERR_PTR(retcode);
71742+ }
71743+ assert("nikita-1071", result != NULL);
71744+ return result;
71745+}
71746+
/* allocate new root and add it to the tree

   This helper function is called by add_new_root().

   On success the tree grows by one level: a freshly allocated node becomes
   the new root, with @old_root as its only child. Returns the (write
   locked during setup, unlocked on return) new root, or ERR_PTR on error.
*/
znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
			     znode * fake /* "fake" znode */ )
{
	reiser4_tree *tree = znode_get_tree(old_root);
	znode *new_root = NULL;	/* to shut gcc up */
	int result;

	assert("nikita-1069", old_root != NULL);
	assert("umka-262", fake != NULL);
	assert("umka-263", tree != NULL);

	/* "fake" znode---one always hanging just above current root. This
	   node is locked when new root is created or existing root is
	   deleted. Downward tree traversal takes lock on it before taking
	   lock on a root node. This avoids race conditions with root
	   manipulations.

	 */
	assert("nikita-1348", znode_above_root(fake));
	assert("nikita-1211", znode_is_root(old_root));

	result = 0;
	if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
		warning("nikita-1344", "Tree is too tall: %i", tree->height);
		/* ext2 returns -ENOSPC when it runs out of free inodes with a
		   following comment (fs/ext2/ialloc.c:441): Is it really
		   ENOSPC?

		   -EXFULL? -EINVAL?
		 */
		result = RETERR(-ENOSPC);
	} else {
		/* Allocate block for new root. It's not that
		   important where it will be allocated, as root is
		   almost always in memory. Moreover, allocate on
		   flush can be going here.
		 */
		assert("nikita-1448", znode_is_root(old_root));
		new_root = reiser4_new_node(fake, tree->height + 1);
		/* NOTE(review): if zload() or the long term lock below fails,
		   the reference obtained via reiser4_new_node() does not seem
		   to be dropped with zput() -- possible reference leak on
		   these error paths; confirm against zget/zput semantics. */
		if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
			lock_handle rlh;

			init_lh(&rlh);
			result =
			    longterm_lock_znode(&rlh, new_root,
						ZNODE_WRITE_LOCK,
						ZNODE_LOCK_LOPRI);
			if (result == 0) {
				parent_coord_t *in_parent;

				znode_make_dirty(fake);

				/* new root is a child of "fake" node */
				write_lock_tree(tree);

				++tree->height;

				/* recalculate max balance overhead */
				tree->estimate_one_insert =
				    estimate_one_insert_item(tree);

				tree->root_block = *znode_get_block(new_root);
				in_parent = &new_root->in_parent;
				init_parent_coord(in_parent, fake);
				/* manually insert new root into sibling
				 * list. With this all nodes involved into
				 * balancing are connected after balancing is
				 * done---useful invariant to check. */
				sibling_list_insert_nolock(new_root, NULL);
				write_unlock_tree(tree);

				/* insert into new root pointer to the
				   @old_root. New root spans the whole key
				   space. */
				assert("nikita-1110",
				       WITH_DATA(new_root,
						 node_is_empty(new_root)));
				write_lock_dk(tree);
				znode_set_ld_key(new_root, reiser4_min_key());
				znode_set_rd_key(new_root, reiser4_max_key());
				write_unlock_dk(tree);
				if (REISER4_DEBUG) {
					ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
					ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
					ZF_SET(old_root, JNODE_ORPHAN);
				}
				result = add_child_ptr(new_root, old_root);
				done_lh(&rlh);
			}
			zrelse(new_root);
		}
	}
	if (result != 0)
		new_root = ERR_PTR(result);
	return new_root;
}
71847+
71848+/* build &reiser4_item_data for inserting child pointer
71849+
71850+ Build &reiser4_item_data that can be later used to insert pointer to @child
71851+ in its parent.
71852+
71853+*/
71854+void build_child_ptr_data(znode * child /* node pointer to which will be
71855+ * inserted */ ,
71856+ reiser4_item_data * data /* where to store result */ )
71857+{
71858+ assert("nikita-1116", child != NULL);
71859+ assert("nikita-1117", data != NULL);
71860+
71861+ /*
71862+ * NOTE: use address of child's blocknr as address of data to be
71863+ * inserted. As result of this data gets into on-disk structure in cpu
71864+ * byte order. internal's create_hook converts it to little endian byte
71865+ * order.
71866+ */
71867+ data->data = (char *)znode_get_block(child);
71868+ /* data -> data is kernel space */
71869+ data->user = 0;
71870+ data->length = sizeof(reiser4_block_nr);
71871+ /* FIXME-VS: hardcoded internal item? */
71872+
71873+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71874+ data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71875+}
71876+
71877+/* add pointer to @child into empty @parent.
71878+
71879+ This is used when pointer to old root is inserted into new root which is
71880+ empty.
71881+*/
71882+static int add_child_ptr(znode * parent, znode * child)
71883+{
71884+ coord_t coord;
71885+ reiser4_item_data data;
71886+ int result;
71887+ reiser4_key key;
71888+
71889+ assert("nikita-1111", parent != NULL);
71890+ assert("nikita-1112", child != NULL);
71891+ assert("nikita-1115",
71892+ znode_get_level(parent) == znode_get_level(child) + 1);
71893+
71894+ result = zload(parent);
71895+ if (result != 0)
71896+ return result;
71897+ assert("nikita-1113", node_is_empty(parent));
71898+ coord_init_first_unit(&coord, parent);
71899+
71900+ build_child_ptr_data(child, &data);
71901+ data.arg = NULL;
71902+
71903+ read_lock_dk(znode_get_tree(parent));
71904+ key = *znode_get_ld_key(child);
71905+ read_unlock_dk(znode_get_tree(parent));
71906+
71907+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71908+ NULL);
71909+ znode_make_dirty(parent);
71910+ zrelse(parent);
71911+ return result;
71912+}
71913+
/* actually remove tree root: take the uber ("fake") znode lock, point the
   tree at @new_root, decrement tree height, and reinitialize @old_root as
   an empty node marked for removal (JNODE_HEARD_BANSHEE). Returns 0 or a
   negative errno. */
static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
						  * being removed */,
			     znode * old_root /* root node that is being
					       * removed */ ,
			     znode * new_root /* new root---sole child of
					       * @old_root */,
			     const reiser4_block_nr * new_root_blk /* disk address of
								    * @new_root */)
{
	znode *uber;
	int result;
	lock_handle handle_for_uber;

	assert("umka-265", tree != NULL);
	assert("nikita-1198", new_root != NULL);
	assert("nikita-1199",
	       znode_get_level(new_root) + 1 == znode_get_level(old_root));

	assert("nikita-1201", znode_is_write_locked(old_root));

	assert("nikita-1203",
	       disk_addr_eq(new_root_blk, znode_get_block(new_root)));

	init_lh(&handle_for_uber);
	/* obtain and lock "fake" znode protecting changes in tree height. */
	result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
				&handle_for_uber);
	if (result == 0) {
		uber = handle_for_uber.node;

		znode_make_dirty(uber);

		/* don't take long term lock a @new_root. Take spinlock. */

		write_lock_tree(tree);

		tree->root_block = *new_root_blk;
		--tree->height;

		/* recalculate max balance overhead */
		tree->estimate_one_insert = estimate_one_insert_item(tree);

		assert("nikita-1202",
		       tree->height == znode_get_level(new_root));

		/* new root is child on "fake" node */
		init_parent_coord(&new_root->in_parent, uber);
		++uber->c_count;

		/* sibling_list_insert_nolock(new_root, NULL); */
		write_unlock_tree(tree);

		/* reinitialise old root: format it as an empty node so it
		   can be reclaimed. */
		result = node_plugin_by_node(old_root)->init(old_root);
		znode_make_dirty(old_root);
		if (result == 0) {
			assert("nikita-1279", node_is_empty(old_root));
			ZF_SET(old_root, JNODE_HEARD_BANSHEE);
			old_root->c_count = 0;
		}
	}
	done_lh(&handle_for_uber);

	return result;
}
71980+
71981+/* remove tree root
71982+
71983+ This function removes tree root, decreasing tree height by one. Tree root
71984+ and its only child (that is going to become new tree root) are write locked
71985+ at the entry.
71986+
71987+ To remove tree root we need to take lock on special "fake" znode that
71988+ protects changes of tree height. See comments in reiser4_add_tree_root() for
71989+ more on this.
71990+
71991+ Also parent pointers have to be updated in
71992+ old and new root. To simplify code, function is split into two parts: outer
71993+ reiser4_kill_tree_root() collects all necessary arguments and calls
71994+ reiser4_kill_root() to do the actual job.
71995+
71996+*/
71997+int reiser4_kill_tree_root(znode * old_root /* tree root that we are
71998+ removing*/)
71999+{
72000+ int result;
72001+ coord_t down_link;
72002+ znode *new_root;
72003+ reiser4_tree *tree;
72004+
72005+ assert("umka-266", current_tree != NULL);
72006+ assert("nikita-1194", old_root != NULL);
72007+ assert("nikita-1196", znode_is_root(old_root));
72008+ assert("nikita-1200", node_num_items(old_root) == 1);
72009+ assert("nikita-1401", znode_is_write_locked(old_root));
72010+
72011+ coord_init_first_unit(&down_link, old_root);
72012+
72013+ tree = znode_get_tree(old_root);
72014+ new_root = child_znode(&down_link, old_root, 0, 1);
72015+ if (!IS_ERR(new_root)) {
72016+ result =
72017+ reiser4_kill_root(tree, old_root, new_root,
72018+ znode_get_block(new_root));
72019+ zput(new_root);
72020+ } else
72021+ result = PTR_ERR(new_root);
72022+
72023+ return result;
72024+}
72025+
72026+/* Make Linus happy.
72027+ Local variables:
72028+ c-indentation-style: "K&R"
72029+ mode-name: "LC"
72030+ c-basic-offset: 8
72031+ tab-width: 8
72032+ fill-column: 120
72033+ scroll-step: 1
72034+ End:
72035+*/
72036diff --git a/fs/reiser4/tree_mod.h b/fs/reiser4/tree_mod.h
72037new file mode 100644
72038index 0000000..1519641
72039--- /dev/null
72040+++ b/fs/reiser4/tree_mod.h
72041@@ -0,0 +1,29 @@
72042+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72043+ * reiser4/README */
72044+
72045+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
72046+ * comments. */
72047+
72048+#if !defined( __REISER4_TREE_MOD_H__ )
72049+#define __REISER4_TREE_MOD_H__
72050+
72051+#include "forward.h"
72052+
72053+znode *reiser4_new_node(znode * brother, tree_level level);
72054+znode *reiser4_add_tree_root(znode * old_root, znode * fake);
72055+int reiser4_kill_tree_root(znode * old_root);
72056+void build_child_ptr_data(znode * child, reiser4_item_data * data);
72057+
72058+/* __REISER4_TREE_MOD_H__ */
72059+#endif
72060+
72061+/* Make Linus happy.
72062+ Local variables:
72063+ c-indentation-style: "K&R"
72064+ mode-name: "LC"
72065+ c-basic-offset: 8
72066+ tab-width: 8
72067+ fill-column: 120
72068+ scroll-step: 1
72069+ End:
72070+*/
72071diff --git a/fs/reiser4/tree_walk.c b/fs/reiser4/tree_walk.c
72072new file mode 100644
72073index 0000000..cde4875
72074--- /dev/null
72075+++ b/fs/reiser4/tree_walk.c
72076@@ -0,0 +1,927 @@
72077+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72078+ * reiser4/README */
72079+
72080+/* Routines and macros to:
72081+
72082+ get_left_neighbor()
72083+
72084+ get_right_neighbor()
72085+
72086+ get_parent()
72087+
72088+ get_first_child()
72089+
72090+ get_last_child()
72091+
72092+ various routines to walk the whole tree and do things to it like
72093+ repack it, or move it to tertiary storage. Please make them as
72094+ generic as is reasonable.
72095+
72096+*/
72097+
72098+#include "forward.h"
72099+#include "debug.h"
72100+#include "dformat.h"
72101+#include "coord.h"
72102+#include "plugin/item/item.h"
72103+#include "jnode.h"
72104+#include "znode.h"
72105+#include "tree_walk.h"
72106+#include "tree.h"
72107+#include "super.h"
72108+
72109+/* These macros are used internally in tree_walk.c in attempt to make
72110+ lock_neighbor() code usable to build lock_parent(), lock_right_neighbor,
72111+ lock_left_neighbor */
72112+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
72113+#define FIELD_OFFSET(name) offsetof(znode, name)
72114+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
72115+#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
72116+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
72117+
72118+/* This is the generic procedure to get and lock `generic' neighbor (left or
72119+ right neighbor or parent). It implements common algorithm for all cases of
72120+ getting lock on neighbor node, only znode structure field is different in
72121+ each case. This is parameterized by ptr_offset argument, which is byte
72122+ offset for the pointer to the desired neighbor within the current node's
72123+ znode structure. This function should be called with the tree lock held */
72124+static int lock_neighbor(
72125+ /* resulting lock handle */
72126+ lock_handle * result,
72127+ /* znode to lock */
72128+ znode * node,
72129+ /* pointer to neighbor (or parent) znode field offset, in bytes from
72130+ the base address of znode structure */
72131+ int ptr_offset,
72132+ /* lock mode for longterm_lock_znode call */
72133+ znode_lock_mode mode,
72134+ /* lock request for longterm_lock_znode call */
72135+ znode_lock_request req,
72136+ /* GN_* flags */
72137+ int flags, int rlocked)
72138+{
72139+ reiser4_tree *tree = znode_get_tree(node);
72140+ znode *neighbor;
72141+ int ret;
72142+
72143+ assert("umka-236", node != NULL);
72144+ assert("umka-237", tree != NULL);
72145+ assert_rw_locked(&(tree->tree_lock));
72146+
72147+ if (flags & GN_TRY_LOCK)
72148+ req |= ZNODE_LOCK_NONBLOCK;
72149+ if (flags & GN_SAME_ATOM)
72150+ req |= ZNODE_LOCK_DONT_FUSE;
72151+
72152+ /* get neighbor's address by using of sibling link, quit while loop
72153+ (and return) if link is not available. */
72154+ while (1) {
72155+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
72156+
72157+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
72158+ * node pointed by it is not connected.
72159+ *
72160+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
72161+ * check and allows passing reference to not connected znode to
72162+ * subsequent longterm_lock_znode() call. This kills possible
72163+ * busy loop if we are trying to get longterm lock on locked but
72164+ * not yet connected parent node. */
72165+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
72166+ || znode_is_connected(neighbor))) {
72167+ return RETERR(-E_NO_NEIGHBOR);
72168+ }
72169+
72170+ /* protect it from deletion. */
72171+ zref(neighbor);
72172+
72173+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
72174+
72175+ ret = longterm_lock_znode(result, neighbor, mode, req);
72176+
72177+ /* The lock handle obtains its own reference, release the one from above. */
72178+ zput(neighbor);
72179+
72180+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
72181+
72182+ /* restart if node we got reference to is being
72183+ invalidated. we should not get reference to this node
72184+ again. */
72185+ if (ret == -EINVAL)
72186+ continue;
72187+ if (ret)
72188+ return ret;
72189+
72190+ /* check if neighbor link still points to just locked znode;
72191+ the link could have been changed while the process slept. */
72192+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
72193+ return 0;
72194+
72195+ /* znode was locked by mistake; unlock it and restart locking
72196+ process from beginning. */
72197+ rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
72198+ longterm_unlock_znode(result);
72199+ rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
72200+ }
72201+}
72202+
72203+/* get parent node with longterm lock, accepts GN* flags. */
72204+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
72205+ znode * node /* child node */ ,
72206+ znode_lock_mode mode
72207+ /* type of lock: read or write */ ,
72208+ int flags /* GN_* flags */ )
72209+{
72210+ int result;
72211+
72212+ read_lock_tree(znode_get_tree(node));
72213+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
72214+ ZNODE_LOCK_HIPRI, flags, 1);
72215+ read_unlock_tree(znode_get_tree(node));
72216+ return result;
72217+}
72218+
72219+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
72220+ bit in @flags parameter */
72221+/* Audited by: umka (2002.06.14) */
72222+static inline int
72223+lock_side_neighbor(lock_handle * result,
72224+ znode * node, znode_lock_mode mode, int flags, int rlocked)
72225+{
72226+ int ret;
72227+ int ptr_offset;
72228+ znode_lock_request req;
72229+
72230+ if (flags & GN_GO_LEFT) {
72231+ ptr_offset = LEFT_PTR_OFFSET;
72232+ req = ZNODE_LOCK_LOPRI;
72233+ } else {
72234+ ptr_offset = RIGHT_PTR_OFFSET;
72235+ req = ZNODE_LOCK_HIPRI;
72236+ }
72237+
72238+ ret =
72239+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
72240+
72241+ if (ret == -E_NO_NEIGHBOR) /* if we walk left or right -E_NO_NEIGHBOR does not
72242+ * guarantee that neighbor is absent in the
72243+ * tree; in this case we return -ENOENT --
72244+ * means neighbor at least not found in
72245+ * cache */
72246+ return RETERR(-ENOENT);
72247+
72248+ return ret;
72249+}
72250+
72251+#if REISER4_DEBUG
72252+
72253+int check_sibling_list(znode * node)
72254+{
72255+ znode *scan;
72256+ znode *next;
72257+
72258+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
72259+
72260+ if (node == NULL)
72261+ return 1;
72262+
72263+ if (ZF_ISSET(node, JNODE_RIP))
72264+ return 1;
72265+
72266+ assert("nikita-3270", node != NULL);
72267+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
72268+
72269+ for (scan = node; znode_is_left_connected(scan); scan = next) {
72270+ next = scan->left;
72271+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
72272+ assert("nikita-3271", znode_is_right_connected(next));
72273+ assert("nikita-3272", next->right == scan);
72274+ } else
72275+ break;
72276+ }
72277+ for (scan = node; znode_is_right_connected(scan); scan = next) {
72278+ next = scan->right;
72279+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
72280+ assert("nikita-3273", znode_is_left_connected(next));
72281+ assert("nikita-3274", next->left == scan);
72282+ } else
72283+ break;
72284+ }
72285+ return 1;
72286+}
72287+
72288+#endif
72289+
72290+/* Znode sibling pointers maintenence. */
72291+
72292+/* Znode sibling pointers are established between any neighbored nodes which are
72293+ in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
72294+ JNODE_RIGHT_CONNECTED), if left or right sibling pointer contains actual
72295+ value (even NULL), corresponded JNODE_*_CONNECTED bit is set.
72296+
72297+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
72298+ take care about searching (hash table lookup may be required) of znode
72299+ neighbors, establishing sibling pointers between them and setting
72300+ JNODE_*_CONNECTED state bits. */
72301+
72302+/* adjusting of sibling pointers and `connected' states for two
72303+ neighbors; works if one neighbor is NULL (was not found). */
72304+
72305+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
72306+void link_left_and_right(znode * left, znode * right)
72307+{
72308+ assert("nikita-3275", check_sibling_list(left));
72309+ assert("nikita-3275", check_sibling_list(right));
72310+
72311+ if (left != NULL) {
72312+ if (left->right == NULL) {
72313+ left->right = right;
72314+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
72315+
72316+ ON_DEBUG(left->right_version =
72317+ atomic_inc_return(&delim_key_version);
72318+ );
72319+
72320+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
72321+ && left->right != right) {
72322+
72323+ ON_DEBUG(left->right->left_version =
72324+ atomic_inc_return(&delim_key_version);
72325+ left->right_version =
72326+ atomic_inc_return(&delim_key_version););
72327+
72328+ left->right->left = NULL;
72329+ left->right = right;
72330+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
72331+ } else
72332+ /*
72333+ * there is a race condition in renew_sibling_link()
72334+ * and assertions below check that it is only one
72335+ * there. Thread T1 calls renew_sibling_link() without
72336+ * GN_NO_ALLOC flag. zlook() doesn't find neighbor
72337+ * node, but before T1 gets to the
72338+ * link_left_and_right(), another thread T2 creates
72339+ * neighbor node and connects it. check for
72340+ * left->right == NULL above protects T1 from
72341+ * overwriting correct left->right pointer installed
72342+ * by T2.
72343+ */
72344+ assert("nikita-3302",
72345+ right == NULL || left->right == right);
72346+ }
72347+ if (right != NULL) {
72348+ if (right->left == NULL) {
72349+ right->left = left;
72350+ ZF_SET(right, JNODE_LEFT_CONNECTED);
72351+
72352+ ON_DEBUG(right->left_version =
72353+ atomic_inc_return(&delim_key_version);
72354+ );
72355+
72356+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
72357+ && right->left != left) {
72358+
72359+ ON_DEBUG(right->left->right_version =
72360+ atomic_inc_return(&delim_key_version);
72361+ right->left_version =
72362+ atomic_inc_return(&delim_key_version););
72363+
72364+ right->left->right = NULL;
72365+ right->left = left;
72366+ ZF_SET(right, JNODE_LEFT_CONNECTED);
72367+
72368+ } else
72369+ assert("nikita-3303",
72370+ left == NULL || right->left == left);
72371+ }
72372+ assert("nikita-3275", check_sibling_list(left));
72373+ assert("nikita-3275", check_sibling_list(right));
72374+}
72375+
72376+/* Audited by: umka (2002.06.14) */
72377+static void link_znodes(znode * first, znode * second, int to_left)
72378+{
72379+ if (to_left)
72380+ link_left_and_right(second, first);
72381+ else
72382+ link_left_and_right(first, second);
72383+}
72384+
72385+/* getting of next (to left or to right, depend on gn_to_left bit in flags)
72386+ coord's unit position in horizontal direction, even across node
72387+ boundary. Should be called under tree lock, it protects nonexistence of
72388+ sibling link on parent level, if lock_side_neighbor() fails with
72389+ -ENOENT. */
72390+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
72391+{
72392+ int ret;
72393+ znode *node;
72394+ reiser4_tree *tree;
72395+
72396+ assert("umka-243", coord != NULL);
72397+ assert("umka-244", handle != NULL);
72398+ assert("zam-1069", handle->node == NULL);
72399+
72400+ ret =
72401+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
72402+ coord_next_unit(coord);
72403+ if (!ret)
72404+ return 0;
72405+
72406+ ret =
72407+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
72408+ if (ret)
72409+ return ret;
72410+
72411+ node = handle->node;
72412+ tree = znode_get_tree(node);
72413+ write_unlock_tree(tree);
72414+
72415+ coord_init_zero(coord);
72416+
72417+ /* We avoid synchronous read here if it is specified by flag. */
72418+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
72419+ ret = jstartio(ZJNODE(handle->node));
72420+ if (!ret)
72421+ ret = -E_REPEAT;
72422+ goto error_locked;
72423+ }
72424+
72425+ /* corresponded zrelse() should be called by the clients of
72426+ far_next_coord(), in place when this node gets unlocked. */
72427+ ret = zload(handle->node);
72428+ if (ret)
72429+ goto error_locked;
72430+
72431+ if (flags & GN_GO_LEFT)
72432+ coord_init_last_unit(coord, node);
72433+ else
72434+ coord_init_first_unit(coord, node);
72435+
72436+ if (0) {
72437+ error_locked:
72438+ longterm_unlock_znode(handle);
72439+ }
72440+ write_lock_tree(tree);
72441+ return ret;
72442+}
72443+
72444+/* Very significant function which performs a step in horizontal direction
72445+ when sibling pointer is not available. Actually, it is only function which
72446+ does it.
72447+ Note: this function does not restore locking status at exit,
72448+ caller should does care about proper unlocking and zrelsing */
72449+static int
72450+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
72451+ tree_level level, int flags, int *nr_locked)
72452+{
72453+ int ret;
72454+ int to_left = flags & GN_GO_LEFT;
72455+ reiser4_block_nr da;
72456+ /* parent of the neighbor node; we set it to parent until not sharing
72457+ of one parent between child and neighbor node is detected */
72458+ znode *side_parent = coord->node;
72459+ reiser4_tree *tree = znode_get_tree(child);
72460+ znode *neighbor = NULL;
72461+
72462+ assert("umka-245", coord != NULL);
72463+ assert("umka-246", handle != NULL);
72464+ assert("umka-247", child != NULL);
72465+ assert("umka-303", tree != NULL);
72466+
72467+ init_lh(handle);
72468+ write_lock_tree(tree);
72469+ ret = far_next_coord(coord, handle, flags);
72470+
72471+ if (ret) {
72472+ if (ret != -ENOENT) {
72473+ write_unlock_tree(tree);
72474+ return ret;
72475+ }
72476+ } else {
72477+ item_plugin *iplug;
72478+
72479+ if (handle->node != NULL) {
72480+ (*nr_locked)++;
72481+ side_parent = handle->node;
72482+ }
72483+
72484+ /* does coord object points to internal item? We do not
72485+ support sibling pointers between znode for formatted and
72486+ unformatted nodes and return -E_NO_NEIGHBOR in that case. */
72487+ iplug = item_plugin_by_coord(coord);
72488+ if (!item_is_internal(coord)) {
72489+ link_znodes(child, NULL, to_left);
72490+ write_unlock_tree(tree);
72491+ /* we know there can't be formatted neighbor */
72492+ return RETERR(-E_NO_NEIGHBOR);
72493+ }
72494+ write_unlock_tree(tree);
72495+
72496+ iplug->s.internal.down_link(coord, NULL, &da);
72497+
72498+ if (flags & GN_NO_ALLOC) {
72499+ neighbor = zlook(tree, &da);
72500+ } else {
72501+ neighbor =
72502+ zget(tree, &da, side_parent, level,
72503+ reiser4_ctx_gfp_mask_get());
72504+ }
72505+
72506+ if (IS_ERR(neighbor)) {
72507+ ret = PTR_ERR(neighbor);
72508+ return ret;
72509+ }
72510+
72511+ if (neighbor)
72512+ /* update delimiting keys */
72513+ set_child_delimiting_keys(coord->node, coord, neighbor);
72514+
72515+ write_lock_tree(tree);
72516+ }
72517+
72518+ if (likely(neighbor == NULL ||
72519+ (znode_get_level(child) == znode_get_level(neighbor)
72520+ && child != neighbor)))
72521+ link_znodes(child, neighbor, to_left);
72522+ else {
72523+ warning("nikita-3532",
72524+ "Sibling nodes on the different levels: %i != %i\n",
72525+ znode_get_level(child), znode_get_level(neighbor));
72526+ ret = RETERR(-EIO);
72527+ }
72528+
72529+ write_unlock_tree(tree);
72530+
72531+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
72532+ if (neighbor != NULL && (flags & GN_NO_ALLOC))
72533+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */
72534+ zput(neighbor);
72535+
72536+ return ret;
72537+}
72538+
72539+/* This function is for establishing of one side relation. */
72540+/* Audited by: umka (2002.06.14) */
72541+static int connect_one_side(coord_t * coord, znode * node, int flags)
72542+{
72543+ coord_t local;
72544+ lock_handle handle;
72545+ int nr_locked;
72546+ int ret;
72547+
72548+ assert("umka-248", coord != NULL);
72549+ assert("umka-249", node != NULL);
72550+
72551+ coord_dup_nocheck(&local, coord);
72552+
72553+ init_lh(&handle);
72554+
72555+ ret =
72556+ renew_sibling_link(&local, &handle, node, znode_get_level(node),
72557+ flags | GN_NO_ALLOC, &nr_locked);
72558+
72559+ if (handle.node != NULL) {
72560+ /* complementary operations for zload() and lock() in far_next_coord() */
72561+ zrelse(handle.node);
72562+ longterm_unlock_znode(&handle);
72563+ }
72564+
72565+ /* we catch error codes which are not interesting for us because we
72566+ run renew_sibling_link() only for znode connection. */
72567+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
72568+ return 0;
72569+
72570+ return ret;
72571+}
72572+
72573+/* if @child is not in `connected' state, performs hash searches for left and
72574+ right neighbor nodes and establishes horizontal sibling links */
72575+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72576+int connect_znode(coord_t * parent_coord, znode * child)
72577+{
72578+ reiser4_tree *tree = znode_get_tree(child);
72579+ int ret = 0;
72580+
72581+ assert("zam-330", parent_coord != NULL);
72582+ assert("zam-331", child != NULL);
72583+ assert("zam-332", parent_coord->node != NULL);
72584+ assert("umka-305", tree != NULL);
72585+
72586+ /* it is trivial to `connect' root znode because it can't have
72587+ neighbors */
72588+ if (znode_above_root(parent_coord->node)) {
72589+ child->left = NULL;
72590+ child->right = NULL;
72591+ ZF_SET(child, JNODE_LEFT_CONNECTED);
72592+ ZF_SET(child, JNODE_RIGHT_CONNECTED);
72593+
72594+ ON_DEBUG(child->left_version =
72595+ atomic_inc_return(&delim_key_version);
72596+ child->right_version =
72597+ atomic_inc_return(&delim_key_version););
72598+
72599+ return 0;
72600+ }
72601+
72602+ /* load parent node */
72603+ coord_clear_iplug(parent_coord);
72604+ ret = zload(parent_coord->node);
72605+
72606+ if (ret != 0)
72607+ return ret;
72608+
72609+ /* protect `connected' state check by tree_lock */
72610+ read_lock_tree(tree);
72611+
72612+ if (!znode_is_right_connected(child)) {
72613+ read_unlock_tree(tree);
72614+ /* connect right (default is right) */
72615+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
72616+ if (ret)
72617+ goto zrelse_and_ret;
72618+
72619+ read_lock_tree(tree);
72620+ }
72621+
72622+ ret = znode_is_left_connected(child);
72623+
72624+ read_unlock_tree(tree);
72625+
72626+ if (!ret) {
72627+ ret =
72628+ connect_one_side(parent_coord, child,
72629+ GN_NO_ALLOC | GN_GO_LEFT);
72630+ } else
72631+ ret = 0;
72632+
72633+ zrelse_and_ret:
72634+ zrelse(parent_coord->node);
72635+
72636+ return ret;
72637+}
72638+
72639+/* this function is like renew_sibling_link() but allocates neighbor node if
72640+ it doesn't exist and `connects' it. It may require making two steps in
72641+ horizontal direction, first one for neighbor node finding/allocation,
72642+ second one is for finding neighbor of neighbor to connect freshly allocated
72643+ znode. */
72644+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72645+static int
72646+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
72647+{
72648+ coord_t local;
72649+ lock_handle empty[2];
72650+ reiser4_tree *tree = znode_get_tree(node);
72651+ znode *neighbor = NULL;
72652+ int nr_locked = 0;
72653+ int ret;
72654+
72655+ assert("umka-250", coord != NULL);
72656+ assert("umka-251", node != NULL);
72657+ assert("umka-307", tree != NULL);
72658+ assert("umka-308", level <= tree->height);
72659+
72660+ /* umka (2002.06.14)
72661+ Here probably should be a check for given "level" validness.
72662+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
72663+ */
72664+
72665+ coord_dup(&local, coord);
72666+
72667+ ret =
72668+ renew_sibling_link(&local, &empty[0], node, level,
72669+ flags & ~GN_NO_ALLOC, &nr_locked);
72670+ if (ret)
72671+ goto out;
72672+
72673+ /* tree lock is not needed here because we keep parent node(s) locked
72674+ and reference to neighbor znode incremented */
72675+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
72676+
72677+ read_lock_tree(tree);
72678+ ret = znode_is_connected(neighbor);
72679+ read_unlock_tree(tree);
72680+ if (ret) {
72681+ ret = 0;
72682+ goto out;
72683+ }
72684+
72685+ ret =
72686+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
72687+ flags | GN_NO_ALLOC, &nr_locked);
72688+ /* second renew_sibling_link() call is used for znode connection only,
72689+ so we can live with these errors */
72690+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
72691+ ret = 0;
72692+
72693+ out:
72694+
72695+ for (--nr_locked; nr_locked >= 0; --nr_locked) {
72696+ zrelse(empty[nr_locked].node);
72697+ longterm_unlock_znode(&empty[nr_locked]);
72698+ }
72699+
72700+ if (neighbor != NULL)
72701+ /* decrement znode reference counter without actually
72702+ releasing it. */
72703+ atomic_dec(&ZJNODE(neighbor)->x_count);
72704+
72705+ return ret;
72706+}
72707+
72708+/*
72709+ reiser4_get_neighbor() -- lock node's neighbor.
72710+
72711+ reiser4_get_neighbor() locks node's neighbor (left or right one, depends on
72712+ given parameter) using sibling link to it. If sibling link is not available
72713+ (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one
72714+ level up for information about neighbor's disk address. We lock node's
72715+ parent, if it is common parent for both 'node' and its neighbor, neighbor's
72716+ disk address is in next (to left or to right) down link from link that points
72717+ to original node. If not, we need to lock parent's neighbor, read its content
72718+ and take first(last) downlink with neighbor's disk address. That locking
72719+ could be done by using sibling link and lock_neighbor() function, if sibling
72720+ link exists. In another case we have to go level up again until we find
72721+ common parent or valid sibling link. Then go down
72722+ allocating/connecting/locking/reading nodes until neighbor of first one is
72723+ locked.
72724+
72725+ @neighbor: result lock handle,
72726+ @node: a node which we lock neighbor of,
72727+ @lock_mode: lock mode {LM_READ, LM_WRITE},
72728+ @flags: logical OR of {GN_*} (see description above) subset.
72729+
72730+ @return: 0 if success, negative value if lock was impossible due to an error
72731+ or lack of neighbor node.
72732+*/
72733+
72734+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72735+int
72736+reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72737+ znode_lock_mode lock_mode, int flags)
72738+{
72739+ reiser4_tree *tree = znode_get_tree(node);
72740+ lock_handle path[REAL_MAX_ZTREE_HEIGHT];
72741+
72742+ coord_t coord;
72743+
72744+ tree_level base_level;
72745+ tree_level h = 0;
72746+ int ret;
72747+
72748+ assert("umka-252", tree != NULL);
72749+ assert("umka-253", neighbor != NULL);
72750+ assert("umka-254", node != NULL);
72751+
72752+ base_level = znode_get_level(node);
72753+
72754+ assert("umka-310", base_level <= tree->height);
72755+
72756+ coord_init_zero(&coord);
72757+
72758+ again:
72759+ /* first, we try to use simple lock_neighbor() which requires sibling
72760+ link existence */
72761+ read_lock_tree(tree);
72762+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
72763+ read_unlock_tree(tree);
72764+ if (!ret) {
72765+ /* load znode content if it was specified */
72766+ if (flags & GN_LOAD_NEIGHBOR) {
72767+ ret = zload(node);
72768+ if (ret)
72769+ longterm_unlock_znode(neighbor);
72770+ }
72771+ return ret;
72772+ }
72773+
72774+ /* only -ENOENT means we may look upward and try to connect
72775+ @node with its neighbor (if @flags allow us to do it) */
72776+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
72777+ return ret;
72778+
72779+ /* before establishing of sibling link we lock parent node; it is
72780+ required by renew_neighbor() to work. */
72781+ init_lh(&path[0]);
72782+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72783+ if (ret)
72784+ return ret;
72785+ if (znode_above_root(path[0].node)) {
72786+ longterm_unlock_znode(&path[0]);
72787+ return RETERR(-E_NO_NEIGHBOR);
72788+ }
72789+
72790+ while (1) {
72791+ znode *child = (h == 0) ? node : path[h - 1].node;
72792+ znode *parent = path[h].node;
72793+
72794+ ret = zload(parent);
72795+ if (ret)
72796+ break;
72797+
72798+ ret = find_child_ptr(parent, child, &coord);
72799+
72800+ if (ret) {
72801+ zrelse(parent);
72802+ break;
72803+ }
72804+
72805+ /* try to establish missing sibling link */
72806+ ret = renew_neighbor(&coord, child, h + base_level, flags);
72807+
72808+ zrelse(parent);
72809+
72810+ switch (ret) {
72811+ case 0:
72812+ /* unlocking of parent znode prevents simple
72813+ deadlock situation */
72814+ done_lh(&path[h]);
72815+
72816+ /* depend on tree level we stay on we repeat first
72817+ locking attempt ... */
72818+ if (h == 0)
72819+ goto again;
72820+
72821+ /* ... or repeat establishing of sibling link at
72822+ one level below. */
72823+ --h;
72824+ break;
72825+
72826+ case -ENOENT:
72827+ /* sibling link is not available -- we go
72828+ upward. */
72829+ init_lh(&path[h + 1]);
72830+ ret =
72831+ reiser4_get_parent(&path[h + 1], parent,
72832+ ZNODE_READ_LOCK);
72833+ if (ret)
72834+ goto fail;
72835+ ++h;
72836+ if (znode_above_root(path[h].node)) {
72837+ ret = RETERR(-E_NO_NEIGHBOR);
72838+ goto fail;
72839+ }
72840+ break;
72841+
72842+ case -E_DEADLOCK:
72843+ /* there was lock request from hi-pri locker. if
72844+ it is possible we unlock last parent node and
72845+ re-lock it again. */
72846+ for (; reiser4_check_deadlock(); h--) {
72847+ done_lh(&path[h]);
72848+ if (h == 0)
72849+ goto fail;
72850+ }
72851+
72852+ break;
72853+
72854+ default: /* other errors. */
72855+ goto fail;
72856+ }
72857+ }
72858+ fail:
72859+ ON_DEBUG(check_lock_node_data(node));
72860+ ON_DEBUG(check_lock_data());
72861+
72862+ /* unlock path */
72863+ do {
72864+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72865+ fail; path[0] is already done_lh-ed, therefore
72866+ longterm_unlock_znode(&path[h]); is not applicable */
72867+ done_lh(&path[h]);
72868+ --h;
72869+ } while (h + 1 != 0);
72870+
72871+ return ret;
72872+}
72873+
72874+/* remove node from sibling list */
72875+/* Audited by: umka (2002.06.14) */
72876+void sibling_list_remove(znode * node)
72877+{
72878+ reiser4_tree *tree;
72879+
72880+ tree = znode_get_tree(node);
72881+ assert("umka-255", node != NULL);
72882+ assert_rw_write_locked(&(tree->tree_lock));
72883+ assert("nikita-3275", check_sibling_list(node));
72884+
72885+ write_lock_dk(tree);
72886+ if (znode_is_right_connected(node) && node->right != NULL &&
72887+ znode_is_left_connected(node) && node->left != NULL) {
72888+ assert("zam-32245",
72889+ keyeq(znode_get_rd_key(node),
72890+ znode_get_ld_key(node->right)));
72891+ znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72892+ }
72893+ write_unlock_dk(tree);
72894+
72895+ if (znode_is_right_connected(node) && node->right != NULL) {
72896+ assert("zam-322", znode_is_left_connected(node->right));
72897+ node->right->left = node->left;
72898+ ON_DEBUG(node->right->left_version =
72899+ atomic_inc_return(&delim_key_version);
72900+ );
72901+ }
72902+ if (znode_is_left_connected(node) && node->left != NULL) {
72903+ assert("zam-323", znode_is_right_connected(node->left));
72904+ node->left->right = node->right;
72905+ ON_DEBUG(node->left->right_version =
72906+ atomic_inc_return(&delim_key_version);
72907+ );
72908+ }
72909+
72910+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72911+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72912+ ON_DEBUG(node->left = node->right = NULL;
72913+ node->left_version = atomic_inc_return(&delim_key_version);
72914+ node->right_version = atomic_inc_return(&delim_key_version););
72915+ assert("nikita-3276", check_sibling_list(node));
72916+}
72917+
72918+/* disconnect node from sibling list */
72919+void sibling_list_drop(znode * node)
72920+{
72921+ znode *right;
72922+ znode *left;
72923+
72924+ assert("nikita-2464", node != NULL);
72925+ assert("nikita-3277", check_sibling_list(node));
72926+
72927+ right = node->right;
72928+ if (right != NULL) {
72929+ assert("nikita-2465", znode_is_left_connected(right));
72930+ right->left = NULL;
72931+ ON_DEBUG(right->left_version =
72932+ atomic_inc_return(&delim_key_version);
72933+ );
72934+ }
72935+ left = node->left;
72936+ if (left != NULL) {
72937+ assert("zam-323", znode_is_right_connected(left));
72938+ left->right = NULL;
72939+ ON_DEBUG(left->right_version =
72940+ atomic_inc_return(&delim_key_version);
72941+ );
72942+ }
72943+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
72944+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72945+ ON_DEBUG(node->left = node->right = NULL;
72946+ node->left_version = atomic_inc_return(&delim_key_version);
72947+ node->right_version = atomic_inc_return(&delim_key_version););
72948+}
72949+
72950+/* Insert new node into sibling list. Regular balancing inserts new node
72951+ after (at right side) existing and locked node (@before), except one case
72952+ of adding new tree root node. @before should be NULL in that case. */
72953+void sibling_list_insert_nolock(znode * new, znode * before)
72954+{
72955+ assert("zam-334", new != NULL);
72956+ assert("nikita-3298", !znode_is_left_connected(new));
72957+ assert("nikita-3299", !znode_is_right_connected(new));
72958+ assert("nikita-3300", new->left == NULL);
72959+ assert("nikita-3301", new->right == NULL);
72960+ assert("nikita-3278", check_sibling_list(new));
72961+ assert("nikita-3279", check_sibling_list(before));
72962+
72963+ if (before != NULL) {
72964+ assert("zam-333", znode_is_connected(before));
72965+ new->right = before->right;
72966+ new->left = before;
72967+ ON_DEBUG(new->right_version =
72968+ atomic_inc_return(&delim_key_version);
72969+ new->left_version =
72970+ atomic_inc_return(&delim_key_version););
72971+ if (before->right != NULL) {
72972+ before->right->left = new;
72973+ ON_DEBUG(before->right->left_version =
72974+ atomic_inc_return(&delim_key_version);
72975+ );
72976+ }
72977+ before->right = new;
72978+ ON_DEBUG(before->right_version =
72979+ atomic_inc_return(&delim_key_version);
72980+ );
72981+ } else {
72982+ new->right = NULL;
72983+ new->left = NULL;
72984+ ON_DEBUG(new->right_version =
72985+ atomic_inc_return(&delim_key_version);
72986+ new->left_version =
72987+ atomic_inc_return(&delim_key_version););
72988+ }
72989+ ZF_SET(new, JNODE_LEFT_CONNECTED);
72990+ ZF_SET(new, JNODE_RIGHT_CONNECTED);
72991+ assert("nikita-3280", check_sibling_list(new));
72992+ assert("nikita-3281", check_sibling_list(before));
72993+}
72994+
72995+/*
72996+ Local variables:
72997+ c-indentation-style: "K&R"
72998+ mode-name: "LC"
72999+ c-basic-offset: 8
73000+ tab-width: 8
73001+ fill-column: 80
73002+ End:
73003+*/
73004diff --git a/fs/reiser4/tree_walk.h b/fs/reiser4/tree_walk.h
73005new file mode 100644
73006index 0000000..3d5f09f
73007--- /dev/null
73008+++ b/fs/reiser4/tree_walk.h
73009@@ -0,0 +1,125 @@
73010+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
73011+
73012+/* definitions of reiser4 tree walk functions */
73013+
73014+#ifndef __FS_REISER4_TREE_WALK_H__
73015+#define __FS_REISER4_TREE_WALK_H__
73016+
73017+#include "debug.h"
73018+#include "forward.h"
73019+
73020+/* establishes horizontal links between cached znodes */
73021+int connect_znode(coord_t * coord, znode * node);
73022+
73023+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
73024+ have the following common arguments:
73025+
73026+ return codes:
73027+
73028+ @return : 0 - OK,
73029+
73030+ZAM-FIXME-HANS: wrong return code name. Change them all.
73031+ -ENOENT - neighbor is not in cache, what is detected by sibling
73032+ link absence.
73033+
73034+ -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
73035+ found (because we are left-/right- most node of the
73036+ tree, for example). Also, this return code is for
73037+ reiser4_get_parent() when we see no parent link -- it
73038+ means that our node is root node.
73039+
73040+ -E_DEADLOCK - deadlock detected (request from high-priority process
73041+ received), other error codes are conformed to
73042+ /usr/include/asm/errno.h .
73043+*/
73044+
73045+int
73046+reiser4_get_parent_flags(lock_handle * result, znode * node,
73047+ znode_lock_mode mode, int flags);
73048+
73049+/* bits definition for reiser4_get_neighbor function `flags' arg. */
73050+typedef enum {
73051+ /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
73052+ * find not allocated not connected neigbor by going though upper
73053+ * levels */
73054+ GN_CAN_USE_UPPER_LEVELS = 0x1,
73055+ /* locking left neighbor instead of right one */
73056+ GN_GO_LEFT = 0x2,
73057+ /* automatically load neighbor node content */
73058+ GN_LOAD_NEIGHBOR = 0x4,
73059+ /* return -E_REPEAT if can't lock */
73060+ GN_TRY_LOCK = 0x8,
73061+ /* used internally in tree_walk.c, causes renew_sibling to not
73062+ allocate neighbor znode, but only search for it in znode cache */
73063+ GN_NO_ALLOC = 0x10,
73064+ /* do not go across atom boundaries */
73065+ GN_SAME_ATOM = 0x20,
73066+ /* allow to lock not connected nodes */
73067+ GN_ALLOW_NOT_CONNECTED = 0x40,
73068+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
73069+ GN_ASYNC = 0x80
73070+} znode_get_neigbor_flags;
73071+
73072+/* A commonly used wrapper for reiser4_get_parent_flags(). */
73073+static inline int reiser4_get_parent(lock_handle * result, znode * node,
73074+ znode_lock_mode mode)
73075+{
73076+ return reiser4_get_parent_flags(result, node, mode,
73077+ GN_ALLOW_NOT_CONNECTED);
73078+}
73079+
73080+int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
73081+ znode_lock_mode lock_mode, int flags);
73082+
73083+/* there are wrappers for most common usages of reiser4_get_neighbor() */
73084+static inline int
73085+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
73086+ int flags)
73087+{
73088+ return reiser4_get_neighbor(result, node, lock_mode,
73089+ flags | GN_GO_LEFT);
73090+}
73091+
73092+static inline int
73093+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
73094+ int flags)
73095+{
73096+ ON_DEBUG(check_lock_node_data(node));
73097+ ON_DEBUG(check_lock_data());
73098+ return reiser4_get_neighbor(result, node, lock_mode,
73099+ flags & (~GN_GO_LEFT));
73100+}
73101+
73102+extern void sibling_list_remove(znode * node);
73103+extern void sibling_list_drop(znode * node);
73104+extern void sibling_list_insert_nolock(znode * new, znode * before);
73105+extern void link_left_and_right(znode * left, znode * right);
73106+
73107+/* Functions called by tree_walk() when tree_walk() ... */
73108+struct tree_walk_actor {
73109+ /* ... meets a formatted node, */
73110+ int (*process_znode) (tap_t *, void *);
73111+ /* ... meets an extent, */
73112+ int (*process_extent) (tap_t *, void *);
73113+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
73114+ * node or extent processing functions. */
73115+ int (*before) (void *);
73116+};
73117+
73118+#if REISER4_DEBUG
73119+int check_sibling_list(znode * node);
73120+#else
73121+#define check_sibling_list(n) (1)
73122+#endif
73123+
73124+#endif /* __FS_REISER4_TREE_WALK_H__ */
73125+
73126+/*
73127+ Local variables:
73128+ c-indentation-style: "K&R"
73129+ mode-name: "LC"
73130+ c-basic-offset: 8
73131+ tab-width: 8
73132+ fill-column: 120
73133+ End:
73134+*/
73135diff --git a/fs/reiser4/txnmgr.c b/fs/reiser4/txnmgr.c
73136new file mode 100644
73137index 0000000..72d525b
73138--- /dev/null
73139+++ b/fs/reiser4/txnmgr.c
73140@@ -0,0 +1,3164 @@
73141+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73142+ * reiser4/README */
73143+
73144+/* Joshua MacDonald wrote the first draft of this code. */
73145+
73146+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
73147+filesystem scales only as well as its worst locking design. You need to
73148+substantially restructure this code. Josh was not as experienced a programmer
73149+as you. Particularly review how the locking style differs from what you did
73150+for znodes usingt hi-lo priority locking, and present to me an opinion on
73151+whether the differences are well founded. */
73152+
73153+/* I cannot help but to disagree with the sentiment above. Locking of
73154+ * transaction manager is _not_ badly designed, and, at the very least, is not
73155+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
73156+ * locking on znodes, especially on the root node of the tree. --nikita,
73157+ * 2003.10.13 */
73158+
73159+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
73160+ txnmgr processes capture_block requests and manages the relationship between jnodes and
73161+ atoms through the various stages of a transcrash, and it also oversees the fusion and
73162+ capture-on-copy processes. The main difficulty with this task is maintaining a
73163+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
73164+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
73165+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you
73166+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
73167+ that any time you check the atom-pointer of a jnode or handle and then try to lock that
73168+ atom, you must use trylock() and possibly reverse the order.
73169+
73170+ This code implements the design documented at:
73171+
73172+ http://namesys.com/txn-doc.html
73173+
73174+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
73175+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
73176+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
73177+year old --- define all technical terms used.
73178+
73179+*/
73180+
73181+/* Thoughts on the external transaction interface:
73182+
73183+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
73184+ creates state that lasts for the duration of a system call and is called at the start
73185+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
73186+ occupying the scope of a single system call. We wish to give certain applications an
73187+ interface to begin and close (commit) transactions. Since our implementation of
73188+ transactions does not yet support isolation, allowing an application to open a
73189+ transaction implies trusting it to later close the transaction. Part of the
73190+ transaction interface will be aimed at enabling that trust, but the interface for
73191+ actually using transactions is fairly narrow.
73192+
73193+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
73194+ this identifier into a string that a shell-script could use, allowing you to start a
73195+ transaction by issuing a command. Once open, the transcrash should be set in the task
73196+ structure, and there should be options (I suppose) to allow it to be carried across
73197+ fork/exec. A transcrash has several options:
73198+
73199+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
73200+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
73201+ capture on reads as well, it should set READ_FUSING.
73202+
73203+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
73204+ eventually close (or else the machine must crash). If the application dies an
73205+ unexpected death with an open transcrash, for example, or if it hangs for a long
73206+ duration, one solution (to avoid crashing the machine) is to simply close it anyway.
73207+ This is a dangerous option, but it is one way to solve the problem until isolated
73208+ transcrashes are available for untrusted applications.
73209+
73210+ It seems to be what databases do, though it is unclear how one avoids a DoS attack
73211+ creating a vulnerability based on resource starvation. Guaranteeing that some
73212+ minimum amount of computational resources are made available would seem more correct
73213+ than guaranteeing some amount of time. When we again have someone to code the work,
73214+ this issue should be considered carefully. -Hans
73215+
73216+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
73217+ many dirty blocks it expects. The reserve_blocks interface should be called at a point
73218+ where it is safe for the application to fail, because the system may not be able to
73219+ grant the allocation and the application must be able to back-out. For this reason,
73220+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
73221+ the application may also wish to extend the allocation after beginning its transcrash.
73222+
73223+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
73224+ modifications that require transaction protection. When isolated transactions are
73225+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
73226+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling
73227+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
73228+ why, for safety, the application should call RESERVE_BLOCKS before making any changes).
73229+
73230+ For actually implementing these out-of-system-call-scopped transcrashes, the
73231+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open
73232+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
73233+ "struct kmem_cache *_txnh_slab" created for that purpose in this file.
73234+*/
73235+
73236+/* Extending the other system call interfaces for future transaction features:
73237+
73238+ Specialized applications may benefit from passing flags to the ordinary system call
73239+ interface such as read(), write(), or stat(). For example, the application specifies
73240+ WRITE_FUSING by default but wishes to add that a certain read() command should be
73241+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
73242+ read, or the file-data read? These issues are straight-forward, but there are a lot of
73243+ them and adding the necessary flags-passing code will be tedious.
73244+
73245+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
73246+ flag, which specifies that although it is a read operation being requested, a
73247+ write-lock should be taken. The reason is that read-locks are shared while write-locks
73248+ are exclusive, so taking a read-lock when a later-write is known in advance will often
73249+ leads to deadlock. If a reader knows it will write later, it should issue read
73250+ requests with the RMW flag set.
73251+*/
73252+
73253+/*
73254+ The znode/atom deadlock avoidance.
73255+
73256+ FIXME(Zam): writing of this comment is in progress.
73257+
73258+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's
73259+ long-term locking, which makes reiser4 locking scheme more complex. It had
73260+ deadlocks until we implement deadlock avoidance algorithms. That deadlocks
73261+ looked as the following: one stopped thread waits for a long-term lock on
73262+ znode, the thread who owns that lock waits when fusion with another atom will
73263+ be allowed.
73264+
73265+ The source of the deadlocks is an optimization of not capturing index nodes
73266+ for read. Let's prove it. Suppose we have dumb node capturing scheme which
73267+ unconditionally captures each block before locking it.
73268+
73269+ That scheme has no deadlocks. Let's begin with the thread which stage is
73270+ ASTAGE_CAPTURE_WAIT and it waits for a znode lock. The thread can't wait for
73271+ a capture because it's stage allows fusion with any atom except which are
73272+ being committed currently. A process of atom commit can't deadlock because
73273+ atom commit procedure does not acquire locks and does not fuse with other
73274+ atoms. Reiser4 does capturing right before going to sleep inside the
73275+ longtertm_lock_znode() function, it means the znode which we want to lock is
73276+ already captured and its atom is in ASTAGE_CAPTURE_WAIT stage. If we
73277+ continue the analysis we understand that no one process in the sequence may
73278+ waits atom fusion. Thereby there are no deadlocks of described kind.
73279+
73280+ The capturing optimization makes the deadlocks possible. A thread can wait a
73281+ lock which owner did not captured that node. The lock owner's current atom
73282+ is not fused with the first atom and it does not get a ASTAGE_CAPTURE_WAIT
73283+ state. A deadlock is possible when that atom meets another one which is in
73284+ ASTAGE_CAPTURE_WAIT already.
73285+
73286+ The deadlock avoidance scheme includes two algorithms:
73287+
73288+ First algorithm is used when a thread captures a node which is locked but not
73289+ captured by another thread. Those nodes are marked MISSED_IN_CAPTURE at the
73290+ moment we skip their capturing. If such a node (marked MISSED_IN_CAPTURE) is
73291+ being captured by a thread with current atom is in ASTAGE_CAPTURE_WAIT, the
73292+ routine which forces all lock owners to join with current atom is executed.
73293+
73294+ Second algorithm does not allow to skip capturing of already captured nodes.
73295+
73296+ Both algorithms together prevent waiting a longterm lock without atom fusion
73297+ with atoms of all lock owners, which is a key thing for getting atom/znode
73298+ locking deadlocks.
73299+*/
73300+
73301+/*
73302+ * Transactions and mmap(2).
73303+ *
73304+ * 1. Transactions are not supported for accesses through mmap(2), because
73305+ * this would effectively amount to user-level transactions whose duration
73306+ * is beyond control of the kernel.
73307+ *
73308+ * 2. That said, we still want to preserve some decency with regard to
73309+ * mmap(2). During normal write(2) call, following sequence of events
73310+ * happens:
73311+ *
73312+ * 1. page is created;
73313+ *
73314+ * 2. jnode is created, dirtied and captured into current atom.
73315+ *
73316+ * 3. extent is inserted and modified.
73317+ *
73318+ * Steps (2) and (3) take place under long term lock on the twig node.
73319+ *
73320+ * When file is accessed through mmap(2) page is always created during
73321+ * page fault.
73322+ * After this (in reiser4_readpage()->reiser4_readpage_extent()):
73323+ *
73324+ * 1. if access is made to non-hole page new jnode is created, (if
73325+ * necessary)
73326+ *
73327+ * 2. if access is made to the hole page, jnode is not created (XXX
73328+ * not clear why).
73329+ *
73330+ * Also, even if page is created by write page fault it is not marked
73331+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races
73332+ * with page write-out.
73333+ *
73334+ * Dirty bit installed by hardware is only transferred to the struct page
73335+ * later, when page is unmapped (in zap_pte_range(), or
73336+ * try_to_unmap_one()).
73337+ *
73338+ * So, with mmap(2) we have to handle following irksome situations:
73339+ *
73340+ * 1. there exists modified page (clean or dirty) without jnode
73341+ *
73342+ * 2. there exists modified page (clean or dirty) with clean jnode
73343+ *
73344+ * 3. clean page which is a part of atom can be transparently modified
73345+ * at any moment through mapping without becoming dirty.
73346+ *
73347+ * (1) and (2) can lead to the out-of-memory situation: ->writepage()
73348+ * doesn't know what to do with such pages and ->sync_sb()/->writepages()
73349+ * don't see them, because these methods operate on atoms.
73350+ *
73351+ * (3) can lead to the loss of data: suppose we have dirty page with dirty
73352+ * captured jnode captured by some atom. As part of early flush (for
73353+ * example) page was written out. Dirty bit was cleared on both page and
73354+ * jnode. After this page is modified through mapping, but kernel doesn't
73355+ * notice and just discards page and jnode as part of commit. (XXX
73356+ * actually it doesn't, because to reclaim page ->releasepage() has to be
73357+ * called and before this dirty bit will be transferred to the struct
73358+ * page).
73359+ *
73360+ */
73361+
73362+#include "debug.h"
73363+#include "txnmgr.h"
73364+#include "jnode.h"
73365+#include "znode.h"
73366+#include "block_alloc.h"
73367+#include "tree.h"
73368+#include "wander.h"
73369+#include "ktxnmgrd.h"
73370+#include "super.h"
73371+#include "page_cache.h"
73372+#include "reiser4.h"
73373+#include "vfs_ops.h"
73374+#include "inode.h"
73375+#include "flush.h"
73376+
73377+#include <asm/atomic.h>
73378+#include <linux/types.h>
73379+#include <linux/fs.h>
73380+#include <linux/mm.h>
73381+#include <linux/slab.h>
73382+#include <linux/pagemap.h>
73383+#include <linux/writeback.h>
73384+#include <linux/swap.h> /* for totalram_pages */
73385+
73386+static void atom_free(txn_atom * atom);
73387+
73388+static int commit_txnh(txn_handle * txnh);
73389+
73390+static void wakeup_atom_waitfor_list(txn_atom * atom);
73391+static void wakeup_atom_waiting_list(txn_atom * atom);
73392+
73393+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
73394+
73395+static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
73396+
73397+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
73398+
73399+static int capture_init_fusion(jnode * node, txn_handle * txnh,
73400+ txn_capture mode);
73401+
73402+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
73403+
73404+static void capture_fuse_into(txn_atom * small, txn_atom * large);
73405+
73406+void reiser4_invalidate_list(struct list_head *);
73407+
73408+/* GENERIC STRUCTURES */
73409+
73410+typedef struct _txn_wait_links txn_wait_links;
73411+
73412+struct _txn_wait_links {
73413+ lock_stack *_lock_stack;
73414+ struct list_head _fwaitfor_link;
73415+ struct list_head _fwaiting_link;
73416+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
73417+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
73418+};
73419+
73420+/* FIXME: In theory, we should be using the slab cache init & destructor
73421+ methods instead of, e.g., jnode_init, etc. */
73422+static struct kmem_cache *_atom_slab = NULL;
73423+/* this is for user-visible, cross system-call transactions. */
73424+static struct kmem_cache *_txnh_slab = NULL;
73425+
73426+/**
73427+ * init_txnmgr_static - create transaction manager slab caches
73428+ *
73429+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
73430+ * initialization.
73431+ */
73432+int init_txnmgr_static(void)
73433+{
73434+ assert("jmacd-600", _atom_slab == NULL);
73435+ assert("jmacd-601", _txnh_slab == NULL);
73436+
73437+ ON_DEBUG(atomic_set(&flush_cnt, 0));
73438+
73439+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
73440+ SLAB_HWCACHE_ALIGN |
73441+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
73442+ if (_atom_slab == NULL)
73443+ return RETERR(-ENOMEM);
73444+
73445+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
73446+ SLAB_HWCACHE_ALIGN, NULL, NULL);
73447+ if (_txnh_slab == NULL) {
73448+ kmem_cache_destroy(_atom_slab);
73449+ _atom_slab = NULL;
73450+ return RETERR(-ENOMEM);
73451+ }
73452+
73453+ return 0;
73454+}
73455+
73456+/**
73457+ * done_txnmgr_static - delete txn_atom and txn_handle caches
73458+ *
73459+ * This is called on reiser4 module unloading or system shutdown.
73460+ */
73461+void done_txnmgr_static(void)
73462+{
73463+ destroy_reiser4_cache(&_atom_slab);
73464+ destroy_reiser4_cache(&_txnh_slab);
73465+}
73466+
73467+/**
73468+ * init_txnmgr - initialize a new transaction manager
73469+ * @mgr: pointer to transaction manager embedded in reiser4 super block
73470+ *
73471+ * This is called on mount. Makes necessary initializations.
73472+ */
73473+void reiser4_init_txnmgr(txn_mgr *mgr)
73474+{
73475+ assert("umka-169", mgr != NULL);
73476+
73477+ mgr->atom_count = 0;
73478+ mgr->id_count = 1;
73479+ INIT_LIST_HEAD(&mgr->atoms_list);
73480+ spin_lock_init(&mgr->tmgr_lock);
73481+ mutex_init(&mgr->commit_mutex);
73482+}
73483+
73484+/**
73485+ * reiser4_done_txnmgr - stop transaction manager
73486+ * @mgr: pointer to transaction manager embedded in reiser4 super block
73487+ *
73488+ * This is called on umount. Does sanity checks.
73489+ */
73490+void reiser4_done_txnmgr(txn_mgr *mgr)
73491+{
73492+ assert("umka-170", mgr != NULL);
73493+ assert("umka-1701", list_empty_careful(&mgr->atoms_list));
73494+ assert("umka-1702", mgr->atom_count == 0);
73495+}
73496+
73497+/* Initialize a transaction handle. */
73498+/* Audited by: umka (2002.06.13) */
73499+static void txnh_init(txn_handle * txnh, txn_mode mode)
73500+{
73501+ assert("umka-171", txnh != NULL);
73502+
73503+ txnh->mode = mode;
73504+ txnh->atom = NULL;
73505+ reiser4_ctx_gfp_mask_set();
73506+ txnh->flags = 0;
73507+ spin_lock_init(&txnh->hlock);
73508+ INIT_LIST_HEAD(&txnh->txnh_link);
73509+}
73510+
73511+#if REISER4_DEBUG
73512+/* Check if a transaction handle is clean. */
73513+static int txnh_isclean(txn_handle * txnh)
73514+{
73515+ assert("umka-172", txnh != NULL);
73516+ return txnh->atom == NULL &&
73517+ LOCK_CNT_NIL(spin_locked_txnh);
73518+}
73519+#endif
73520+
73521+/* Initialize an atom. */
73522+static void atom_init(txn_atom * atom)
73523+{
73524+ int level;
73525+
73526+ assert("umka-173", atom != NULL);
73527+
73528+ memset(atom, 0, sizeof(txn_atom));
73529+
73530+ atom->stage = ASTAGE_FREE;
73531+ atom->start_time = jiffies;
73532+
73533+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
73534+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
73535+
73536+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
73537+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
73538+ INIT_LIST_HEAD(ATOM_WB_LIST(atom));
73539+ INIT_LIST_HEAD(&atom->inodes);
73540+ spin_lock_init(&(atom->alock));
73541+ /* list of transaction handles */
73542+ INIT_LIST_HEAD(&atom->txnh_list);
73543+ /* link to transaction manager's list of atoms */
73544+ INIT_LIST_HEAD(&atom->atom_link);
73545+ INIT_LIST_HEAD(&atom->fwaitfor_list);
73546+ INIT_LIST_HEAD(&atom->fwaiting_list);
73547+ blocknr_set_init(&atom->delete_set);
73548+ blocknr_set_init(&atom->wandered_map);
73549+
73550+ init_atom_fq_parts(atom);
73551+}
73552+
73553+#if REISER4_DEBUG
73554+/* Check if an atom is clean. */
73555+static int atom_isclean(txn_atom * atom)
73556+{
73557+ int level;
73558+
73559+ assert("umka-174", atom != NULL);
73560+
73561+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73562+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
73563+ return 0;
73564+ }
73565+ }
73566+
73567+ return atom->stage == ASTAGE_FREE &&
73568+ atom->txnh_count == 0 &&
73569+ atom->capture_count == 0 &&
73570+ atomic_read(&atom->refcount) == 0 &&
73571+ (&atom->atom_link == atom->atom_link.next &&
73572+ &atom->atom_link == atom->atom_link.prev) &&
73573+ list_empty_careful(&atom->txnh_list) &&
73574+ list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
73575+ list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
73576+ list_empty_careful(ATOM_WB_LIST(atom)) &&
73577+ list_empty_careful(&atom->fwaitfor_list) &&
73578+ list_empty_careful(&atom->fwaiting_list) &&
73579+ atom_fq_parts_are_clean(atom);
73580+}
73581+#endif
73582+
73583+/* Begin a transaction in this context. Currently this uses the reiser4_context's
73584+ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
73585+ this will be extended to allow transaction handles to span several contexts. */
73586+/* Audited by: umka (2002.06.13) */
73587+void reiser4_txn_begin(reiser4_context * context)
73588+{
73589+ assert("jmacd-544", context->trans == NULL);
73590+
73591+ context->trans = &context->trans_in_ctx;
73592+
73593+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
73594+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
73595+ stack allocated right now, but we would like to allow for dynamically allocated
73596+ transcrashes that span multiple system calls.
73597+ */
73598+ txnh_init(context->trans, TXN_WRITE_FUSING);
73599+}
73600+
73601+/* Finish a transaction handle context. */
73602+int reiser4_txn_end(reiser4_context * context)
73603+{
73604+ long ret = 0;
73605+ txn_handle *txnh;
73606+
73607+ assert("umka-283", context != NULL);
73608+ assert("nikita-3012", reiser4_schedulable());
73609+ assert("vs-24", context == get_current_context());
73610+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
73611+
73612+ txnh = context->trans;
73613+ if (txnh != NULL) {
73614+ if (txnh->atom != NULL)
73615+ ret = commit_txnh(txnh);
73616+ assert("jmacd-633", txnh_isclean(txnh));
73617+ context->trans = NULL;
73618+ }
73619+ return ret;
73620+}
73621+
73622+void reiser4_txn_restart(reiser4_context * context)
73623+{
73624+ reiser4_txn_end(context);
73625+ reiser4_preempt_point();
73626+ reiser4_txn_begin(context);
73627+}
73628+
73629+void reiser4_txn_restart_current(void)
73630+{
73631+ reiser4_txn_restart(get_current_context());
73632+}
73633+
73634+/* TXN_ATOM */
73635+
73636+/* Get the atom belonging to a txnh, which is not locked. Return txnh locked. Locks atom, if atom
73637+ is not NULL. This performs the necessary spin_trylock to break the lock-ordering cycle. May
73638+ return NULL. */
73639+static txn_atom *txnh_get_atom(txn_handle * txnh)
73640+{
73641+ txn_atom *atom;
73642+
73643+ assert("umka-180", txnh != NULL);
73644+ assert_spin_not_locked(&(txnh->hlock));
73645+
73646+ while (1) {
73647+ spin_lock_txnh(txnh);
73648+ atom = txnh->atom;
73649+
73650+ if (atom == NULL)
73651+ break;
73652+
73653+ if (spin_trylock_atom(atom))
73654+ break;
73655+
73656+ atomic_inc(&atom->refcount);
73657+
73658+ spin_unlock_txnh(txnh);
73659+ spin_lock_atom(atom);
73660+ spin_lock_txnh(txnh);
73661+
73662+ if (txnh->atom == atom) {
73663+ atomic_dec(&atom->refcount);
73664+ break;
73665+ }
73666+
73667+ spin_unlock_txnh(txnh);
73668+ atom_dec_and_unlock(atom);
73669+ }
73670+
73671+ return atom;
73672+}
73673+
73674+/* Get the current atom and spinlock it if current atom present. May return NULL */
73675+txn_atom *get_current_atom_locked_nocheck(void)
73676+{
73677+ reiser4_context *cx;
73678+ txn_atom *atom;
73679+ txn_handle *txnh;
73680+
73681+ cx = get_current_context();
73682+ assert("zam-437", cx != NULL);
73683+
73684+ txnh = cx->trans;
73685+ assert("zam-435", txnh != NULL);
73686+
73687+ atom = txnh_get_atom(txnh);
73688+
73689+ spin_unlock_txnh(txnh);
73690+ return atom;
73691+}
73692+
73693+/* Get the atom belonging to a jnode, which is initially locked. Return with
73694+ both jnode and atom locked. This performs the necessary spin_trylock to
73695+ break the lock-ordering cycle. Assumes the jnode is already locked, and
73696+ returns NULL if atom is not set. */
73697+txn_atom *jnode_get_atom(jnode * node)
73698+{
73699+ txn_atom *atom;
73700+
73701+ assert("umka-181", node != NULL);
73702+
73703+ while (1) {
73704+ assert_spin_locked(&(node->guard));
73705+
73706+ atom = node->atom;
73707+ /* node is not in any atom */
73708+ if (atom == NULL)
73709+ break;
73710+
73711+ /* If atom is not locked, grab the lock and return */
73712+ if (spin_trylock_atom(atom))
73713+ break;
73714+
73715+ /* At least one jnode belongs to this atom it guarantees that
73716+ * atom->refcount > 0, we can safely increment refcount. */
73717+ atomic_inc(&atom->refcount);
73718+ spin_unlock_jnode(node);
73719+
73720+ /* re-acquire spin locks in the right order */
73721+ spin_lock_atom(atom);
73722+ spin_lock_jnode(node);
73723+
73724+ /* check if node still points to the same atom. */
73725+ if (node->atom == atom) {
73726+ atomic_dec(&atom->refcount);
73727+ break;
73728+ }
73729+
73730+ /* releasing of atom lock and reference requires not holding
73731+ * locks on jnodes. */
73732+ spin_unlock_jnode(node);
73733+
73734+ /* We do not sure that this atom has extra references except our
73735+ * one, so we should call proper function which may free atom if
73736+ * last reference is released. */
73737+ atom_dec_and_unlock(atom);
73738+
73739+ /* lock jnode again for getting valid node->atom pointer
73740+ * value. */
73741+ spin_lock_jnode(node);
73742+ }
73743+
73744+ return atom;
73745+}
73746+
73747+/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
73748+ by flush code to indicate whether the next node (in some direction) is suitable for
73749+ flushing. */
73750+int
73751+same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
73752+{
73753+ int compat;
73754+ txn_atom *atom;
73755+
73756+ assert("umka-182", node != NULL);
73757+ assert("umka-183", check != NULL);
73758+
73759+ /* Not sure what this function is supposed to do if supplied with @check that is
73760+ neither formatted nor unformatted (bitmap or so). */
73761+ assert("nikita-2373", jnode_is_znode(check)
73762+ || jnode_is_unformatted(check));
73763+
73764+ /* Need a lock on CHECK to get its atom and to check various state bits.
73765+ Don't need a lock on NODE once we get the atom lock. */
73766+ /* It is not enough to lock two nodes and check (node->atom ==
73767+ check->atom) because atom could be locked and being fused at that
73768+ moment, jnodes of the atom of that state (being fused) can point to
73769+ different objects, but the atom is the same. */
73770+ spin_lock_jnode(check);
73771+
73772+ atom = jnode_get_atom(check);
73773+
73774+ if (atom == NULL) {
73775+ compat = 0;
73776+ } else {
73777+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
73778+
73779+ if (compat && jnode_is_znode(check)) {
73780+ compat &= znode_is_connected(JZNODE(check));
73781+ }
73782+
73783+ if (compat && alloc_check) {
73784+ compat &= (alloc_value == jnode_is_flushprepped(check));
73785+ }
73786+
73787+ spin_unlock_atom(atom);
73788+ }
73789+
73790+ spin_unlock_jnode(check);
73791+
73792+ return compat;
73793+}
73794+
73795+/* Decrement the atom's reference count and if it falls to zero, free it. */
73796+void atom_dec_and_unlock(txn_atom * atom)
73797+{
73798+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73799+
73800+ assert("umka-186", atom != NULL);
73801+ assert_spin_locked(&(atom->alock));
73802+ assert("zam-1039", atomic_read(&atom->refcount) > 0);
73803+
73804+ if (atomic_dec_and_test(&atom->refcount)) {
73805+ /* take txnmgr lock and atom lock in proper order. */
73806+ if (!spin_trylock_txnmgr(mgr)) {
73807+ /* This atom should exist after we re-acquire its
73808+ * spinlock, so we increment its reference counter. */
73809+ atomic_inc(&atom->refcount);
73810+ spin_unlock_atom(atom);
73811+ spin_lock_txnmgr(mgr);
73812+ spin_lock_atom(atom);
73813+
73814+ if (!atomic_dec_and_test(&atom->refcount)) {
73815+ spin_unlock_atom(atom);
73816+ spin_unlock_txnmgr(mgr);
73817+ return;
73818+ }
73819+ }
73820+ assert_spin_locked(&(mgr->tmgr_lock));
73821+ atom_free(atom);
73822+ spin_unlock_txnmgr(mgr);
73823+ } else
73824+ spin_unlock_atom(atom);
73825+}
73826+
73827+/* Create new atom and connect it to given transaction handle. This adds the
73828+ atom to the transaction manager's list and sets its reference count to 1, an
73829+ artificial reference which is kept until it commits. We play strange games
73830+ to avoid allocation under jnode & txnh spinlocks.*/
73831+
73832+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73833+{
73834+ txn_atom *atom;
73835+ txn_mgr *mgr;
73836+
73837+ if (REISER4_DEBUG && rofs_tree(current_tree)) {
73838+ warning("nikita-3366", "Creating atom on rofs");
73839+ dump_stack();
73840+ }
73841+
73842+ if (*atom_alloc == NULL) {
73843+ (*atom_alloc) = kmem_cache_alloc(_atom_slab,
73844+ reiser4_ctx_gfp_mask_get());
73845+
73846+ if (*atom_alloc == NULL)
73847+ return RETERR(-ENOMEM);
73848+ }
73849+
73850+ /* and, also, txnmgr spin lock should be taken before jnode and txnh
73851+ locks. */
73852+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73853+ spin_lock_txnmgr(mgr);
73854+ spin_lock_txnh(txnh);
73855+
73856+ /* Check whether new atom still needed */
73857+ if (txnh->atom != NULL) {
73858+ /* NOTE-NIKITA probably it is rather better to free
73859+ * atom_alloc here than thread it up to reiser4_try_capture() */
73860+
73861+ spin_unlock_txnh(txnh);
73862+ spin_unlock_txnmgr(mgr);
73863+
73864+ return -E_REPEAT;
73865+ }
73866+
73867+ atom = *atom_alloc;
73868+ *atom_alloc = NULL;
73869+
73870+ atom_init(atom);
73871+
73872+ assert("jmacd-17", atom_isclean(atom));
73873+
73874+ /*
73875+ * lock ordering is broken here. It is ok, as long as @atom is new
73876+ * and inaccessible for others. We can't use spin_lock_atom or
73877+ * spin_lock(&atom->alock) because they care about locking
73878+ * dependencies. spin_trylock_lock doesn't.
73879+ */
73880+ check_me("", spin_trylock_atom(atom));
73881+
73882+ /* add atom to the end of transaction manager's list of atoms */
73883+ list_add_tail(&atom->atom_link, &mgr->atoms_list);
73884+ atom->atom_id = mgr->id_count++;
73885+ mgr->atom_count += 1;
73886+
73887+ /* Release txnmgr lock */
73888+ spin_unlock_txnmgr(mgr);
73889+
73890+ /* One reference until it commits. */
73891+ atomic_inc(&atom->refcount);
73892+ atom->stage = ASTAGE_CAPTURE_FUSE;
73893+ atom->super = reiser4_get_current_sb();
73894+ capture_assign_txnh_nolock(atom, txnh);
73895+
73896+ spin_unlock_atom(atom);
73897+ spin_unlock_txnh(txnh);
73898+
73899+ return -E_REPEAT;
73900+}
73901+
73902+/* Return true if an atom is currently "open". */
73903+static int atom_isopen(const txn_atom * atom)
73904+{
73905+ assert("umka-185", atom != NULL);
73906+
73907+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73908+}
73909+
73910+/* Return the number of pointers to this atom that must be updated during fusion. This
73911+ approximates the amount of work to be done. Fusion chooses the atom with fewer
73912+ pointers to fuse into the atom with more pointers. */
73913+static int atom_pointer_count(const txn_atom * atom)
73914+{
73915+ assert("umka-187", atom != NULL);
73916+
73917+ /* This is a measure of the amount of work needed to fuse this atom
73918+ * into another. */
73919+ return atom->txnh_count + atom->capture_count;
73920+}
73921+
73922+/* Called holding the atom lock, this removes the atom from the transaction manager list
73923+ and frees it. */
73924+static void atom_free(txn_atom * atom)
73925+{
73926+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73927+
73928+ assert("umka-188", atom != NULL);
73929+ assert_spin_locked(&(atom->alock));
73930+
73931+ /* Remove from the txn_mgr's atom list */
73932+ assert_spin_locked(&(mgr->tmgr_lock));
73933+ mgr->atom_count -= 1;
73934+ list_del_init(&atom->atom_link);
73935+
73936+ /* Clean the atom */
73937+ assert("jmacd-16",
73938+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73939+ atom->stage = ASTAGE_FREE;
73940+
73941+ blocknr_set_destroy(&atom->delete_set);
73942+ blocknr_set_destroy(&atom->wandered_map);
73943+
73944+ assert("jmacd-16", atom_isclean(atom));
73945+
73946+ spin_unlock_atom(atom);
73947+
73948+ kmem_cache_free(_atom_slab, atom);
73949+}
73950+
73951+static int atom_is_dotard(const txn_atom * atom)
73952+{
73953+ return time_after(jiffies, atom->start_time +
73954+ get_current_super_private()->tmgr.atom_max_age);
73955+}
73956+
73957+static int atom_can_be_committed(txn_atom * atom)
73958+{
73959+ assert_spin_locked(&(atom->alock));
73960+ assert("zam-885", atom->txnh_count > atom->nr_waiters);
73961+ return atom->txnh_count == atom->nr_waiters + 1;
73962+}
73963+
73964+/* Return true if an atom should commit now. This is determined by aging, atom
73965+ size or atom flags. */
73966+static int atom_should_commit(const txn_atom * atom)
73967+{
73968+ assert("umka-189", atom != NULL);
73969+ return
73970+ (atom->flags & ATOM_FORCE_COMMIT) ||
73971+ ((unsigned)atom_pointer_count(atom) >
73972+ get_current_super_private()->tmgr.atom_max_size)
73973+ || atom_is_dotard(atom);
73974+}
73975+
73976+/* return 1 if current atom exists and requires commit. */
73977+int current_atom_should_commit(void)
73978+{
73979+ txn_atom *atom;
73980+ int result = 0;
73981+
73982+ atom = get_current_atom_locked_nocheck();
73983+ if (atom) {
73984+ result = atom_should_commit(atom);
73985+ spin_unlock_atom(atom);
73986+ }
73987+ return result;
73988+}
73989+
73990+static int atom_should_commit_asap(const txn_atom * atom)
73991+{
73992+ unsigned int captured;
73993+ unsigned int pinnedpages;
73994+
73995+ assert("nikita-3309", atom != NULL);
73996+
73997+ captured = (unsigned)atom->capture_count;
73998+ pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73999+
74000+ return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
74001+}
74002+
74003+static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
74004+{
74005+ jnode *first_dirty;
74006+
74007+ list_for_each_entry(first_dirty, head, capture_link) {
74008+ if (!(flags & JNODE_FLUSH_COMMIT)) {
74009+ /*
74010+ * skip jnodes which "heard banshee" or having active
74011+ * I/O
74012+ */
74013+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
74014+ JF_ISSET(first_dirty, JNODE_WRITEBACK))
74015+ continue;
74016+ }
74017+ return first_dirty;
74018+ }
74019+ return NULL;
74020+}
74021+
74022+/* Get first dirty node from the atom's dirty_nodes[n] lists; return NULL if atom has no dirty
74023+ nodes on atom's lists */
74024+jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
74025+{
74026+ jnode *first_dirty;
74027+ tree_level level;
74028+
74029+ assert_spin_locked(&(atom->alock));
74030+
74031+ /* The flush starts from LEAF_LEVEL (=1). */
74032+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
74033+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
74034+ continue;
74035+
74036+ first_dirty =
74037+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
74038+ flags);
74039+ if (first_dirty)
74040+ return first_dirty;
74041+ }
74042+
74043+ /* znode-above-root is on the list #0. */
74044+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
74045+}
74046+
74047+static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
74048+{
74049+ jnode *cur;
74050+
74051+ assert("zam-905", atom_is_protected(atom));
74052+
74053+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
74054+ while (ATOM_WB_LIST(atom) != &cur->capture_link) {
74055+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
74056+
74057+ spin_lock_jnode(cur);
74058+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
74059+ if (JF_ISSET(cur, JNODE_DIRTY)) {
74060+ queue_jnode(fq, cur);
74061+ } else {
74062+ /* move jnode to atom's clean list */
74063+ list_move_tail(&cur->capture_link,
74064+ ATOM_CLEAN_LIST(atom));
74065+ }
74066+ }
74067+ spin_unlock_jnode(cur);
74068+
74069+ cur = next;
74070+ }
74071+}
74072+
74073+/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
74074+ * jnodes to disk. */
74075+static int submit_wb_list(void)
74076+{
74077+ int ret;
74078+ flush_queue_t *fq;
74079+
74080+ fq = get_fq_for_current_atom();
74081+ if (IS_ERR(fq))
74082+ return PTR_ERR(fq);
74083+
74084+ dispatch_wb_list(fq->atom, fq);
74085+ spin_unlock_atom(fq->atom);
74086+
74087+ ret = reiser4_write_fq(fq, NULL, 1);
74088+ reiser4_fq_put(fq);
74089+
74090+ return ret;
74091+}
74092+
74093+/* Wait completion of all writes, re-submit atom writeback list if needed. */
74094+static int current_atom_complete_writes(void)
74095+{
74096+ int ret;
74097+
74098+ /* Each jnode from that list was modified and dirtied when it had i/o
74099+ * request running already. After i/o completion we have to resubmit
74100+ * them to disk again.*/
74101+ ret = submit_wb_list();
74102+ if (ret < 0)
74103+ return ret;
74104+
74105+ /* Wait all i/o completion */
74106+ ret = current_atom_finish_all_fq();
74107+ if (ret)
74108+ return ret;
74109+
74110+ /* Scan wb list again; all i/o should be completed, we re-submit dirty
74111+ * nodes to disk */
74112+ ret = submit_wb_list();
74113+ if (ret < 0)
74114+ return ret;
74115+
74116+ /* Wait all nodes we just submitted */
74117+ return current_atom_finish_all_fq();
74118+}
74119+
74120+#if REISER4_DEBUG
74121+
74122+static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
74123+{
74124+ if (atom == NULL) {
74125+ printk("%s: no atom\n", prefix);
74126+ return;
74127+ }
74128+
74129+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
74130+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
74131+ atomic_read(&atom->refcount), atom->atom_id, atom->flags,
74132+ atom->txnh_count, atom->capture_count, atom->stage,
74133+ atom->start_time, atom->flushed);
74134+}
74135+
74136+#else /* REISER4_DEBUG */
74137+
74138+static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
74139+
74140+#endif /* REISER4_DEBUG */
74141+
74142+#define TOOMANYFLUSHES (1 << 13)
74143+
74144+/* Called with the atom locked and no open "active" transaction handlers except
74145+ ours, this function calls flush_current_atom() until all dirty nodes are
74146+ processed. Then it initiates commit processing.
74147+
74148+ Called by the single remaining open "active" txnh, which is closing. Other
74149+ open txnhs belong to processes which wait atom commit in commit_txnh()
74150+ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
74151+ long as we hold the atom lock none of the jnodes can be captured and/or
74152+ locked.
74153+
74154+ Return value is an error code if commit fails.
74155+*/
74156+static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
74157+{
74158+ reiser4_super_info_data *sbinfo = get_current_super_private();
74159+ long ret = 0;
74160+ /* how many times jnode_flush() was called as a part of attempt to
74161+ * commit this atom. */
74162+ int flushiters;
74163+
74164+ assert("zam-888", atom != NULL && *atom != NULL);
74165+ assert_spin_locked(&((*atom)->alock));
74166+ assert("zam-887", get_current_context()->trans->atom == *atom);
74167+ assert("jmacd-151", atom_isopen(*atom));
74168+
74169+ assert("nikita-3184",
74170+ get_current_super_private()->delete_mutex_owner != current);
74171+
74172+ for (flushiters = 0;; ++flushiters) {
74173+ ret =
74174+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
74175+ JNODE_FLUSH_COMMIT,
74176+ LONG_MAX /* nr_to_write */ ,
74177+ nr_submitted, atom, NULL);
74178+ if (ret != -E_REPEAT)
74179+ break;
74180+
74181+ /* if atom's dirty list contains one znode which is
74182+ HEARD_BANSHEE and is locked we have to allow lock owner to
74183+ continue and uncapture that znode */
74184+ reiser4_preempt_point();
74185+
74186+ *atom = get_current_atom_locked();
74187+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
74188+ warning("nikita-3176",
74189+ "Flushing like mad: %i", flushiters);
74190+ reiser4_info_atom("atom", *atom);
74191+ DEBUGON(flushiters > (1 << 20));
74192+ }
74193+ }
74194+
74195+ if (ret)
74196+ return ret;
74197+
74198+ assert_spin_locked(&((*atom)->alock));
74199+
74200+ if (!atom_can_be_committed(*atom)) {
74201+ spin_unlock_atom(*atom);
74202+ return RETERR(-E_REPEAT);
74203+ }
74204+
74205+ if ((*atom)->capture_count == 0)
74206+ goto done;
74207+
74208+ /* Up to this point we have been flushing and after flush is called we
74209+ return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
74210+ at this point, commit should be successful. */
74211+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
74212+ ON_DEBUG(((*atom)->committer = current));
74213+ spin_unlock_atom(*atom);
74214+
74215+ ret = current_atom_complete_writes();
74216+ if (ret)
74217+ return ret;
74218+
74219+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
74220+
74221+ /* isolate critical code path which should be executed by only one
74222+ * thread using tmgr mutex */
74223+ mutex_lock(&sbinfo->tmgr.commit_mutex);
74224+
74225+ ret = reiser4_write_logs(nr_submitted);
74226+ if (ret < 0)
74227+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
74228+
74229+ /* The atom->ovrwr_nodes list is processed under commit mutex held
74230+ because of bitmap nodes which are captured by special way in
74231+ reiser4_pre_commit_hook_bitmap(), that way does not include
74232+ capture_fuse_wait() as a capturing of other nodes does -- the commit
74233+ mutex is used for transaction isolation instead. */
74234+ reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
74235+ mutex_unlock(&sbinfo->tmgr.commit_mutex);
74236+
74237+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
74238+ reiser4_invalidate_list(ATOM_WB_LIST(*atom));
74239+ assert("zam-927", list_empty(&(*atom)->inodes));
74240+
74241+ spin_lock_atom(*atom);
74242+ done:
74243+ reiser4_atom_set_stage(*atom, ASTAGE_DONE);
74244+ ON_DEBUG((*atom)->committer = NULL);
74245+
74246+ /* Atom's state changes, so wake up everybody waiting for this
74247+ event. */
74248+ wakeup_atom_waiting_list(*atom);
74249+
74250+ /* Decrement the "until commit" reference, at least one txnh (the caller) is
74251+ still open. */
74252+ atomic_dec(&(*atom)->refcount);
74253+
74254+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
74255+ assert("jmacd-1062", (*atom)->capture_count == 0);
74256+ BUG_ON((*atom)->capture_count != 0);
74257+ assert_spin_locked(&((*atom)->alock));
74258+
74259+ return ret;
74260+}
74261+
74262+/* TXN_TXNH */
74263+
74264+/**
74265+ * force_commit_atom - commit current atom and wait commit completion
74266+ * @txnh:
74267+ *
74268+ * Commits current atom and wait commit completion; current atom and @txnh have
74269+ * to be spinlocked before call, this function unlocks them on exit.
74270+ */
74271+int force_commit_atom(txn_handle *txnh)
74272+{
74273+ txn_atom *atom;
74274+
74275+ assert("zam-837", txnh != NULL);
74276+ assert_spin_locked(&(txnh->hlock));
74277+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
74278+
74279+ atom = txnh->atom;
74280+
74281+ assert("zam-834", atom != NULL);
74282+ assert_spin_locked(&(atom->alock));
74283+
74284+ /*
74285+ * Set flags for atom and txnh: forcing atom commit and waiting for
74286+ * commit completion
74287+ */
74288+ txnh->flags |= TXNH_WAIT_COMMIT;
74289+ atom->flags |= ATOM_FORCE_COMMIT;
74290+
74291+ spin_unlock_txnh(txnh);
74292+ spin_unlock_atom(atom);
74293+
74294+ /* commit is here */
74295+ reiser4_txn_restart_current();
74296+ return 0;
74297+}
74298+
74299+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
74300+ * should we commit all atoms including new ones which are created after this
74301+ * functions is called. */
74302+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
74303+{
74304+ int ret;
74305+ txn_atom *atom;
74306+ txn_mgr *mgr;
74307+ txn_handle *txnh;
74308+ unsigned long start_time = jiffies;
74309+ reiser4_context *ctx = get_current_context();
74310+
74311+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
74312+ assert("nikita-3058", reiser4_commit_check_locks());
74313+
74314+ reiser4_txn_restart_current();
74315+
74316+ mgr = &get_super_private(super)->tmgr;
74317+
74318+ txnh = ctx->trans;
74319+
74320+ again:
74321+
74322+ spin_lock_txnmgr(mgr);
74323+
74324+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
74325+ spin_lock_atom(atom);
74326+
74327+ /* Commit any atom which can be committed. If @commit_new_atoms
74328+ * is not set we commit only atoms which were created before
74329+ * this call is started. */
74330+ if (commit_all_atoms
74331+ || time_before_eq(atom->start_time, start_time)) {
74332+ if (atom->stage <= ASTAGE_POST_COMMIT) {
74333+ spin_unlock_txnmgr(mgr);
74334+
74335+ if (atom->stage < ASTAGE_PRE_COMMIT) {
74336+ spin_lock_txnh(txnh);
74337+ /* Add force-context txnh */
74338+ capture_assign_txnh_nolock(atom, txnh);
74339+ ret = force_commit_atom(txnh);
74340+ if (ret)
74341+ return ret;
74342+ } else
74343+ /* wait atom commit */
74344+ reiser4_atom_wait_event(atom);
74345+
74346+ goto again;
74347+ }
74348+ }
74349+
74350+ spin_unlock_atom(atom);
74351+ }
74352+
74353+#if REISER4_DEBUG
74354+ if (commit_all_atoms) {
74355+ reiser4_super_info_data *sbinfo = get_super_private(super);
74356+ spin_lock_reiser4_super(sbinfo);
74357+ assert("zam-813",
74358+ sbinfo->blocks_fake_allocated_unformatted == 0);
74359+ assert("zam-812", sbinfo->blocks_fake_allocated == 0);
74360+ spin_unlock_reiser4_super(sbinfo);
74361+ }
74362+#endif
74363+
74364+ spin_unlock_txnmgr(mgr);
74365+
74366+ return 0;
74367+}
74368+
74369+/* check whether commit_some_atoms() can commit @atom. Locking is up to the
74370+ * caller */
74371+static int atom_is_committable(txn_atom * atom)
74372+{
74373+ return
74374+ atom->stage < ASTAGE_PRE_COMMIT &&
74375+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
74376+}
74377+
74378+/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
74379+ * lock at exit */
74380+int commit_some_atoms(txn_mgr * mgr)
74381+{
74382+ int ret = 0;
74383+ txn_atom *atom;
74384+ txn_handle *txnh;
74385+ reiser4_context *ctx;
74386+ struct list_head *pos, *tmp;
74387+
74388+ ctx = get_current_context();
74389+ assert("nikita-2444", ctx != NULL);
74390+
74391+ txnh = ctx->trans;
74392+ spin_lock_txnmgr(mgr);
74393+
74394+ /*
74395+ * this is to avoid gcc complain that atom might be used
74396+ * uninitialized
74397+ */
74398+ atom = NULL;
74399+
74400+ /* look for atom to commit */
74401+ list_for_each_safe(pos, tmp, &mgr->atoms_list) {
74402+ atom = list_entry(pos, txn_atom, atom_link);
74403+ /*
74404+ * first test without taking atom spin lock, whether it is
74405+ * eligible for committing at all
74406+ */
74407+ if (atom_is_committable(atom)) {
74408+ /* now, take spin lock and re-check */
74409+ spin_lock_atom(atom);
74410+ if (atom_is_committable(atom))
74411+ break;
74412+ spin_unlock_atom(atom);
74413+ }
74414+ }
74415+
74416+ ret = (&mgr->atoms_list == pos);
74417+ spin_unlock_txnmgr(mgr);
74418+
74419+ if (ret) {
74420+ /* nothing found */
74421+ spin_unlock(&mgr->daemon->guard);
74422+ return 0;
74423+ }
74424+
74425+ spin_lock_txnh(txnh);
74426+
74427+ BUG_ON(atom == NULL);
74428+ /* Set the atom to force committing */
74429+ atom->flags |= ATOM_FORCE_COMMIT;
74430+
74431+ /* Add force-context txnh */
74432+ capture_assign_txnh_nolock(atom, txnh);
74433+
74434+ spin_unlock_txnh(txnh);
74435+ spin_unlock_atom(atom);
74436+
74437+ /* we are about to release daemon spin lock, notify daemon it
74438+ has to rescan atoms */
74439+ mgr->daemon->rescan = 1;
74440+ spin_unlock(&mgr->daemon->guard);
74441+ reiser4_txn_restart_current();
74442+ return 0;
74443+}
74444+
74445+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
74446+{
74447+ int atom_stage;
74448+ txn_atom *atom_2;
74449+ int repeat;
74450+
74451+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
74452+
74453+ atom_stage = atom->stage;
74454+ repeat = 0;
74455+
74456+ if (!spin_trylock_txnmgr(tmgr)) {
74457+ atomic_inc(&atom->refcount);
74458+ spin_unlock_atom(atom);
74459+ spin_lock_txnmgr(tmgr);
74460+ spin_lock_atom(atom);
74461+ repeat = 1;
74462+ if (atom->stage != atom_stage) {
74463+ spin_unlock_txnmgr(tmgr);
74464+ atom_dec_and_unlock(atom);
74465+ return -E_REPEAT;
74466+ }
74467+ atomic_dec(&atom->refcount);
74468+ }
74469+
74470+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
74471+ if (atom == atom_2)
74472+ continue;
74473+ /*
74474+ * if trylock does not succeed we just do not fuse with that
74475+ * atom.
74476+ */
74477+ if (spin_trylock_atom(atom_2)) {
74478+ if (atom_2->stage < ASTAGE_PRE_COMMIT) {
74479+ spin_unlock_txnmgr(tmgr);
74480+ capture_fuse_into(atom_2, atom);
74481+ /* all locks are lost we can only repeat here */
74482+ return -E_REPEAT;
74483+ }
74484+ spin_unlock_atom(atom_2);
74485+ }
74486+ }
74487+ atom->flags |= ATOM_CANCEL_FUSION;
74488+ spin_unlock_txnmgr(tmgr);
74489+ if (repeat) {
74490+ spin_unlock_atom(atom);
74491+ return -E_REPEAT;
74492+ }
74493+ return 0;
74494+}
74495+
74496+/* Calls jnode_flush for current atom if it exists; if not, just take another
74497+ atom and call jnode_flush() for him. If current transaction handle has
74498+ already assigned atom (current atom) we have to close current transaction
74499+ prior to switch to another atom or do something with current atom. This
74500+ code tries to flush current atom.
74501+
74502+ flush_some_atom() is called as part of memory clearing process. It is
74503+ invoked from balance_dirty_pages(), pdflushd, and entd.
74504+
74505+ If we can flush no nodes, atom is committed, because this frees memory.
74506+
74507+ If atom is too large or too old it is committed also.
74508+*/
74509+int
74510+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
74511+ int flags)
74512+{
74513+ reiser4_context *ctx = get_current_context();
74514+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
74515+ txn_handle *txnh = ctx->trans;
74516+ txn_atom *atom;
74517+ int ret;
74518+
74519+ BUG_ON(wbc->nr_to_write == 0);
74520+ BUG_ON(*nr_submitted != 0);
74521+ assert("zam-1042", txnh != NULL);
74522+ repeat:
74523+ if (txnh->atom == NULL) {
74524+ /* current atom is not available, take first from txnmgr */
74525+ spin_lock_txnmgr(tmgr);
74526+
74527+ /* traverse the list of all atoms */
74528+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74529+ /* lock atom before checking its state */
74530+ spin_lock_atom(atom);
74531+
74532+ /*
74533+ * we need an atom which is not being committed and
74534+ * which has no flushers (jnode_flush() add one flusher
74535+ * at the beginning and subtract one at the end).
74536+ */
74537+ if (atom->stage < ASTAGE_PRE_COMMIT &&
74538+ atom->nr_flushers == 0) {
74539+ spin_lock_txnh(txnh);
74540+ capture_assign_txnh_nolock(atom, txnh);
74541+ spin_unlock_txnh(txnh);
74542+
74543+ goto found;
74544+ }
74545+
74546+ spin_unlock_atom(atom);
74547+ }
74548+
74549+ /*
74550+ * Write throttling is case of no one atom can be
74551+ * flushed/committed.
74552+ */
74553+ if (!current_is_pdflush() && !wbc->nonblocking) {
74554+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74555+ spin_lock_atom(atom);
74556+ /* Repeat the check from the above. */
74557+ if (atom->stage < ASTAGE_PRE_COMMIT
74558+ && atom->nr_flushers == 0) {
74559+ spin_lock_txnh(txnh);
74560+ capture_assign_txnh_nolock(atom, txnh);
74561+ spin_unlock_txnh(txnh);
74562+
74563+ goto found;
74564+ }
74565+ if (atom->stage <= ASTAGE_POST_COMMIT) {
74566+ spin_unlock_txnmgr(tmgr);
74567+ /*
74568+ * we just wait until atom's flusher
74569+ * makes a progress in flushing or
74570+ * committing the atom
74571+ */
74572+ reiser4_atom_wait_event(atom);
74573+ goto repeat;
74574+ }
74575+ spin_unlock_atom(atom);
74576+ }
74577+ }
74578+ spin_unlock_txnmgr(tmgr);
74579+ return 0;
74580+ found:
74581+ spin_unlock_txnmgr(tmgr);
74582+ } else
74583+ atom = get_current_atom_locked();
74584+
74585+ BUG_ON(atom->super != ctx->super);
74586+ assert("vs-35", atom->super == ctx->super);
74587+ if (start) {
74588+ spin_lock_jnode(start);
74589+ ret = (atom == start->atom) ? 1 : 0;
74590+ spin_unlock_jnode(start);
74591+ if (ret == 0)
74592+ start = NULL;
74593+ }
74594+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
74595+ if (ret == 0) {
74596+ /* flush_current_atom returns 0 only if it submitted for write
74597+ nothing */
74598+ BUG_ON(*nr_submitted != 0);
74599+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
74600+ if (atom->capture_count < tmgr->atom_min_size &&
74601+ !(atom->flags & ATOM_CANCEL_FUSION)) {
74602+ ret = txn_try_to_fuse_small_atom(tmgr, atom);
74603+ if (ret == -E_REPEAT) {
74604+ reiser4_preempt_point();
74605+ goto repeat;
74606+ }
74607+ }
74608+ /* if early flushing could not make more nodes clean,
74609+ * or atom is too old/large,
74610+ * we force current atom to commit */
74611+ /* wait for commit completion but only if this
74612+ * wouldn't stall pdflushd and ent thread. */
74613+ if (!wbc->nonblocking && !ctx->entd)
74614+ txnh->flags |= TXNH_WAIT_COMMIT;
74615+ atom->flags |= ATOM_FORCE_COMMIT;
74616+ }
74617+ spin_unlock_atom(atom);
74618+ } else if (ret == -E_REPEAT) {
74619+ if (*nr_submitted == 0) {
74620+ /* let others who hampers flushing (hold longterm locks,
74621+ for instance) to free the way for flush */
74622+ reiser4_preempt_point();
74623+ goto repeat;
74624+ }
74625+ ret = 0;
74626+ }
74627+/*
74628+ if (*nr_submitted > wbc->nr_to_write)
74629+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
74630+*/
74631+ reiser4_txn_restart(ctx);
74632+
74633+ return ret;
74634+}
74635+
74636+/* Remove processed nodes from atom's clean list (thereby remove them from transaction). */
74637+void reiser4_invalidate_list(struct list_head *head)
74638+{
74639+ while (!list_empty(head)) {
74640+ jnode *node;
74641+
74642+ node = list_entry(head->next, jnode, capture_link);
74643+ spin_lock_jnode(node);
74644+ reiser4_uncapture_block(node);
74645+ jput(node);
74646+ }
74647+}
74648+
74649+static void init_wlinks(txn_wait_links * wlinks)
74650+{
74651+ wlinks->_lock_stack = get_current_lock_stack();
74652+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
74653+ INIT_LIST_HEAD(&wlinks->_fwaiting_link);
74654+ wlinks->waitfor_cb = NULL;
74655+ wlinks->waiting_cb = NULL;
74656+}
74657+
74658+/* Add atom to the atom's waitfor list and wait for somebody to wake us up; */
74659+void reiser4_atom_wait_event(txn_atom * atom)
74660+{
74661+ txn_wait_links _wlinks;
74662+
74663+ assert_spin_locked(&(atom->alock));
74664+ assert("nikita-3156",
74665+ lock_stack_isclean(get_current_lock_stack()) ||
74666+ atom->nr_running_queues > 0);
74667+
74668+ init_wlinks(&_wlinks);
74669+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
74670+ atomic_inc(&atom->refcount);
74671+ spin_unlock_atom(atom);
74672+
74673+ reiser4_prepare_to_sleep(_wlinks._lock_stack);
74674+ reiser4_go_to_sleep(_wlinks._lock_stack);
74675+
74676+ spin_lock_atom(atom);
74677+ list_del(&_wlinks._fwaitfor_link);
74678+ atom_dec_and_unlock(atom);
74679+}
74680+
74681+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
74682+{
74683+ assert("nikita-3535", atom != NULL);
74684+ assert_spin_locked(&(atom->alock));
74685+ assert("nikita-3536", stage <= ASTAGE_INVALID);
74686+ /* Excelsior! */
74687+ assert("nikita-3537", stage >= atom->stage);
74688+ if (atom->stage != stage) {
74689+ atom->stage = stage;
74690+ reiser4_atom_send_event(atom);
74691+ }
74692+}
74693+
74694+/* wake all threads which wait for an event */
74695+void reiser4_atom_send_event(txn_atom * atom)
74696+{
74697+ assert_spin_locked(&(atom->alock));
74698+ wakeup_atom_waitfor_list(atom);
74699+}
74700+
74701+/* Informs txn manager code that owner of this txn_handle should wait atom commit completion (for
74702+ example, because it does fsync(2)) */
74703+static int should_wait_commit(txn_handle * h)
74704+{
74705+ return h->flags & TXNH_WAIT_COMMIT;
74706+}
74707+
74708+typedef struct commit_data {
74709+ txn_atom *atom;
74710+ txn_handle *txnh;
74711+ long nr_written;
74712+ /* as an optimization we start committing atom by first trying to
74713+ * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This
74714+ * allows to reduce stalls due to other threads waiting for atom in
74715+ * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these
74716+ * preliminary flushes. */
74717+ int preflush;
74718+ /* have we waited on atom. */
74719+ int wait;
74720+ int failed;
74721+ int wake_ktxnmgrd_up;
74722+} commit_data;
74723+
74724+/*
74725+ * Called from commit_txnh() repeatedly, until either error happens, or atom
74726+ * commits successfully.
74727+ */
74728+static int try_commit_txnh(commit_data * cd)
74729+{
74730+ int result;
74731+
74732+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
74733+
74734+ /* Get the atom and txnh locked. */
74735+ cd->atom = txnh_get_atom(cd->txnh);
74736+ assert("jmacd-309", cd->atom != NULL);
74737+ spin_unlock_txnh(cd->txnh);
74738+
74739+ if (cd->wait) {
74740+ cd->atom->nr_waiters--;
74741+ cd->wait = 0;
74742+ }
74743+
74744+ if (cd->atom->stage == ASTAGE_DONE)
74745+ return 0;
74746+
74747+ if (cd->failed)
74748+ return 0;
74749+
74750+ if (atom_should_commit(cd->atom)) {
74751+ /* if atom is _very_ large schedule it for commit as soon as
74752+ * possible. */
74753+ if (atom_should_commit_asap(cd->atom)) {
74754+ /*
74755+ * When atom is in PRE_COMMIT or later stage following
74756+ * invariant (encoded in atom_can_be_committed())
74757+ * holds: there is exactly one non-waiter transaction
74758+ * handle opened on this atom. When thread wants to
74759+ * wait until atom commits (for example sync()) it
74760+ * waits on atom event after increasing
74761+ * atom->nr_waiters (see blow in this function). It
74762+ * cannot be guaranteed that atom is already committed
74763+ * after receiving event, so loop has to be
74764+ * re-started. But if atom switched into PRE_COMMIT
74765+ * stage and became too large, we cannot change its
74766+ * state back to CAPTURE_WAIT (atom stage can only
74767+ * increase monotonically), hence this check.
74768+ */
74769+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
74770+ reiser4_atom_set_stage(cd->atom,
74771+ ASTAGE_CAPTURE_WAIT);
74772+ cd->atom->flags |= ATOM_FORCE_COMMIT;
74773+ }
74774+ if (cd->txnh->flags & TXNH_DONT_COMMIT) {
74775+ /*
74776+ * this thread (transaction handle that is) doesn't
74777+ * want to commit atom. Notify waiters that handle is
74778+ * closed. This can happen, for example, when we are
74779+ * under VFS directory lock and don't want to commit
74780+ * atom right now to avoid stalling other threads
74781+ * working in the same directory.
74782+ */
74783+
74784+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
74785+ * commit this atom: no atom waiters and only one
74786+ * (our) open transaction handle. */
74787+ cd->wake_ktxnmgrd_up =
74788+ cd->atom->txnh_count == 1 &&
74789+ cd->atom->nr_waiters == 0;
74790+ reiser4_atom_send_event(cd->atom);
74791+ result = 0;
74792+ } else if (!atom_can_be_committed(cd->atom)) {
74793+ if (should_wait_commit(cd->txnh)) {
74794+ /* sync(): wait for commit */
74795+ cd->atom->nr_waiters++;
74796+ cd->wait = 1;
74797+ reiser4_atom_wait_event(cd->atom);
74798+ result = RETERR(-E_REPEAT);
74799+ } else {
74800+ result = 0;
74801+ }
74802+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74803+ /*
74804+ * optimization: flush atom without switching it into
74805+ * ASTAGE_CAPTURE_WAIT.
74806+ *
74807+ * But don't do this for ktxnmgrd, because ktxnmgrd
74808+ * should never block on atom fusion.
74809+ */
74810+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74811+ LONG_MAX, &cd->nr_written,
74812+ &cd->atom, NULL);
74813+ if (result == 0) {
74814+ spin_unlock_atom(cd->atom);
74815+ cd->preflush = 0;
74816+ result = RETERR(-E_REPEAT);
74817+ } else /* Atoms wasn't flushed
74818+ * completely. Rinse. Repeat. */
74819+ --cd->preflush;
74820+ } else {
74821+ /* We change atom state to ASTAGE_CAPTURE_WAIT to
74822+ prevent atom fusion and count ourself as an active
74823+ flusher */
74824+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74825+ cd->atom->flags |= ATOM_FORCE_COMMIT;
74826+
74827+ result =
74828+ commit_current_atom(&cd->nr_written, &cd->atom);
74829+ if (result != 0 && result != -E_REPEAT)
74830+ cd->failed = 1;
74831+ }
74832+ } else
74833+ result = 0;
74834+
74835+#if REISER4_DEBUG
74836+ if (result == 0)
74837+ assert_spin_locked(&(cd->atom->alock));
74838+#endif
74839+
74840+ /* perfectly valid assertion, except that when atom/txnh is not locked
74841+ * fusion can take place, and cd->atom points nowhere. */
74842+ /*
74843+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74844+ */
74845+ return result;
74846+}
74847+
74848+/* Called to commit a transaction handle. This decrements the atom's number of open
74849+ handles and if it is the last handle to commit and the atom should commit, initiates
74850+ atom commit. if commit does not fail, return number of written blocks */
74851+static int commit_txnh(txn_handle * txnh)
74852+{
74853+ commit_data cd;
74854+ assert("umka-192", txnh != NULL);
74855+
74856+ memset(&cd, 0, sizeof cd);
74857+ cd.txnh = txnh;
74858+ cd.preflush = 10;
74859+
74860+ /* calls try_commit_txnh() until either atom commits, or error
74861+ * happens */
74862+ while (try_commit_txnh(&cd) != 0)
74863+ reiser4_preempt_point();
74864+
74865+ spin_lock_txnh(txnh);
74866+
74867+ cd.atom->txnh_count -= 1;
74868+ txnh->atom = NULL;
74869+ /* remove transaction handle from atom's list of transaction handles */
74870+ list_del_init(&txnh->txnh_link);
74871+
74872+ spin_unlock_txnh(txnh);
74873+ atom_dec_and_unlock(cd.atom);
74874+ /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
74875+ * because it takes time) by current thread, we do that work
74876+ * asynchronously by ktxnmgrd daemon. */
74877+ if (cd.wake_ktxnmgrd_up)
74878+ ktxnmgrd_kick(&get_current_super_private()->tmgr);
74879+
74880+ return 0;
74881+}
74882+
74883+/* TRY_CAPTURE */
74884+
74885+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74886+ condition indicates that the request should be retried, and it may block if the
74887+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74888+
74889+ This routine encodes the basic logic of block capturing described by:
74890+
74891+ http://namesys.com/v4/v4.html
74892+
74893+ Our goal here is to ensure that any two blocks that contain dependent modifications
74894+ should commit at the same time. This function enforces this discipline by initiating
74895+ fusion whenever a transaction handle belonging to one atom requests to read or write a
74896+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74897+
74898+ In addition, this routine handles the initial assignment of atoms to blocks and
74899+ transaction handles. These are possible outcomes of this function:
74900+
74901+ 1. The block and handle are already part of the same atom: return immediate success
74902+
74903+ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74904+ the handle to the block's atom.
74905+
74906+ 3. The handle is assigned but the block is not: call capture_assign_block to assign
74907+ the block to the handle's atom.
74908+
74909+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74910+ to fuse atoms.
74911+
74912+ 5. Neither block nor handle are assigned: create a new atom and assign them both.
74913+
74914+ 6. A read request for a non-captured block: return immediate success.
74915+
74916+ This function acquires and releases the handle's spinlock. This function is called
74917+ under the jnode lock and if the return value is 0, it returns with the jnode lock still
74918+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74919+ released. The external interface (reiser4_try_capture) manages re-aquiring the jnode
74920+ lock in the failure case.
74921+*/
74922+static int try_capture_block(
74923+ txn_handle * txnh, jnode * node, txn_capture mode,
74924+ txn_atom ** atom_alloc)
74925+{
74926+ txn_atom *block_atom;
74927+ txn_atom *txnh_atom;
74928+
74929+ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
74930+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74931+
74932+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74933+ * node->tree somewhere. */
74934+ assert("umka-194", txnh != NULL);
74935+ assert("umka-195", node != NULL);
74936+
74937+ /* The jnode is already locked! Being called from reiser4_try_capture(). */
74938+ assert_spin_locked(&(node->guard));
74939+ block_atom = node->atom;
74940+
74941+ /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
74942+ let us touch the atoms themselves. */
74943+ spin_lock_txnh(txnh);
74944+ txnh_atom = txnh->atom;
74945+ /* Process of capturing continues into one of four branches depends on
74946+ which atoms from (block atom (node->atom), current atom (txnh->atom))
74947+ exist. */
74948+ if (txnh_atom == NULL) {
74949+ if (block_atom == NULL) {
74950+ spin_unlock_txnh(txnh);
74951+ spin_unlock_jnode(node);
74952+ /* assign empty atom to the txnh and repeat */
74953+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74954+ } else {
74955+ atomic_inc(&block_atom->refcount);
74956+ /* node spin-lock isn't needed anymore */
74957+ spin_unlock_jnode(node);
74958+ if (!spin_trylock_atom(block_atom)) {
74959+ spin_unlock_txnh(txnh);
74960+ spin_lock_atom(block_atom);
74961+ spin_lock_txnh(txnh);
74962+ }
74963+ /* re-check state after getting txnh and the node
74964+ * atom spin-locked */
74965+ if (node->atom != block_atom || txnh->atom != NULL) {
74966+ spin_unlock_txnh(txnh);
74967+ atom_dec_and_unlock(block_atom);
74968+ return RETERR(-E_REPEAT);
74969+ }
74970+ atomic_dec(&block_atom->refcount);
74971+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74972+ (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74973+ block_atom->txnh_count != 0))
74974+ return capture_fuse_wait(txnh, block_atom, NULL, mode);
74975+ capture_assign_txnh_nolock(block_atom, txnh);
74976+ spin_unlock_txnh(txnh);
74977+ spin_unlock_atom(block_atom);
74978+ return RETERR(-E_REPEAT);
74979+ }
74980+ } else {
74981+ /* It is time to perform deadlock prevention check over the
74982+ node we want to capture. It is possible this node was locked
74983+ for read without capturing it. The optimization which allows
74984+ to do it helps us in keeping atoms independent as long as
74985+ possible but it may cause lock/fuse deadlock problems.
74986+
74987+ A number of similar deadlock situations with locked but not
74988+ captured nodes were found. In each situation there are two
74989+ or more threads: one of them does flushing while another one
74990+ does routine balancing or tree lookup. The flushing thread
74991+ (F) sleeps in long term locking request for node (N), another
74992+ thread (A) sleeps in trying to capture some node already
74993+ belonging the atom F, F has a state which prevents
74994+ immediately fusion .
74995+
74996+ Deadlocks of this kind cannot happen if node N was properly
74997+ captured by thread A. The F thread fuse atoms before locking
74998+ therefore current atom of thread F and current atom of thread
74999+ A became the same atom and thread A may proceed. This does
75000+ not work if node N was not captured because the fusion of
75001+ atom does not happens.
75002+
75003+ The following scheme solves the deadlock: If
75004+ longterm_lock_znode locks and does not capture a znode, that
75005+ znode is marked as MISSED_IN_CAPTURE. A node marked this way
75006+ is processed by the code below which restores the missed
75007+ capture and fuses current atoms of all the node lock owners
75008+ by calling the fuse_not_fused_lock_owners() function. */
75009+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
75010+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
75011+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
75012+ spin_unlock_txnh(txnh);
75013+ spin_unlock_jnode(node);
75014+ fuse_not_fused_lock_owners(txnh, JZNODE(node));
75015+ return RETERR(-E_REPEAT);
75016+ }
75017+ }
75018+ if (block_atom == NULL) {
75019+ atomic_inc(&txnh_atom->refcount);
75020+ spin_unlock_txnh(txnh);
75021+ if (!spin_trylock_atom(txnh_atom)) {
75022+ spin_unlock_jnode(node);
75023+ spin_lock_atom(txnh_atom);
75024+ spin_lock_jnode(node);
75025+ }
75026+ if (txnh->atom != txnh_atom || node->atom != NULL
75027+ || JF_ISSET(node, JNODE_IS_DYING)) {
75028+ spin_unlock_jnode(node);
75029+ atom_dec_and_unlock(txnh_atom);
75030+ return RETERR(-E_REPEAT);
75031+ }
75032+ atomic_dec(&txnh_atom->refcount);
75033+ capture_assign_block_nolock(txnh_atom, node);
75034+ spin_unlock_atom(txnh_atom);
75035+ } else {
75036+ if (txnh_atom != block_atom) {
75037+ if (mode & TXN_CAPTURE_DONT_FUSE) {
75038+ spin_unlock_txnh(txnh);
75039+ spin_unlock_jnode(node);
75040+ /* we are in a "no-fusion" mode and @node is
75041+ * already part of transaction. */
75042+ return RETERR(-E_NO_NEIGHBOR);
75043+ }
75044+ return capture_init_fusion(node, txnh, mode);
75045+ }
75046+ spin_unlock_txnh(txnh);
75047+ }
75048+ }
75049+ return 0;
75050+}
75051+
75052+static txn_capture
75053+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
75054+{
75055+ txn_capture cap_mode;
75056+
75057+ assert_spin_locked(&(node->guard));
75058+
75059+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
75060+
75061+ if (lock_mode == ZNODE_WRITE_LOCK) {
75062+ cap_mode = TXN_CAPTURE_WRITE;
75063+ } else if (node->atom != NULL) {
75064+ cap_mode = TXN_CAPTURE_WRITE;
75065+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
75066+ jnode_get_level(node) == LEAF_LEVEL) {
75067+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
75068+ /* We only need a READ_FUSING capture at the leaf level. This
75069+ is because the internal levels of the tree (twigs included)
75070+ are redundant from the point of the user that asked for a
75071+ read-fusing transcrash. The user only wants to read-fuse
75072+ atoms due to reading uncommitted data that another user has
75073+ written. It is the file system that reads/writes the
75074+ internal tree levels, the user only reads/writes leaves. */
75075+ cap_mode = TXN_CAPTURE_READ_ATOMIC;
75076+ } else {
75077+ /* In this case (read lock at a non-leaf) there's no reason to
75078+ * capture. */
75079+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
75080+ return 0;
75081+ }
75082+
75083+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
75084+ assert("nikita-3186", cap_mode != 0);
75085+ return cap_mode;
75086+}
75087+
75088+/* This is an external interface to try_capture_block(), it calls
75089+ try_capture_block() repeatedly as long as -E_REPEAT is returned.
75090+
75091+ @node: node to capture,
75092+ @lock_mode: read or write lock is used in capture mode calculation,
75093+ @flags: see txn_capture flags enumeration,
75094+ @can_coc : can copy-on-capture
75095+
75096+ @return: 0 - node was successfully captured, -E_REPEAT - capture request
75097+ cannot be processed immediately as it was requested in flags,
75098+ < 0 - other errors.
75099+*/
75100+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
75101+ txn_capture flags)
75102+{
75103+ txn_atom *atom_alloc = NULL;
75104+ txn_capture cap_mode;
75105+ txn_handle *txnh = get_current_context()->trans;
75106+ int ret;
75107+
75108+ assert_spin_locked(&(node->guard));
75109+
75110+ repeat:
75111+ if (JF_ISSET(node, JNODE_IS_DYING))
75112+ return RETERR(-EINVAL);
75113+ if (node->atom != NULL && txnh->atom == node->atom)
75114+ return 0;
75115+ cap_mode = build_capture_mode(node, lock_mode, flags);
75116+ if (cap_mode == 0 ||
75117+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
75118+ /* Mark this node as "MISSED". It helps in further deadlock
75119+ * analysis */
75120+ if (jnode_is_znode(node))
75121+ JF_SET(node, JNODE_MISSED_IN_CAPTURE);
75122+ return 0;
75123+ }
75124+ /* Repeat try_capture as long as -E_REPEAT is returned. */
75125+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
75126+ /* Regardless of non_blocking:
75127+
75128+ If ret == 0 then jnode is still locked.
75129+ If ret != 0 then jnode is unlocked.
75130+ */
75131+#if REISER4_DEBUG
75132+ if (ret == 0)
75133+ assert_spin_locked(&(node->guard));
75134+ else
75135+ assert_spin_not_locked(&(node->guard));
75136+#endif
75137+ assert_spin_not_locked(&(txnh->guard));
75138+
75139+ if (ret == -E_REPEAT) {
75140+ /* E_REPEAT implies all locks were released, therefore we need
75141+ to take the jnode's lock again. */
75142+ spin_lock_jnode(node);
75143+
75144+ /* Although this may appear to be a busy loop, it is not.
75145+ There are several conditions that cause E_REPEAT to be
75146+ returned by the call to try_capture_block, all cases
75147+ indicating some kind of state change that means you should
75148+ retry the request and will get a different result. In some
75149+ cases this could be avoided with some extra code, but
75150+ generally it is done because the necessary locks were
75151+ released as a result of the operation and repeating is the
75152+ simplest thing to do (less bug potential). The cases are:
75153+ atom fusion returns E_REPEAT after it completes (jnode and
75154+ txnh were unlocked); race conditions in assign_block,
75155+ assign_txnh, and init_fusion return E_REPEAT (trylock
75156+ failure); after going to sleep in capture_fuse_wait
75157+ (request was blocked but may now succeed). I'm not quite
75158+ sure how capture_copy works yet, but it may also return
75159+ E_REPEAT. When the request is legitimately blocked, the
75160+ requestor goes to sleep in fuse_wait, so this is not a busy
75161+ loop. */
75162+ /* NOTE-NIKITA: still don't understand:
75163+
75164+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
75165+
75166+ looks like busy loop?
75167+ */
75168+ goto repeat;
75169+ }
75170+
75171+ /* free extra atom object that was possibly allocated by
75172+ try_capture_block().
75173+
75174+ Do this before acquiring jnode spin lock to
75175+ minimize time spent under lock. --nikita */
75176+ if (atom_alloc != NULL) {
75177+ kmem_cache_free(_atom_slab, atom_alloc);
75178+ }
75179+
75180+ if (ret != 0) {
75181+ if (ret == -E_BLOCK) {
75182+ assert("nikita-3360",
75183+ cap_mode & TXN_CAPTURE_NONBLOCKING);
75184+ ret = -E_REPEAT;
75185+ }
75186+
75187+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May
75188+ want to fix the above code to avoid releasing the lock and
75189+ re-acquiring it, but there are cases were failure occurs
75190+ when the lock is not held, and those cases would need to be
75191+ modified to re-take the lock. */
75192+ spin_lock_jnode(node);
75193+ }
75194+
75195+ /* Jnode is still locked. */
75196+ assert_spin_locked(&(node->guard));
75197+ return ret;
75198+}
75199+
75200+static void release_two_atoms(txn_atom *one, txn_atom *two)
75201+{
75202+ spin_unlock_atom(one);
75203+ atom_dec_and_unlock(two);
75204+ spin_lock_atom(one);
75205+ atom_dec_and_unlock(one);
75206+}
75207+
75208+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
75209+ returned by that routine. The txn_capture request mode is computed here depending on
75210+ the transaction handle's type and the lock request. This is called from the depths of
75211+ the lock manager with the jnode lock held and it always returns with the jnode lock
75212+ held.
75213+*/
75214+
75215+/* fuse all 'active' atoms of lock owners of given node. */
75216+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
75217+{
75218+ lock_handle *lh;
75219+ int repeat;
75220+ txn_atom *atomh, *atomf;
75221+ reiser4_context *me = get_current_context();
75222+ reiser4_context *ctx = NULL;
75223+
75224+ assert_spin_not_locked(&(ZJNODE(node)->guard));
75225+ assert_spin_not_locked(&(txnh->hlock));
75226+
75227+ repeat:
75228+ repeat = 0;
75229+ atomh = txnh_get_atom(txnh);
75230+ spin_unlock_txnh(txnh);
75231+ assert("zam-692", atomh != NULL);
75232+
75233+ spin_lock_zlock(&node->lock);
75234+ /* inspect list of lock owners */
75235+ list_for_each_entry(lh, &node->lock.owners, owners_link) {
75236+ ctx = get_context_by_lock_stack(lh->owner);
75237+ if (ctx == me)
75238+ continue;
75239+ /* below we use two assumptions to avoid addition spin-locks
75240+ for checking the condition :
75241+
75242+ 1) if the lock stack has lock, the transaction should be
75243+ opened, i.e. ctx->trans != NULL;
75244+
75245+ 2) reading of well-aligned ctx->trans->atom is atomic, if it
75246+ equals to the address of spin-locked atomh, we take that
75247+ the atoms are the same, nothing has to be captured. */
75248+ if (atomh != ctx->trans->atom) {
75249+ reiser4_wake_up(lh->owner);
75250+ repeat = 1;
75251+ break;
75252+ }
75253+ }
75254+ if (repeat) {
75255+ if (!spin_trylock_txnh(ctx->trans)) {
75256+ spin_unlock_zlock(&node->lock);
75257+ spin_unlock_atom(atomh);
75258+ goto repeat;
75259+ }
75260+ atomf = ctx->trans->atom;
75261+ if (atomf == NULL) {
75262+ capture_assign_txnh_nolock(atomh, ctx->trans);
75263+ /* release zlock lock _after_ assigning the atom to the
75264+ * transaction handle, otherwise the lock owner thread
75265+ * may unlock all znodes, exit kernel context and here
75266+ * we would access an invalid transaction handle. */
75267+ spin_unlock_zlock(&node->lock);
75268+ spin_unlock_atom(atomh);
75269+ spin_unlock_txnh(ctx->trans);
75270+ goto repeat;
75271+ }
75272+ assert("zam-1059", atomf != atomh);
75273+ spin_unlock_zlock(&node->lock);
75274+ atomic_inc(&atomh->refcount);
75275+ atomic_inc(&atomf->refcount);
75276+ spin_unlock_txnh(ctx->trans);
75277+ if (atomf > atomh) {
75278+ spin_lock_atom_nested(atomf);
75279+ } else {
75280+ spin_unlock_atom(atomh);
75281+ spin_lock_atom(atomf);
75282+ spin_lock_atom_nested(atomh);
75283+ }
75284+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
75285+ release_two_atoms(atomf, atomh);
75286+ goto repeat;
75287+ }
75288+ atomic_dec(&atomh->refcount);
75289+ atomic_dec(&atomf->refcount);
75290+ capture_fuse_into(atomf, atomh);
75291+ goto repeat;
75292+ }
75293+ spin_unlock_zlock(&node->lock);
75294+ spin_unlock_atom(atomh);
75295+}
75296+
75297+/* This is the interface to capture unformatted nodes via their struct page
75298+ reference. Currently it is only used in reiser4_invalidatepage */
75299+int try_capture_page_to_invalidate(struct page *pg)
75300+{
75301+ int ret;
75302+ jnode *node;
75303+
75304+ assert("umka-292", pg != NULL);
75305+ assert("nikita-2597", PageLocked(pg));
75306+
75307+ if (IS_ERR(node = jnode_of_page(pg))) {
75308+ return PTR_ERR(node);
75309+ }
75310+
75311+ spin_lock_jnode(node);
75312+ unlock_page(pg);
75313+
75314+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
75315+ spin_unlock_jnode(node);
75316+ jput(node);
75317+ lock_page(pg);
75318+ return ret;
75319+}
75320+
75321+/* This informs the transaction manager when a node is deleted. Add the block to the
75322+ atom's delete set and uncapture the block.
75323+
75324+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
75325+explanations. find all the functions that use it, and unless there is some very
75326+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
75327+move the loop to inside the function.
75328+
75329+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
75330+ */
75331+void reiser4_uncapture_page(struct page *pg)
75332+{
75333+ jnode *node;
75334+ txn_atom *atom;
75335+
75336+ assert("umka-199", pg != NULL);
75337+ assert("nikita-3155", PageLocked(pg));
75338+
75339+ clear_page_dirty_for_io(pg);
75340+
75341+ reiser4_wait_page_writeback(pg);
75342+
75343+ node = jprivate(pg);
75344+ BUG_ON(node == NULL);
75345+
75346+ spin_lock_jnode(node);
75347+
75348+ atom = jnode_get_atom(node);
75349+ if (atom == NULL) {
75350+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
75351+ spin_unlock_jnode(node);
75352+ return;
75353+ }
75354+
75355+ /* We can remove jnode from transaction even if it is on flush queue
75356+ * prepped list, we only need to be sure that flush queue is not being
75357+ * written by reiser4_write_fq(). reiser4_write_fq() does not use atom
75358+ * spin lock for protection of the prepped nodes list, instead
75359+ * write_fq() increments atom's nr_running_queues counters for the time
75360+ * when prepped list is not protected by spin lock. Here we check this
75361+ * counter if we want to remove jnode from flush queue and, if the
75362+ * counter is not zero, wait all reiser4_write_fq() for this atom to
75363+ * complete. This is not significant overhead. */
75364+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
75365+ spin_unlock_jnode(node);
75366+ /*
75367+ * at this moment we want to wait for "atom event", viz. wait
75368+ * until @node can be removed from flush queue. But
75369+ * reiser4_atom_wait_event() cannot be called with page locked,
75370+ * because it deadlocks with jnode_extent_write(). Unlock page,
75371+ * after making sure (through page_cache_get()) that it cannot
75372+ * be released from memory.
75373+ */
75374+ page_cache_get(pg);
75375+ unlock_page(pg);
75376+ reiser4_atom_wait_event(atom);
75377+ lock_page(pg);
75378+ /*
75379+ * page may has been detached by ->writepage()->releasepage().
75380+ */
75381+ reiser4_wait_page_writeback(pg);
75382+ spin_lock_jnode(node);
75383+ page_cache_release(pg);
75384+ atom = jnode_get_atom(node);
75385+/* VS-FIXME-HANS: improve the commenting in this function */
75386+ if (atom == NULL) {
75387+ spin_unlock_jnode(node);
75388+ return;
75389+ }
75390+ }
75391+ reiser4_uncapture_block(node);
75392+ spin_unlock_atom(atom);
75393+ jput(node);
75394+}
75395+
75396+/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
75397+ * inode's tree of jnodes */
75398+void reiser4_uncapture_jnode(jnode * node)
75399+{
75400+ txn_atom *atom;
75401+
75402+ assert_spin_locked(&(node->guard));
75403+ assert("", node->pg == 0);
75404+
75405+ atom = jnode_get_atom(node);
75406+ if (atom == NULL) {
75407+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
75408+ spin_unlock_jnode(node);
75409+ return;
75410+ }
75411+
75412+ reiser4_uncapture_block(node);
75413+ spin_unlock_atom(atom);
75414+ jput(node);
75415+}
75416+
75417+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
75418+ increases atom refcount and txnh_count, adds to txnh_list. */
75419+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
75420+{
75421+ assert("umka-200", atom != NULL);
75422+ assert("umka-201", txnh != NULL);
75423+
75424+ assert_spin_locked(&(txnh->hlock));
75425+ assert_spin_locked(&(atom->alock));
75426+ assert("jmacd-824", txnh->atom == NULL);
75427+ assert("nikita-3540", atom_isopen(atom));
75428+ BUG_ON(txnh->atom != NULL);
75429+
75430+ atomic_inc(&atom->refcount);
75431+ txnh->atom = atom;
75432+ reiser4_ctx_gfp_mask_set();
75433+ list_add_tail(&txnh->txnh_link, &atom->txnh_list);
75434+ atom->txnh_count += 1;
75435+}
75436+
75437+/* No-locking version of assign_block. Sets the block's atom pointer, references the
75438+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
75439+static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
75440+{
75441+ assert("umka-202", atom != NULL);
75442+ assert("umka-203", node != NULL);
75443+ assert_spin_locked(&(node->guard));
75444+ assert_spin_locked(&(atom->alock));
75445+ assert("jmacd-323", node->atom == NULL);
75446+ BUG_ON(!list_empty_careful(&node->capture_link));
75447+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
75448+
75449+ /* Pointer from jnode to atom is not counted in atom->refcount. */
75450+ node->atom = atom;
75451+
75452+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
75453+ atom->capture_count += 1;
75454+ /* reference to jnode is acquired by atom. */
75455+ jref(node);
75456+
75457+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
75458+
75459+ LOCK_CNT_INC(t_refs);
75460+}
75461+
75462+/* common code for dirtying both unformatted jnodes and formatted znodes. */
75463+static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
75464+{
75465+ assert_spin_locked(&(node->guard));
75466+ assert_spin_locked(&(atom->alock));
75467+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
75468+
75469+ JF_SET(node, JNODE_DIRTY);
75470+
75471+ get_current_context()->nr_marked_dirty++;
75472+
75473+ /* We grab2flush_reserve one additional block only if node was
75474+ not CREATED and jnode_flush did not sort it into neither
75475+ relocate set nor overwrite one. If node is in overwrite or
75476+ relocate set we assume that atom's flush reserved counter was
75477+ already adjusted. */
75478+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
75479+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
75480+ && !jnode_is_cluster_page(node)) {
75481+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
75482+ assert("vs-1506", *jnode_get_block(node) != 0);
75483+ grabbed2flush_reserved_nolock(atom, (__u64) 1);
75484+ JF_SET(node, JNODE_FLUSH_RESERVED);
75485+ }
75486+
75487+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75488+ /* If the atom is not set yet, it will be added to the appropriate list in
75489+ capture_assign_block_nolock. */
75490+ /* Sometimes a node is set dirty before being captured -- the case for new
75491+ jnodes. In that case the jnode will be added to the appropriate list
75492+ in capture_assign_block_nolock. Another reason not to re-link jnode is
75493+ that jnode is on a flush queue (see flush.c for details) */
75494+
75495+ int level = jnode_get_level(node);
75496+
75497+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
75498+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
75499+ assert("nikita-2607", 0 <= level);
75500+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
75501+
75502+ /* move node to atom's dirty list */
75503+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
75504+ ON_DEBUG(count_jnode
75505+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
75506+ }
75507+}
75508+
75509+/* Set the dirty status for this (spin locked) jnode. */
75510+void jnode_make_dirty_locked(jnode * node)
75511+{
75512+ assert("umka-204", node != NULL);
75513+ assert_spin_locked(&(node->guard));
75514+
75515+ if (REISER4_DEBUG && rofs_jnode(node)) {
75516+ warning("nikita-3365", "Dirtying jnode on rofs");
75517+ dump_stack();
75518+ }
75519+
75520+ /* Fast check for already dirty node */
75521+ if (!JF_ISSET(node, JNODE_DIRTY)) {
75522+ txn_atom *atom;
75523+
75524+ atom = jnode_get_atom(node);
75525+ assert("vs-1094", atom);
75526+ /* Check jnode dirty status again because node spin lock might
75527+ * be released inside jnode_get_atom(). */
75528+ if (likely(!JF_ISSET(node, JNODE_DIRTY)))
75529+ do_jnode_make_dirty(node, atom);
75530+ spin_unlock_atom(atom);
75531+ }
75532+}
75533+
75534+/* Set the dirty status for this znode. */
75535+void znode_make_dirty(znode * z)
75536+{
75537+ jnode *node;
75538+ struct page *page;
75539+
75540+ assert("umka-204", z != NULL);
75541+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
75542+ assert("nikita-3560", znode_is_write_locked(z));
75543+
75544+ node = ZJNODE(z);
75545+ /* znode is longterm locked, we can check dirty bit without spinlock */
75546+ if (JF_ISSET(node, JNODE_DIRTY)) {
75547+ /* znode is dirty already. All we have to do is to change znode version */
75548+ z->version = znode_build_version(jnode_get_tree(node));
75549+ return;
75550+ }
75551+
75552+ spin_lock_jnode(node);
75553+ jnode_make_dirty_locked(node);
75554+ page = jnode_page(node);
75555+ if (page != NULL) {
75556+ /* this is useful assertion (allows one to check that no
75557+ * modifications are lost due to update of in-flight page),
75558+ * but it requires locking on page to check PG_writeback
75559+ * bit. */
75560+ /* assert("nikita-3292",
75561+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
75562+ page_cache_get(page);
75563+
75564+ /* jnode lock is not needed for the rest of
75565+ * znode_set_dirty(). */
75566+ spin_unlock_jnode(node);
75567+ /* reiser4 file write code calls set_page_dirty for
75568+ * unformatted nodes, for formatted nodes we do it here. */
75569+ reiser4_set_page_dirty_internal(page);
75570+ page_cache_release(page);
75571+ /* bump version counter in znode */
75572+ z->version = znode_build_version(jnode_get_tree(node));
75573+ } else {
75574+ assert("zam-596", znode_above_root(JZNODE(node)));
75575+ spin_unlock_jnode(node);
75576+ }
75577+
75578+ assert("nikita-1900", znode_is_write_locked(z));
75579+ assert("jmacd-9777", node->atom != NULL);
75580+}
75581+
75582+int reiser4_sync_atom(txn_atom * atom)
75583+{
75584+ int result;
75585+ txn_handle *txnh;
75586+
75587+ txnh = get_current_context()->trans;
75588+
75589+ result = 0;
75590+ if (atom != NULL) {
75591+ if (atom->stage < ASTAGE_PRE_COMMIT) {
75592+ spin_lock_txnh(txnh);
75593+ capture_assign_txnh_nolock(atom, txnh);
75594+ result = force_commit_atom(txnh);
75595+ } else if (atom->stage < ASTAGE_POST_COMMIT) {
75596+ /* wait atom commit */
75597+ reiser4_atom_wait_event(atom);
75598+ /* try once more */
75599+ result = RETERR(-E_REPEAT);
75600+ } else
75601+ spin_unlock_atom(atom);
75602+ }
75603+ return result;
75604+}
75605+
75606+#if REISER4_DEBUG
75607+
75608+/* move jnode form one list to another
75609+ call this after atom->capture_count is updated */
75610+void
75611+count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
75612+ atom_list new_list, int check_lists)
75613+{
75614+ struct list_head *pos;
75615+
75616+ assert("zam-1018", atom_is_protected(atom));
75617+ assert_spin_locked(&(node->guard));
75618+ assert("", NODE_LIST(node) == old_list);
75619+
75620+ switch (NODE_LIST(node)) {
75621+ case NOT_CAPTURED:
75622+ break;
75623+ case DIRTY_LIST:
75624+ assert("", atom->dirty > 0);
75625+ atom->dirty--;
75626+ break;
75627+ case CLEAN_LIST:
75628+ assert("", atom->clean > 0);
75629+ atom->clean--;
75630+ break;
75631+ case FQ_LIST:
75632+ assert("", atom->fq > 0);
75633+ atom->fq--;
75634+ break;
75635+ case WB_LIST:
75636+ assert("", atom->wb > 0);
75637+ atom->wb--;
75638+ break;
75639+ case OVRWR_LIST:
75640+ assert("", atom->ovrwr > 0);
75641+ atom->ovrwr--;
75642+ break;
75643+ default:
75644+ impossible("", "");
75645+ }
75646+
75647+ switch (new_list) {
75648+ case NOT_CAPTURED:
75649+ break;
75650+ case DIRTY_LIST:
75651+ atom->dirty++;
75652+ break;
75653+ case CLEAN_LIST:
75654+ atom->clean++;
75655+ break;
75656+ case FQ_LIST:
75657+ atom->fq++;
75658+ break;
75659+ case WB_LIST:
75660+ atom->wb++;
75661+ break;
75662+ case OVRWR_LIST:
75663+ atom->ovrwr++;
75664+ break;
75665+ default:
75666+ impossible("", "");
75667+ }
75668+ ASSIGN_NODE_LIST(node, new_list);
75669+ if (0 && check_lists) {
75670+ int count;
75671+ tree_level level;
75672+
75673+ count = 0;
75674+
75675+ /* flush queue list */
75676+ /* reiser4_check_fq(atom); */
75677+
75678+ /* dirty list */
75679+ count = 0;
75680+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75681+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
75682+ count++;
75683+ }
75684+ if (count != atom->dirty)
75685+ warning("", "dirty counter %d, real %d\n", atom->dirty,
75686+ count);
75687+
75688+ /* clean list */
75689+ count = 0;
75690+ list_for_each(pos, ATOM_CLEAN_LIST(atom))
75691+ count++;
75692+ if (count != atom->clean)
75693+ warning("", "clean counter %d, real %d\n", atom->clean,
75694+ count);
75695+
75696+ /* wb list */
75697+ count = 0;
75698+ list_for_each(pos, ATOM_WB_LIST(atom))
75699+ count++;
75700+ if (count != atom->wb)
75701+ warning("", "wb counter %d, real %d\n", atom->wb,
75702+ count);
75703+
75704+ /* overwrite list */
75705+ count = 0;
75706+ list_for_each(pos, ATOM_OVRWR_LIST(atom))
75707+ count++;
75708+
75709+ if (count != atom->ovrwr)
75710+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
75711+ count);
75712+ }
75713+ assert("vs-1624", atom->num_queued == atom->fq);
75714+ if (atom->capture_count !=
75715+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
75716+ printk
75717+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
75718+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
75719+ atom->wb, atom->fq);
75720+ assert("vs-1622",
75721+ atom->capture_count ==
75722+ atom->dirty + atom->clean + atom->ovrwr + atom->wb +
75723+ atom->fq);
75724+ }
75725+}
75726+
75727+#endif
75728+
75729+/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode
75730+ * lock should be taken before calling this function. */
75731+void jnode_make_wander_nolock(jnode * node)
75732+{
75733+ txn_atom *atom;
75734+
75735+ assert("nikita-2431", node != NULL);
75736+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
75737+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
75738+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75739+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75740+
75741+ atom = node->atom;
75742+
75743+ assert("zam-895", atom != NULL);
75744+ assert("zam-894", atom_is_protected(atom));
75745+
75746+ JF_SET(node, JNODE_OVRWR);
75747+ /* move node to atom's overwrite list */
75748+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
75749+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
75750+}
75751+
75752+/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
75753+ * this function. */
75754+void jnode_make_wander(jnode * node)
75755+{
75756+ txn_atom *atom;
75757+
75758+ spin_lock_jnode(node);
75759+ atom = jnode_get_atom(node);
75760+ assert("zam-913", atom != NULL);
75761+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
75762+
75763+ jnode_make_wander_nolock(node);
75764+ spin_unlock_atom(atom);
75765+ spin_unlock_jnode(node);
75766+}
75767+
75768+/* this just sets RELOC bit */
75769+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
75770+{
75771+ assert_spin_locked(&(node->guard));
75772+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
75773+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
75774+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
75775+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75776+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75777+ jnode_set_reloc(node);
75778+}
75779+
75780+/* Make znode RELOC and put it on flush queue */
75781+void znode_make_reloc(znode * z, flush_queue_t * fq)
75782+{
75783+ jnode *node;
75784+ txn_atom *atom;
75785+
75786+ node = ZJNODE(z);
75787+ spin_lock_jnode(node);
75788+
75789+ atom = jnode_get_atom(node);
75790+ assert("zam-919", atom != NULL);
75791+
75792+ jnode_make_reloc_nolock(fq, node);
75793+ queue_jnode(fq, node);
75794+
75795+ spin_unlock_atom(atom);
75796+ spin_unlock_jnode(node);
75797+
75798+}
75799+
75800+/* Make unformatted node RELOC and put it on flush queue */
75801+void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75802+{
75803+ assert("vs-1479", jnode_is_unformatted(node));
75804+
75805+ jnode_make_reloc_nolock(fq, node);
75806+ queue_jnode(fq, node);
75807+}
75808+
75809+int reiser4_capture_super_block(struct super_block *s)
75810+{
75811+ int result;
75812+ znode *uber;
75813+ lock_handle lh;
75814+
75815+ init_lh(&lh);
75816+ result = get_uber_znode(reiser4_get_tree(s),
75817+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75818+ if (result)
75819+ return result;
75820+
75821+ uber = lh.node;
75822+ /* Grabbing one block for superblock */
75823+ result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75824+ if (result != 0)
75825+ return result;
75826+
75827+ znode_make_dirty(uber);
75828+
75829+ done_lh(&lh);
75830+ return 0;
75831+}
75832+
75833+/* Wakeup every handle on the atom's WAITFOR list */
75834+static void wakeup_atom_waitfor_list(txn_atom * atom)
75835+{
75836+ txn_wait_links *wlinks;
75837+
75838+ assert("umka-210", atom != NULL);
75839+
75840+ /* atom is locked */
75841+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75842+ if (wlinks->waitfor_cb == NULL ||
75843+ wlinks->waitfor_cb(atom, wlinks))
75844+ /* Wake up. */
75845+ reiser4_wake_up(wlinks->_lock_stack);
75846+ }
75847+}
75848+
75849+/* Wakeup every handle on the atom's WAITING list */
75850+static void wakeup_atom_waiting_list(txn_atom * atom)
75851+{
75852+ txn_wait_links *wlinks;
75853+
75854+ assert("umka-211", atom != NULL);
75855+
75856+ /* atom is locked */
75857+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75858+ if (wlinks->waiting_cb == NULL ||
75859+ wlinks->waiting_cb(atom, wlinks))
75860+ /* Wake up. */
75861+ reiser4_wake_up(wlinks->_lock_stack);
75862+ }
75863+}
75864+
75865+/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75866+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75867+{
75868+ assert("nikita-3330", atom != NULL);
75869+ assert_spin_locked(&(atom->alock));
75870+
75871+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75872+ * last transaction handle. */
75873+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75874+}
75875+
75876+/* The general purpose of this function is to wait on the first of two possible events.
75877+ The situation is that a handle (and its atom atomh) is blocked trying to capture a
75878+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75879+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75880+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75881+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75882+ proceed and fuse the two atoms in the CAPTURE_WAIT state.
75883+
75884+ In other words, if either atomh or atomf change state, the handle will be awakened,
75885+ thus there are two lists per atom: WAITING and WAITFOR.
75886+
75887+ This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
75888+ close but it is not assigned to an atom of its own.
75889+
75890+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75891+ BOTH_ATOM_LOCKS. Result: all four locks are released.
75892+*/
75893+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75894+ txn_atom * atomh, txn_capture mode)
75895+{
75896+ int ret;
75897+ txn_wait_links wlinks;
75898+
75899+ assert("umka-213", txnh != NULL);
75900+ assert("umka-214", atomf != NULL);
75901+
75902+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75903+ spin_unlock_txnh(txnh);
75904+ spin_unlock_atom(atomf);
75905+
75906+ if (atomh) {
75907+ spin_unlock_atom(atomh);
75908+ }
75909+
75910+ return RETERR(-E_BLOCK);
75911+ }
75912+
75913+ /* Initialize the waiting list links. */
75914+ init_wlinks(&wlinks);
75915+
75916+ /* Add txnh to atomf's waitfor list, unlock atomf. */
75917+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75918+ wlinks.waitfor_cb = wait_for_fusion;
75919+ atomic_inc(&atomf->refcount);
75920+ spin_unlock_atom(atomf);
75921+
75922+ if (atomh) {
75923+ /* Add txnh to atomh's waiting list, unlock atomh. */
75924+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75925+ atomic_inc(&atomh->refcount);
75926+ spin_unlock_atom(atomh);
75927+ }
75928+
75929+ /* Go to sleep. */
75930+ spin_unlock_txnh(txnh);
75931+
75932+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
75933+ if (ret == 0) {
75934+ reiser4_go_to_sleep(wlinks._lock_stack);
75935+ ret = RETERR(-E_REPEAT);
75936+ }
75937+
75938+ /* Remove from the waitfor list. */
75939+ spin_lock_atom(atomf);
75940+
75941+ list_del(&wlinks._fwaitfor_link);
75942+ atom_dec_and_unlock(atomf);
75943+
75944+ if (atomh) {
75945+ /* Remove from the waiting list. */
75946+ spin_lock_atom(atomh);
75947+ list_del(&wlinks._fwaiting_link);
75948+ atom_dec_and_unlock(atomh);
75949+ }
75950+ return ret;
75951+}
75952+
75953+static void lock_two_atoms(txn_atom * one, txn_atom * two)
75954+{
75955+ assert("zam-1067", one != two);
75956+
75957+ /* lock the atom with lesser address first */
75958+ if (one < two) {
75959+ spin_lock_atom(one);
75960+ spin_lock_atom_nested(two);
75961+ } else {
75962+ spin_lock_atom(two);
75963+ spin_lock_atom_nested(one);
75964+ }
75965+}
75966+
75967+/* Perform the necessary work to prepare for fusing two atoms, which involves
75968+ * acquiring two atom locks in the proper order. If one of the node's atom is
75969+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75970+ * atom is not then the handle's request is put to sleep. If the node's atom
75971+ * is committing, then the node can be copy-on-captured. Otherwise, pick the
75972+ * atom with fewer pointers to be fused into the atom with more pointer and
75973+ * call capture_fuse_into.
75974+ */
75975+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75976+{
75977+ txn_atom * txnh_atom = txnh->atom;
75978+ txn_atom * block_atom = node->atom;
75979+
75980+ atomic_inc(&txnh_atom->refcount);
75981+ atomic_inc(&block_atom->refcount);
75982+
75983+ spin_unlock_txnh(txnh);
75984+ spin_unlock_jnode(node);
75985+
75986+ lock_two_atoms(txnh_atom, block_atom);
75987+
75988+ if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75989+ release_two_atoms(txnh_atom, block_atom);
75990+ return RETERR(-E_REPEAT);
75991+ }
75992+
75993+ atomic_dec(&txnh_atom->refcount);
75994+ atomic_dec(&block_atom->refcount);
75995+
75996+ assert ("zam-1066", atom_isopen(txnh_atom));
75997+
75998+ if (txnh_atom->stage >= block_atom->stage ||
75999+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
76000+ capture_fuse_into(txnh_atom, block_atom);
76001+ return RETERR(-E_REPEAT);
76002+ }
76003+ spin_lock_txnh(txnh);
76004+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
76005+}
76006+
76007+/* This function splices together two jnode lists (small and large) and sets all jnodes in
76008+ the small list to point to the large atom. Returns the length of the list. */
76009+static int
76010+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
76011+ struct list_head *small_head)
76012+{
76013+ int count = 0;
76014+ jnode *node;
76015+
76016+ assert("umka-218", large != NULL);
76017+ assert("umka-219", large_head != NULL);
76018+ assert("umka-220", small_head != NULL);
76019+ /* small atom should be locked also. */
76020+ assert_spin_locked(&(large->alock));
76021+
76022+ /* For every jnode on small's capture list... */
76023+ list_for_each_entry(node, small_head, capture_link) {
76024+ count += 1;
76025+
76026+ /* With the jnode lock held, update atom pointer. */
76027+ spin_lock_jnode(node);
76028+ node->atom = large;
76029+ spin_unlock_jnode(node);
76030+ }
76031+
76032+ /* Splice the lists. */
76033+ list_splice_init(small_head, large_head->prev);
76034+
76035+ return count;
76036+}
76037+
76038+/* This function splices together two txnh lists (small and large) and sets all txn handles in
76039+ the small list to point to the large atom. Returns the length of the list. */
76040+static int
76041+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
76042+ struct list_head *small_head)
76043+{
76044+ int count = 0;
76045+ txn_handle *txnh;
76046+
76047+ assert("umka-221", large != NULL);
76048+ assert("umka-222", large_head != NULL);
76049+ assert("umka-223", small_head != NULL);
76050+
76051+ /* Adjust every txnh to the new atom. */
76052+ list_for_each_entry(txnh, small_head, txnh_link) {
76053+ count += 1;
76054+
76055+ /* With the txnh lock held, update atom pointer. */
76056+ spin_lock_txnh(txnh);
76057+ txnh->atom = large;
76058+ spin_unlock_txnh(txnh);
76059+ }
76060+
76061+ /* Splice the txn_handle list. */
76062+ list_splice_init(small_head, large_head->prev);
76063+
76064+ return count;
76065+}
76066+
76067+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
76068+ added to LARGE and their ->atom pointers are all updated. The associated counts are
76069+ updated as well, and any waiting handles belonging to either are awakened. Finally the
76070+ smaller atom's refcount is decremented.
76071+*/
76072+static void capture_fuse_into(txn_atom * small, txn_atom * large)
76073+{
76074+ int level;
76075+ unsigned zcount = 0;
76076+ unsigned tcount = 0;
76077+
76078+ assert("umka-224", small != NULL);
76079+ assert("umka-225", small != NULL);
76080+
76081+ assert_spin_locked(&(large->alock));
76082+ assert_spin_locked(&(small->alock));
76083+
76084+ assert("jmacd-201", atom_isopen(small));
76085+ assert("jmacd-202", atom_isopen(large));
76086+
76087+ /* Splice and update the per-level dirty jnode lists */
76088+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
76089+ zcount +=
76090+ capture_fuse_jnode_lists(large,
76091+ ATOM_DIRTY_LIST(large, level),
76092+ ATOM_DIRTY_LIST(small, level));
76093+ }
76094+
76095+ /* Splice and update the [clean,dirty] jnode and txnh lists */
76096+ zcount +=
76097+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
76098+ ATOM_CLEAN_LIST(small));
76099+ zcount +=
76100+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
76101+ ATOM_OVRWR_LIST(small));
76102+ zcount +=
76103+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
76104+ ATOM_WB_LIST(small));
76105+ zcount +=
76106+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
76107+ tcount +=
76108+ capture_fuse_txnh_lists(large, &large->txnh_list,
76109+ &small->txnh_list);
76110+
76111+ /* Check our accounting. */
76112+ assert("jmacd-1063",
76113+ zcount + small->num_queued == small->capture_count);
76114+ assert("jmacd-1065", tcount == small->txnh_count);
76115+
76116+ /* sum numbers of waiters threads */
76117+ large->nr_waiters += small->nr_waiters;
76118+ small->nr_waiters = 0;
76119+
76120+ /* splice flush queues */
76121+ reiser4_fuse_fq(large, small);
76122+
76123+ /* update counter of jnode on every atom' list */
76124+ ON_DEBUG(large->dirty += small->dirty;
76125+ small->dirty = 0;
76126+ large->clean += small->clean;
76127+ small->clean = 0;
76128+ large->ovrwr += small->ovrwr;
76129+ small->ovrwr = 0;
76130+ large->wb += small->wb;
76131+ small->wb = 0;
76132+ large->fq += small->fq;
76133+ small->fq = 0;);
76134+
76135+ /* count flushers in result atom */
76136+ large->nr_flushers += small->nr_flushers;
76137+ small->nr_flushers = 0;
76138+
76139+ /* update counts of flushed nodes */
76140+ large->flushed += small->flushed;
76141+ small->flushed = 0;
76142+
76143+ /* Transfer list counts to large. */
76144+ large->txnh_count += small->txnh_count;
76145+ large->capture_count += small->capture_count;
76146+
76147+ /* Add all txnh references to large. */
76148+ atomic_add(small->txnh_count, &large->refcount);
76149+ atomic_sub(small->txnh_count, &small->refcount);
76150+
76151+ /* Reset small counts */
76152+ small->txnh_count = 0;
76153+ small->capture_count = 0;
76154+
76155+ /* Assign the oldest start_time, merge flags. */
76156+ large->start_time = min(large->start_time, small->start_time);
76157+ large->flags |= small->flags;
76158+
76159+ /* Merge blocknr sets. */
76160+ blocknr_set_merge(&small->delete_set, &large->delete_set);
76161+ blocknr_set_merge(&small->wandered_map, &large->wandered_map);
76162+
76163+ /* Merge allocated/deleted file counts */
76164+ large->nr_objects_deleted += small->nr_objects_deleted;
76165+ large->nr_objects_created += small->nr_objects_created;
76166+
76167+ small->nr_objects_deleted = 0;
76168+ small->nr_objects_created = 0;
76169+
76170+ /* Merge allocated blocks counts */
76171+ large->nr_blocks_allocated += small->nr_blocks_allocated;
76172+
76173+ large->nr_running_queues += small->nr_running_queues;
76174+ small->nr_running_queues = 0;
76175+
76176+ /* Merge blocks reserved for overwrite set. */
76177+ large->flush_reserved += small->flush_reserved;
76178+ small->flush_reserved = 0;
76179+
76180+ if (large->stage < small->stage) {
76181+ /* Large only needs to notify if it has changed state. */
76182+ reiser4_atom_set_stage(large, small->stage);
76183+ wakeup_atom_waiting_list(large);
76184+ }
76185+
76186+ reiser4_atom_set_stage(small, ASTAGE_INVALID);
76187+
76188+ /* Notify any waiters--small needs to unload its wait lists. Waiters
76189+ actually remove themselves from the list before returning from the
76190+ fuse_wait function. */
76191+ wakeup_atom_waiting_list(small);
76192+
76193+ /* Unlock atoms */
76194+ spin_unlock_atom(large);
76195+ atom_dec_and_unlock(small);
76196+}
76197+
76198+/* TXNMGR STUFF */
76199+
76200+/* Release a block from the atom, reversing the effects of being captured,
76201+ do not release atom's reference to jnode due to holding spin-locks.
76202+ Currently this is only called when the atom commits.
76203+
76204+ NOTE: this function does not release a (journal) reference to jnode
76205+ due to locking optimizations, you should call jput() somewhere after
76206+ calling reiser4_uncapture_block(). */
76207+void reiser4_uncapture_block(jnode * node)
76208+{
76209+ txn_atom *atom;
76210+
76211+ assert("umka-226", node != NULL);
76212+ atom = node->atom;
76213+ assert("umka-228", atom != NULL);
76214+
76215+ assert("jmacd-1021", node->atom == atom);
76216+ assert_spin_locked(&(node->guard));
76217+ assert("jmacd-1023", atom_is_protected(atom));
76218+
76219+ JF_CLR(node, JNODE_DIRTY);
76220+ JF_CLR(node, JNODE_RELOC);
76221+ JF_CLR(node, JNODE_OVRWR);
76222+ JF_CLR(node, JNODE_CREATED);
76223+ JF_CLR(node, JNODE_WRITEBACK);
76224+ JF_CLR(node, JNODE_REPACK);
76225+
76226+ list_del_init(&node->capture_link);
76227+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
76228+ assert("zam-925", atom_isopen(atom));
76229+ assert("vs-1623", NODE_LIST(node) == FQ_LIST);
76230+ ON_DEBUG(atom->num_queued--);
76231+ JF_CLR(node, JNODE_FLUSH_QUEUED);
76232+ }
76233+ atom->capture_count -= 1;
76234+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
76235+ node->atom = NULL;
76236+
76237+ spin_unlock_jnode(node);
76238+ LOCK_CNT_DEC(t_refs);
76239+}
76240+
76241+/* Unconditional insert of jnode into atom's overwrite list. Currently used in
76242+ bitmap-based allocator code for adding modified bitmap blocks the
76243+ transaction. @atom and @node are spin locked */
76244+void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
76245+{
76246+ assert("zam-538", atom_is_protected(atom));
76247+ assert_spin_locked(&(node->guard));
76248+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
76249+ assert("zam-543", node->atom == NULL);
76250+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
76251+
76252+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
76253+ jref(node);
76254+ node->atom = atom;
76255+ atom->capture_count++;
76256+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
76257+}
76258+
76259+static int count_deleted_blocks_actor(txn_atom * atom,
76260+ const reiser4_block_nr * a,
76261+ const reiser4_block_nr * b, void *data)
76262+{
76263+ reiser4_block_nr *counter = data;
76264+
76265+ assert("zam-995", data != NULL);
76266+ assert("zam-996", a != NULL);
76267+ if (b == NULL)
76268+ *counter += 1;
76269+ else
76270+ *counter += *b;
76271+ return 0;
76272+}
76273+
76274+reiser4_block_nr txnmgr_count_deleted_blocks(void)
76275+{
76276+ reiser4_block_nr result;
76277+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
76278+ txn_atom *atom;
76279+
76280+ result = 0;
76281+
76282+ spin_lock_txnmgr(tmgr);
76283+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
76284+ spin_lock_atom(atom);
76285+ if (atom_isopen(atom))
76286+ blocknr_set_iterator(
76287+ atom, &atom->delete_set,
76288+ count_deleted_blocks_actor, &result, 0);
76289+ spin_unlock_atom(atom);
76290+ }
76291+ spin_unlock_txnmgr(tmgr);
76292+
76293+ return result;
76294+}
76295+
76296+/*
76297+ * Local variables:
76298+ * c-indentation-style: "K&R"
76299+ * mode-name: "LC"
76300+ * c-basic-offset: 8
76301+ * tab-width: 8
76302+ * fill-column: 79
76303+ * End:
76304+ */
76305diff --git a/fs/reiser4/txnmgr.h b/fs/reiser4/txnmgr.h
76306new file mode 100644
76307index 0000000..6ad4b5a
76308--- /dev/null
76309+++ b/fs/reiser4/txnmgr.h
76310@@ -0,0 +1,708 @@
76311+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76312+ * reiser4/README */
76313+
76314+/* data-types and function declarations for transaction manager. See txnmgr.c
76315+ * for details. */
76316+
76317+#ifndef __REISER4_TXNMGR_H__
76318+#define __REISER4_TXNMGR_H__
76319+
76320+#include "forward.h"
76321+#include "dformat.h"
76322+
76323+#include <linux/fs.h>
76324+#include <linux/mm.h>
76325+#include <linux/types.h>
76326+#include <linux/spinlock.h>
76327+#include <asm/atomic.h>
76328+#include <linux/wait.h>
76329+
76330+/* TYPE DECLARATIONS */
76331+
76332+/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
76333+ A capture request dynamically assigns a block to the calling thread's transaction
76334+ handle. */
76335+typedef enum {
76336+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's
76337+ atom should fuse in order to ensure that the block commits atomically with the
76338+ caller. */
76339+ TXN_CAPTURE_READ_ATOMIC = (1 << 0),
76340+
76341+ /* A READ_NONCOM request indicates that a block will be read and that the caller is
76342+ willing to read a non-committed block without causing atoms to fuse. */
76343+ TXN_CAPTURE_READ_NONCOM = (1 << 1),
76344+
76345+ /* A READ_MODIFY request indicates that a block will be read but that the caller
76346+ wishes for the block to be captured as it will be written. This capture request
76347+ mode is not currently used, but eventually it will be useful for preventing
76348+ deadlock in read-modify-write cycles. */
76349+ TXN_CAPTURE_READ_MODIFY = (1 << 2),
76350+
76351+ /* A WRITE capture request indicates that a block will be modified and that atoms
76352+ should fuse to make the commit atomic. */
76353+ TXN_CAPTURE_WRITE = (1 << 3),
76354+
76355+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
76356+ exclusive type designation from extra bits that may be supplied -- see
76357+ below. */
76358+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
76359+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
76360+ TXN_CAPTURE_WRITE),
76361+
76362+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
76363+ indicate modification will occur. */
76364+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
76365+
76366+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
76367+ prefer not to sleep waiting for an aging atom to commit. */
76368+ TXN_CAPTURE_NONBLOCKING = (1 << 4),
76369+
76370+ /* An option to reiser4_try_capture to prevent atom fusion, just simple
76371+ capturing is allowed */
76372+ TXN_CAPTURE_DONT_FUSE = (1 << 5)
76373+
76374+ /* This macro selects only the exclusive capture request types, stripping out any
76375+ options that were supplied (i.e., NONBLOCKING). */
76376+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
76377+} txn_capture;
76378+
76379+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
76380+ difference is in the handling of read requests. A WRITE_FUSING transaction handle
76381+ defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSIONG
76382+ transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
76383+typedef enum {
76384+ TXN_WRITE_FUSING = (1 << 0),
76385+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
76386+} txn_mode;
76387+
76388+/* Every atom has a stage, which is one of these exclusive values: */
76389+typedef enum {
76390+ /* Initially an atom is free. */
76391+ ASTAGE_FREE = 0,
76392+
76393+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
76394+ blocks and fuse with other atoms. */
76395+ ASTAGE_CAPTURE_FUSE = 1,
76396+
76397+ /* We need to have a ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
76398+
76399+ /* When an atom reaches a certain age it must do all it can to commit. An atom in
76400+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
76401+ atoms in the CAPTURE_FUSE stage. */
76402+ ASTAGE_CAPTURE_WAIT = 2,
76403+
76404+ /* Waiting for I/O before commit. Copy-on-capture (see
76405+ http://namesys.com/v4/v4.html). */
76406+ ASTAGE_PRE_COMMIT = 3,
76407+
76408+ /* Post-commit overwrite I/O. Steal-on-capture. */
76409+ ASTAGE_POST_COMMIT = 4,
76410+
76411+ /* Atom which waits for the removal of the last reference to (it? ) to
76412+ * be deleted from memory */
76413+ ASTAGE_DONE = 5,
76414+
76415+ /* invalid atom. */
76416+ ASTAGE_INVALID = 6,
76417+
76418+} txn_stage;
76419+
76420+/* Certain flags may be set in the txn_atom->flags field. */
76421+typedef enum {
76422+ /* Indicates that the atom should commit as soon as possible. */
76423+ ATOM_FORCE_COMMIT = (1 << 0),
76424+ /* to avoid endless loop, mark the atom (which was considered as too
76425+ * small) after failed attempt to fuse it. */
76426+ ATOM_CANCEL_FUSION = (1 << 1)
76427+} txn_flags;
76428+
76429+/* Flags for controlling commit_txnh */
76430+typedef enum {
76431+ /* Wait commit atom completion in commit_txnh */
76432+ TXNH_WAIT_COMMIT = 0x2,
76433+ /* Don't commit atom when this handle is closed */
76434+ TXNH_DONT_COMMIT = 0x4
76435+} txn_handle_flags_t;
76436+
76437+/* TYPE DEFINITIONS */
76438+
76439+/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
76440+ fields, so typically an operation on the atom through either of these objects must (1)
76441+ lock the object, (2) read the atom pointer, (3) lock the atom.
76442+
76443+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
76444+ through the list of handles and pages held by the smaller of the two atoms. For each
76445+ handle and page referencing the smaller atom, the fusing process must: (1) lock the
76446+ object, and (2) update the atom pointer.
76447+
76448+ You can see that there is a conflict of lock ordering here, so the more-complex
76449+ procedure should have priority, i.e., the fusing process has priority so that it is
76450+ guaranteed to make progress and to avoid restarts.
76451+
76452+ This decision, however, means additional complexity for aquiring the atom lock in the
76453+ first place.
76454+
76455+ The general original procedure followed in the code was:
76456+
76457+ TXN_OBJECT *obj = ...;
76458+ TXN_ATOM *atom;
76459+
76460+ spin_lock (& obj->_lock);
76461+
76462+ atom = obj->_atom;
76463+
76464+ if (! spin_trylock_atom (atom))
76465+ {
76466+ spin_unlock (& obj->_lock);
76467+ RESTART OPERATION, THERE WAS A RACE;
76468+ }
76469+
76470+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
76471+
76472+ It has however been found that this wastes CPU a lot in a manner that is
76473+ hard to profile. So, proper refcounting was added to atoms, and new
76474+ standard locking sequence is like following:
76475+
76476+ TXN_OBJECT *obj = ...;
76477+ TXN_ATOM *atom;
76478+
76479+ spin_lock (& obj->_lock);
76480+
76481+ atom = obj->_atom;
76482+
76483+ if (! spin_trylock_atom (atom))
76484+ {
76485+ atomic_inc (& atom->refcount);
76486+ spin_unlock (& obj->_lock);
76487+ spin_lock (&atom->_lock);
76488+ atomic_dec (& atom->refcount);
76489+ // HERE atom is locked
76490+ spin_unlock (&atom->_lock);
76491+ RESTART OPERATION, THERE WAS A RACE;
76492+ }
76493+
76494+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
76495+
76496+ (core of this is implemented in trylock_throttle() function)
76497+
76498+ See the jnode_get_atom() function for a common case.
76499+
76500+ As an additional (and important) optimization allowing to avoid restarts,
76501+ it is possible to re-check required pre-conditions at the HERE point in
76502+ code above and proceed without restarting if they are still satisfied.
76503+*/
76504+
76505+/* An atomic transaction: this is the underlying system representation
76506+ of a transaction, not the one seen by clients.
76507+
76508+ Invariants involving this data-type:
76509+
76510+ [sb-fake-allocated]
76511+*/
76512+struct txn_atom {
76513+ /* The spinlock protecting the atom, held during fusion and various other state
76514+ changes. */
76515+ spinlock_t alock;
76516+
76517+ /* The atom's reference counter, increasing (in case of a duplication
76518+ of an existing reference or when we are sure that some other
76519+ reference exists) may be done without taking spinlock, decrementing
76520+ of the ref. counter requires a spinlock to be held.
76521+
76522+ Each transaction handle counts in ->refcount. All jnodes count as
76523+ one reference acquired in atom_begin_andlock(), released in
76524+ commit_current_atom().
76525+ */
76526+ atomic_t refcount;
76527+
76528+ /* The atom_id identifies the atom in persistent records such as the log. */
76529+ __u32 atom_id;
76530+
76531+ /* Flags holding any of the txn_flags enumerated values (e.g.,
76532+ ATOM_FORCE_COMMIT). */
76533+ __u32 flags;
76534+
76535+ /* Number of open handles. */
76536+ __u32 txnh_count;
76537+
76538+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the
76539+ dirty_nodes[level] and clean_nodes lists. */
76540+ __u32 capture_count;
76541+
76542+#if REISER4_DEBUG
76543+ int clean;
76544+ int dirty;
76545+ int ovrwr;
76546+ int wb;
76547+ int fq;
76548+#endif
76549+
76550+ __u32 flushed;
76551+
76552+ /* Current transaction stage. */
76553+ txn_stage stage;
76554+
76555+ /* Start time. */
76556+ unsigned long start_time;
76557+
76558+ /* The atom's delete set. It collects block numbers of the nodes
76559+ which were deleted during the transaction. */
76560+ struct list_head delete_set;
76561+
76562+ /* The atom's wandered_block mapping. */
76563+ struct list_head wandered_map;
76564+
76565+ /* The transaction's list of dirty captured nodes--per level. Index
76566+ by (level). dirty_nodes[0] is for znode-above-root */
76567+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
76568+
76569+ /* The transaction's list of clean captured nodes. */
76570+ struct list_head clean_nodes;
76571+
76572+ /* The atom's overwrite set */
76573+ struct list_head ovrwr_nodes;
76574+
76575+ /* nodes which are being written to disk */
76576+ struct list_head writeback_nodes;
76577+
76578+ /* list of inodes */
76579+ struct list_head inodes;
76580+
76581+ /* List of handles associated with this atom. */
76582+ struct list_head txnh_list;
76583+
76584+ /* Transaction list link: list of atoms in the transaction manager. */
76585+ struct list_head atom_link;
76586+
76587+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
76588+ struct list_head fwaitfor_list;
76589+
76590+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
76591+ struct list_head fwaiting_list;
76592+
76593+ /* Numbers of objects which were deleted/created in this transaction
76594+ thereby numbers of objects IDs which were released/deallocated. */
76595+ int nr_objects_deleted;
76596+ int nr_objects_created;
76597+ /* number of blocks allocated during the transaction */
76598+ __u64 nr_blocks_allocated;
76599+ /* All atom's flush queue objects are on this list */
76600+ struct list_head flush_queues;
76601+#if REISER4_DEBUG
76602+ /* number of flush queues for this atom. */
76603+ int nr_flush_queues;
76604+ /* Number of jnodes which were removed from atom's lists and put
76605+ on flush_queue */
76606+ int num_queued;
76607+#endif
76608+ /* number of threads who wait for this atom to complete commit */
76609+ int nr_waiters;
76610+ /* number of threads which do jnode_flush() over this atom */
76611+ int nr_flushers;
76612+ /* number of flush queues which are IN_USE and jnodes from fq->prepped
76613+ are submitted to disk by the reiser4_write_fq() routine. */
76614+ int nr_running_queues;
76615+ /* A counter of grabbed unformatted nodes, see a description of the
76616+ * reiser4 space reservation scheme at block_alloc.c */
76617+ reiser4_block_nr flush_reserved;
76618+#if REISER4_DEBUG
76619+ void *committer;
76620+#endif
76621+ struct super_block *super;
76622+};
76623+
76624+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
76625+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
76626+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
76627+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
76628+#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
76629+
76630+#define NODE_LIST(node) (node)->list
76631+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
76632+ON_DEBUG(void
76633+ count_jnode(txn_atom *, jnode *, atom_list old_list,
76634+ atom_list new_list, int check_lists));
76635+
76636+typedef struct protected_jnodes {
76637+ struct list_head inatom; /* link to atom's list these structures */
76638+ struct list_head nodes; /* head of list of protected nodes */
76639+} protected_jnodes;
76640+
76641+/* A transaction handle: the client obtains and commits this handle which is assigned by
76642+ the system to a txn_atom. */
76643+struct txn_handle {
76644+ /* Spinlock protecting ->atom pointer */
76645+ spinlock_t hlock;
76646+
76647+ /* Flags for controlling commit_txnh() behavior */
76648+ /* from txn_handle_flags_t */
76649+ txn_handle_flags_t flags;
76650+
76651+ /* Whether it is READ_FUSING or WRITE_FUSING. */
76652+ txn_mode mode;
76653+
76654+ /* If assigned, the atom it is part of. */
76655+ txn_atom *atom;
76656+
76657+ /* Transaction list link. Head is in txn_atom. */
76658+ struct list_head txnh_link;
76659+};
76660+
76661+/* The transaction manager: one is contained in the reiser4_super_info_data */
76662+struct txn_mgr {
76663+ /* A spinlock protecting the atom list, id_count, flush_control */
76664+ spinlock_t tmgr_lock;
76665+
76666+ /* List of atoms. */
76667+ struct list_head atoms_list;
76668+
76669+ /* Number of atoms. */
76670+ int atom_count;
76671+
76672+ /* A counter used to assign atom->atom_id values. */
76673+ __u32 id_count;
76674+
76675+ /* a mutex object for commit serialization */
76676+ struct mutex commit_mutex;
76677+
76678+ /* a list of all txnmrgs served by particular daemon. */
76679+ struct list_head linkage;
76680+
76681+ /* description of daemon for this txnmgr */
76682+ ktxnmgrd_context *daemon;
76683+
76684+ /* parameters. Adjustable through mount options. */
76685+ unsigned int atom_max_size;
76686+ unsigned int atom_max_age;
76687+ unsigned int atom_min_size;
76688+ /* max number of concurrent flushers for one atom, 0 - unlimited. */
76689+ unsigned int atom_max_flushers;
76690+ struct dentry *debugfs_atom_count;
76691+ struct dentry *debugfs_id_count;
76692+};
76693+
76694+/* FUNCTION DECLARATIONS */
76695+
76696+/* These are the externally (within Reiser4) visible transaction functions, therefore they
76697+ are prefixed with "txn_". For comments, see txnmgr.c. */
76698+
76699+extern int init_txnmgr_static(void);
76700+extern void done_txnmgr_static(void);
76701+
76702+extern void reiser4_init_txnmgr(txn_mgr *);
76703+extern void reiser4_done_txnmgr(txn_mgr *);
76704+
76705+extern int reiser4_txn_reserve(int reserved);
76706+
76707+extern void reiser4_txn_begin(reiser4_context * context);
76708+extern int reiser4_txn_end(reiser4_context * context);
76709+
76710+extern void reiser4_txn_restart(reiser4_context * context);
76711+extern void reiser4_txn_restart_current(void);
76712+
76713+extern int txnmgr_force_commit_all(struct super_block *, int);
76714+extern int current_atom_should_commit(void);
76715+
76716+extern jnode *find_first_dirty_jnode(txn_atom *, int);
76717+
76718+extern int commit_some_atoms(txn_mgr *);
76719+extern int force_commit_atom(txn_handle *);
76720+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
76721+
76722+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
76723+
76724+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
76725+
76726+extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
76727+ int alloc_value);
76728+extern void atom_dec_and_unlock(txn_atom * atom);
76729+
76730+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
76731+extern int try_capture_page_to_invalidate(struct page *pg);
76732+
76733+extern void reiser4_uncapture_page(struct page *pg);
76734+extern void reiser4_uncapture_block(jnode *);
76735+extern void reiser4_uncapture_jnode(jnode *);
76736+
76737+extern int reiser4_capture_inode(struct inode *);
76738+extern int reiser4_uncapture_inode(struct inode *);
76739+
76740+extern txn_atom *get_current_atom_locked_nocheck(void);
76741+
76742+#if REISER4_DEBUG
76743+
76744+/**
76745+ * atom_is_protected - make sure that nobody but us can do anything with atom
76746+ * @atom: atom to be checked
76747+ *
76748+ * This is used to assert that atom either entered commit stages or is spin
76749+ * locked.
76750+ */
76751+static inline int atom_is_protected(txn_atom *atom)
76752+{
76753+ if (atom->stage >= ASTAGE_PRE_COMMIT)
76754+ return 1;
76755+ assert_spin_locked(&(atom->alock));
76756+ return 1;
76757+}
76758+
76759+#endif
76760+
76761+/* Get the current atom and spinlock it if current atom present. May not return NULL */
76762+static inline txn_atom *get_current_atom_locked(void)
76763+{
76764+ txn_atom *atom;
76765+
76766+ atom = get_current_atom_locked_nocheck();
76767+ assert("zam-761", atom != NULL);
76768+
76769+ return atom;
76770+}
76771+
76772+extern txn_atom *jnode_get_atom(jnode *);
76773+
76774+extern void reiser4_atom_wait_event(txn_atom *);
76775+extern void reiser4_atom_send_event(txn_atom *);
76776+
76777+extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
76778+extern int reiser4_capture_super_block(struct super_block *s);
76779+int capture_bulk(jnode **, int count);
76780+
76781+/* See the comment on the function blocknrset.c:blocknr_set_add for the
76782+ calling convention of these three routines. */
76783+extern void blocknr_set_init(struct list_head * bset);
76784+extern void blocknr_set_destroy(struct list_head * bset);
76785+extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
76786+extern int blocknr_set_add_extent(txn_atom * atom,
76787+ struct list_head * bset,
76788+ blocknr_set_entry ** new_bsep,
76789+ const reiser4_block_nr * start,
76790+ const reiser4_block_nr * len);
76791+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
76792+ blocknr_set_entry ** new_bsep,
76793+ const reiser4_block_nr * a,
76794+ const reiser4_block_nr * b);
76795+
76796+typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76797+ const reiser4_block_nr *, void *);
76798+
76799+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
76800+ blocknr_set_actor_f actor, void *data,
76801+ int delete);
76802+
76803+/* flush code takes care about how to fuse flush queues */
76804+extern void flush_init_atom(txn_atom * atom);
76805+extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76806+
76807+static inline void spin_lock_atom(txn_atom *atom)
76808+{
76809+ /* check that spinlocks of lower priorities are not held */
76810+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76811+ LOCK_CNT_NIL(spin_locked_atom) &&
76812+ LOCK_CNT_NIL(spin_locked_jnode) &&
76813+ LOCK_CNT_NIL(spin_locked_zlock) &&
76814+ LOCK_CNT_NIL(rw_locked_dk) &&
76815+ LOCK_CNT_NIL(rw_locked_tree)));
76816+
76817+ spin_lock(&(atom->alock));
76818+
76819+ LOCK_CNT_INC(spin_locked_atom);
76820+ LOCK_CNT_INC(spin_locked);
76821+}
76822+
76823+static inline void spin_lock_atom_nested(txn_atom *atom)
76824+{
76825+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76826+ LOCK_CNT_NIL(spin_locked_jnode) &&
76827+ LOCK_CNT_NIL(spin_locked_zlock) &&
76828+ LOCK_CNT_NIL(rw_locked_dk) &&
76829+ LOCK_CNT_NIL(rw_locked_tree)));
76830+
76831+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
76832+
76833+ LOCK_CNT_INC(spin_locked_atom);
76834+ LOCK_CNT_INC(spin_locked);
76835+}
76836+
76837+static inline int spin_trylock_atom(txn_atom *atom)
76838+{
76839+ if (spin_trylock(&(atom->alock))) {
76840+ LOCK_CNT_INC(spin_locked_atom);
76841+ LOCK_CNT_INC(spin_locked);
76842+ return 1;
76843+ }
76844+ return 0;
76845+}
76846+
76847+static inline void spin_unlock_atom(txn_atom *atom)
76848+{
76849+ assert_spin_locked(&(atom->alock));
76850+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76851+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76852+
76853+ LOCK_CNT_DEC(spin_locked_atom);
76854+ LOCK_CNT_DEC(spin_locked);
76855+
76856+ spin_unlock(&(atom->alock));
76857+}
76858+
76859+static inline void spin_lock_txnh(txn_handle *txnh)
76860+{
76861+ /* check that spinlocks of lower priorities are not held */
76862+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76863+ LOCK_CNT_NIL(spin_locked_zlock) &&
76864+ LOCK_CNT_NIL(rw_locked_tree)));
76865+
76866+ spin_lock(&(txnh->hlock));
76867+
76868+ LOCK_CNT_INC(spin_locked_txnh);
76869+ LOCK_CNT_INC(spin_locked);
76870+}
76871+
76872+static inline int spin_trylock_txnh(txn_handle *txnh)
76873+{
76874+ if (spin_trylock(&(txnh->hlock))) {
76875+ LOCK_CNT_INC(spin_locked_txnh);
76876+ LOCK_CNT_INC(spin_locked);
76877+ return 1;
76878+ }
76879+ return 0;
76880+}
76881+
76882+static inline void spin_unlock_txnh(txn_handle *txnh)
76883+{
76884+ assert_spin_locked(&(txnh->hlock));
76885+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76886+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76887+
76888+ LOCK_CNT_DEC(spin_locked_txnh);
76889+ LOCK_CNT_DEC(spin_locked);
76890+
76891+ spin_unlock(&(txnh->hlock));
76892+}
76893+
76894+#define spin_ordering_pred_txnmgr(tmgr) \
76895+ ( LOCK_CNT_NIL(spin_locked_atom) && \
76896+ LOCK_CNT_NIL(spin_locked_txnh) && \
76897+ LOCK_CNT_NIL(spin_locked_jnode) && \
76898+ LOCK_CNT_NIL(rw_locked_zlock) && \
76899+ LOCK_CNT_NIL(rw_locked_dk) && \
76900+ LOCK_CNT_NIL(rw_locked_tree) )
76901+
76902+static inline void spin_lock_txnmgr(txn_mgr *mgr)
76903+{
76904+ /* check that spinlocks of lower priorities are not held */
76905+ assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76906+ LOCK_CNT_NIL(spin_locked_txnh) &&
76907+ LOCK_CNT_NIL(spin_locked_jnode) &&
76908+ LOCK_CNT_NIL(spin_locked_zlock) &&
76909+ LOCK_CNT_NIL(rw_locked_dk) &&
76910+ LOCK_CNT_NIL(rw_locked_tree)));
76911+
76912+ spin_lock(&(mgr->tmgr_lock));
76913+
76914+ LOCK_CNT_INC(spin_locked_txnmgr);
76915+ LOCK_CNT_INC(spin_locked);
76916+}
76917+
76918+static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76919+{
76920+ if (spin_trylock(&(mgr->tmgr_lock))) {
76921+ LOCK_CNT_INC(spin_locked_txnmgr);
76922+ LOCK_CNT_INC(spin_locked);
76923+ return 1;
76924+ }
76925+ return 0;
76926+}
76927+
76928+static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76929+{
76930+ assert_spin_locked(&(mgr->tmgr_lock));
76931+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76932+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76933+
76934+ LOCK_CNT_DEC(spin_locked_txnmgr);
76935+ LOCK_CNT_DEC(spin_locked);
76936+
76937+ spin_unlock(&(mgr->tmgr_lock));
76938+}
76939+
76940+typedef enum {
76941+ FQ_IN_USE = 0x1
76942+} flush_queue_state_t;
76943+
76944+typedef struct flush_queue flush_queue_t;
76945+
76946+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76947+ is filled by the jnode_flush() routine, and written to disk under memory
76948+ pressure or at atom commit time. */
76949+/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
76950+ field and fq->prepped list can be modified if atom is spin-locked and fq
76951+ object is "in-use" state. For read-only traversal of the fq->prepped list
76952+ and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
76953+ only have atom spin-locked. */
76954+struct flush_queue {
76955+ /* linkage element is the first in this structure to make debugging
76956+ easier. See field in atom struct for description of list. */
76957+ struct list_head alink;
76958+ /* A spinlock to protect changes of fq state and fq->atom pointer */
76959+ spinlock_t guard;
76960+ /* flush_queue state: [in_use | ready] */
76961+ flush_queue_state_t state;
76962+ /* A list which contains queued nodes, queued nodes are removed from any
76963+ * atom's list and put on this ->prepped one. */
76964+ struct list_head prepped;
76965+ /* number of submitted i/o requests */
76966+ atomic_t nr_submitted;
76967+ /* number of i/o errors */
76968+ atomic_t nr_errors;
76969+ /* An atom this flush queue is attached to */
76970+ txn_atom *atom;
76971+ /* A wait queue head to wait on i/o completion */
76972+ wait_queue_head_t wait;
76973+#if REISER4_DEBUG
76974+ /* A thread which took this fq in exclusive use, NULL if fq is free,
76975+ * used for debugging. */
76976+ struct task_struct *owner;
76977+#endif
76978+};
76979+
76980+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
76981+extern void reiser4_fq_put_nolock(flush_queue_t *);
76982+extern void reiser4_fq_put(flush_queue_t *);
76983+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
76984+extern void queue_jnode(flush_queue_t *, jnode *);
76985+
76986+extern int reiser4_write_fq(flush_queue_t *, long *, int);
76987+extern int current_atom_finish_all_fq(void);
76988+extern void init_atom_fq_parts(txn_atom *);
76989+
76990+extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76991+
76992+extern void znode_make_dirty(znode * node);
76993+extern void jnode_make_dirty_locked(jnode * node);
76994+
76995+extern int reiser4_sync_atom(txn_atom * atom);
76996+
76997+#if REISER4_DEBUG
76998+extern int atom_fq_parts_are_clean(txn_atom *);
76999+#endif
77000+
77001+extern void add_fq_to_bio(flush_queue_t *, struct bio *);
77002+extern flush_queue_t *get_fq_for_current_atom(void);
77003+
77004+void protected_jnodes_init(protected_jnodes * list);
77005+void protected_jnodes_done(protected_jnodes * list);
77006+void reiser4_invalidate_list(struct list_head * head);
77007+
77008+# endif /* __REISER4_TXNMGR_H__ */
77009+
77010+/* Make Linus happy.
77011+ Local variables:
77012+ c-indentation-style: "K&R"
77013+ mode-name: "LC"
77014+ c-basic-offset: 8
77015+ tab-width: 8
77016+ fill-column: 120
77017+ End:
77018+*/
77019diff --git a/fs/reiser4/type_safe_hash.h b/fs/reiser4/type_safe_hash.h
77020new file mode 100644
77021index 0000000..b2fdacd
77022--- /dev/null
77023+++ b/fs/reiser4/type_safe_hash.h
77024@@ -0,0 +1,320 @@
77025+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77026+ * reiser4/README */
77027+
77028+/* A hash table class that uses hash chains (singly-linked) and is
77029+ parametrized to provide type safety. */
77030+
77031+#ifndef __REISER4_TYPE_SAFE_HASH_H__
77032+#define __REISER4_TYPE_SAFE_HASH_H__
77033+
77034+#include "debug.h"
77035+
77036+#include <asm/errno.h>
77037+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
77038+ based on the object type. You need to declare the item type before
77039+ this definition, define it after this definition. */
77040+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
77041+ \
77042+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
77043+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
77044+ \
77045+struct PREFIX##_hash_table_ \
77046+{ \
77047+ ITEM_TYPE **_table; \
77048+ __u32 _buckets; \
77049+}; \
77050+ \
77051+struct PREFIX##_hash_link_ \
77052+{ \
77053+ ITEM_TYPE *_next; \
77054+}
77055+
77056+/* Step 2: Define the object type of the hash: give it field of type
77057+ PREFIX_hash_link. */
77058+
77059+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
77060+ the type and field name used in step 3. The arguments are:
77061+
77062+ ITEM_TYPE The item type being hashed
77063+ KEY_TYPE The type of key being hashed
77064+ KEY_NAME The name of the key field within the item
77065+ LINK_NAME The name of the link field within the item, which you must make type PREFIX_hash_link)
77066+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
77067+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
77068+
77069+ It implements these functions:
77070+
77071+ prefix_hash_init Initialize the table given its size.
77072+ prefix_hash_insert Insert an item
77073+ prefix_hash_insert_index Insert an item w/ precomputed hash_index
77074+ prefix_hash_find Find an item by key
77075+ prefix_hash_find_index Find an item w/ precomputed hash_index
77076+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
77077+ prefix_hash_remove_index Remove an item w/ precomputed hash_index
77078+
77079+ If you'd like something to be done differently, feel free to ask me
77080+ for modifications. Additional features that could be added but
77081+ have not been:
77082+
77083+ prefix_hash_remove_key Find and remove an item by key
77084+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
77085+
77086+ The hash_function currently receives only the key as an argument,
77087+ meaning it must somehow know the number of buckets. If this is a
77088+ problem let me know.
77089+
77090+ This hash table uses a single-linked hash chain. This means
77091+ insertion is fast but deletion requires searching the chain.
77092+
77093+ There is also the doubly-linked hash chain approach, under which
77094+ deletion requires no search but the code is longer and it takes two
77095+ pointers per item.
77096+
77097+ The circularly-linked approach has the shortest code but requires
77098+ two pointers per bucket, doubling the size of the bucket array (in
77099+ addition to two pointers per item).
77100+*/
77101+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
77102+ \
77103+static __inline__ void \
77104+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
77105+ __u32 hash UNUSED_ARG) \
77106+{ \
77107+ assert("nikita-2780", hash < table->_buckets); \
77108+} \
77109+ \
77110+static __inline__ int \
77111+PREFIX##_hash_init (PREFIX##_hash_table *hash, \
77112+ __u32 buckets) \
77113+{ \
77114+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
77115+ hash->_buckets = buckets; \
77116+ if (hash->_table == NULL) \
77117+ { \
77118+ return RETERR(-ENOMEM); \
77119+ } \
77120+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
77121+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
77122+ return 0; \
77123+} \
77124+ \
77125+static __inline__ void \
77126+PREFIX##_hash_done (PREFIX##_hash_table *hash) \
77127+{ \
77128+ if (REISER4_DEBUG && hash->_table != NULL) { \
77129+ __u32 i; \
77130+ for (i = 0 ; i < hash->_buckets ; ++ i) \
77131+ assert("nikita-2905", hash->_table[i] == NULL); \
77132+ } \
77133+ if (hash->_table != NULL) \
77134+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
77135+ hash->_table = NULL; \
77136+} \
77137+ \
77138+static __inline__ void \
77139+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
77140+{ \
77141+ prefetch(item->LINK_NAME._next); \
77142+} \
77143+ \
77144+static __inline__ void \
77145+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
77146+ __u32 index) \
77147+{ \
77148+ prefetch(hash->_table[index]); \
77149+} \
77150+ \
77151+static __inline__ ITEM_TYPE* \
77152+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
77153+ __u32 hash_index, \
77154+ KEY_TYPE const *find_key) \
77155+{ \
77156+ ITEM_TYPE *item; \
77157+ \
77158+ PREFIX##_check_hash(hash, hash_index); \
77159+ \
77160+ for (item = hash->_table[hash_index]; \
77161+ item != NULL; \
77162+ item = item->LINK_NAME._next) \
77163+ { \
77164+ prefetch(item->LINK_NAME._next); \
77165+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
77166+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \
77167+ { \
77168+ return item; \
77169+ } \
77170+ } \
77171+ \
77172+ return NULL; \
77173+} \
77174+ \
77175+static __inline__ ITEM_TYPE* \
77176+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
77177+ __u32 hash_index, \
77178+ KEY_TYPE const *find_key) \
77179+{ \
77180+ ITEM_TYPE ** item = &hash->_table[hash_index]; \
77181+ \
77182+ PREFIX##_check_hash(hash, hash_index); \
77183+ \
77184+ while (*item != NULL) { \
77185+ prefetch(&(*item)->LINK_NAME._next); \
77186+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
77187+ ITEM_TYPE *found; \
77188+ \
77189+ found = *item; \
77190+ *item = found->LINK_NAME._next; \
77191+ found->LINK_NAME._next = hash->_table[hash_index]; \
77192+ hash->_table[hash_index] = found; \
77193+ return found; \
77194+ } \
77195+ item = &(*item)->LINK_NAME._next; \
77196+ } \
77197+ return NULL; \
77198+} \
77199+ \
77200+static __inline__ int \
77201+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
77202+ __u32 hash_index, \
77203+ ITEM_TYPE *del_item) \
77204+{ \
77205+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
77206+ \
77207+ PREFIX##_check_hash(hash, hash_index); \
77208+ \
77209+ while (*hash_item_p != NULL) { \
77210+ prefetch(&(*hash_item_p)->LINK_NAME._next); \
77211+ if (*hash_item_p == del_item) { \
77212+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
77213+ return 1; \
77214+ } \
77215+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
77216+ } \
77217+ return 0; \
77218+} \
77219+ \
77220+static __inline__ void \
77221+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
77222+ __u32 hash_index, \
77223+ ITEM_TYPE *ins_item) \
77224+{ \
77225+ PREFIX##_check_hash(hash, hash_index); \
77226+ \
77227+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
77228+ hash->_table[hash_index] = ins_item; \
77229+} \
77230+ \
77231+static __inline__ void \
77232+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
77233+ __u32 hash_index, \
77234+ ITEM_TYPE *ins_item) \
77235+{ \
77236+ PREFIX##_check_hash(hash, hash_index); \
77237+ \
77238+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
77239+ smp_wmb(); \
77240+ hash->_table[hash_index] = ins_item; \
77241+} \
77242+ \
77243+static __inline__ ITEM_TYPE* \
77244+PREFIX##_hash_find (PREFIX##_hash_table *hash, \
77245+ KEY_TYPE const *find_key) \
77246+{ \
77247+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
77248+} \
77249+ \
77250+static __inline__ ITEM_TYPE* \
77251+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
77252+ KEY_TYPE const *find_key) \
77253+{ \
77254+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
77255+} \
77256+ \
77257+static __inline__ int \
77258+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
77259+ ITEM_TYPE *del_item) \
77260+{ \
77261+ return PREFIX##_hash_remove_index (hash, \
77262+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
77263+} \
77264+ \
77265+static __inline__ int \
77266+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
77267+ ITEM_TYPE *del_item) \
77268+{ \
77269+ return PREFIX##_hash_remove (hash, del_item); \
77270+} \
77271+ \
77272+static __inline__ void \
77273+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
77274+ ITEM_TYPE *ins_item) \
77275+{ \
77276+ return PREFIX##_hash_insert_index (hash, \
77277+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
77278+} \
77279+ \
77280+static __inline__ void \
77281+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
77282+ ITEM_TYPE *ins_item) \
77283+{ \
77284+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
77285+ ins_item); \
77286+} \
77287+ \
77288+static __inline__ ITEM_TYPE * \
77289+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
77290+{ \
77291+ ITEM_TYPE *first; \
77292+ \
77293+ for (first = NULL; ind < hash->_buckets; ++ ind) { \
77294+ first = hash->_table[ind]; \
77295+ if (first != NULL) \
77296+ break; \
77297+ } \
77298+ return first; \
77299+} \
77300+ \
77301+static __inline__ ITEM_TYPE * \
77302+PREFIX##_hash_next (PREFIX##_hash_table *hash, \
77303+ ITEM_TYPE *item) \
77304+{ \
77305+ ITEM_TYPE *next; \
77306+ \
77307+ if (item == NULL) \
77308+ return NULL; \
77309+ next = item->LINK_NAME._next; \
77310+ if (next == NULL) \
77311+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
77312+ return next; \
77313+} \
77314+ \
77315+typedef struct {} PREFIX##_hash_dummy
77316+
77317+#define for_all_ht_buckets(table, head) \
77318+for ((head) = &(table) -> _table[ 0 ] ; \
77319+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
77320+
77321+#define for_all_in_bucket(bucket, item, next, field) \
77322+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
77323+ (item) != NULL ; \
77324+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
77325+
77326+#define for_all_in_htable(table, prefix, item, next) \
77327+for ((item) = prefix ## _hash_first ((table), 0), \
77328+ (next) = prefix ## _hash_next ((table), (item)) ; \
77329+ (item) != NULL ; \
77330+ (item) = (next), \
77331+ (next) = prefix ## _hash_next ((table), (item)))
77332+
77333+/* __REISER4_TYPE_SAFE_HASH_H__ */
77334+#endif
77335+
77336+/* Make Linus happy.
77337+ Local variables:
77338+ c-indentation-style: "K&R"
77339+ mode-name: "LC"
77340+ c-basic-offset: 8
77341+ tab-width: 8
77342+ fill-column: 120
77343+ End:
77344+*/
77345diff --git a/fs/reiser4/vfs_ops.c b/fs/reiser4/vfs_ops.c
77346new file mode 100644
77347index 0000000..31afd3e
77348--- /dev/null
77349+++ b/fs/reiser4/vfs_ops.c
77350@@ -0,0 +1,259 @@
77351+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77352+ * reiser4/README */
77353+
77354+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
77355+ here. */
77356+
77357+#include "forward.h"
77358+#include "debug.h"
77359+#include "dformat.h"
77360+#include "coord.h"
77361+#include "plugin/item/item.h"
77362+#include "plugin/file/file.h"
77363+#include "plugin/security/perm.h"
77364+#include "plugin/disk_format/disk_format.h"
77365+#include "plugin/plugin.h"
77366+#include "plugin/plugin_set.h"
77367+#include "plugin/object.h"
77368+#include "txnmgr.h"
77369+#include "jnode.h"
77370+#include "znode.h"
77371+#include "block_alloc.h"
77372+#include "tree.h"
77373+#include "vfs_ops.h"
77374+#include "inode.h"
77375+#include "page_cache.h"
77376+#include "ktxnmgrd.h"
77377+#include "super.h"
77378+#include "reiser4.h"
77379+#include "entd.h"
77380+#include "status_flags.h"
77381+#include "flush.h"
77382+#include "dscale.h"
77383+
77384+#include <linux/profile.h>
77385+#include <linux/types.h>
77386+#include <linux/mount.h>
77387+#include <linux/vfs.h>
77388+#include <linux/mm.h>
77389+#include <linux/buffer_head.h>
77390+#include <linux/dcache.h>
77391+#include <linux/list.h>
77392+#include <linux/pagemap.h>
77393+#include <linux/slab.h>
77394+#include <linux/seq_file.h>
77395+#include <linux/init.h>
77396+#include <linux/module.h>
77397+#include <linux/writeback.h>
77398+#include <linux/blkdev.h>
77399+#include <linux/quotaops.h>
77400+#include <linux/security.h>
77401+#include <linux/reboot.h>
77402+#include <linux/rcupdate.h>
77403+
77404+/* update inode stat-data by calling plugin */
77405+int reiser4_update_sd(struct inode *object)
77406+{
77407+ file_plugin *fplug;
77408+
77409+ assert("nikita-2338", object != NULL);
77410+ /* check for read-only file system. */
77411+ if (IS_RDONLY(object))
77412+ return 0;
77413+
77414+ fplug = inode_file_plugin(object);
77415+ assert("nikita-2339", fplug != NULL);
77416+ return fplug->write_sd_by_inode(object);
77417+}
77418+
77419+/* helper function: increase inode nlink count and call plugin method to save
77420+ updated stat-data.
77421+
77422+ Used by link/create and during creation of dot and dotdot in mkdir
77423+*/
77424+int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
77425+ struct inode *parent /* parent where new entry will be */
77426+ ,
77427+ int write_sd_p /* true if stat-data has to be
77428+ * updated */ )
77429+{
77430+ file_plugin *fplug;
77431+ int result;
77432+
77433+ assert("nikita-1351", object != NULL);
77434+
77435+ fplug = inode_file_plugin(object);
77436+ assert("nikita-1445", fplug != NULL);
77437+
77438+ /* ask plugin whether it can add yet another link to this
77439+ object */
77440+ if (!fplug->can_add_link(object))
77441+ return RETERR(-EMLINK);
77442+
77443+ assert("nikita-2211", fplug->add_link != NULL);
77444+ /* call plugin to do actual addition of link */
77445+ result = fplug->add_link(object, parent);
77446+
77447+ /* optionally update stat data */
77448+ if (result == 0 && write_sd_p)
77449+ result = fplug->write_sd_by_inode(object);
77450+ return result;
77451+}
77452+
77453+/* helper function: decrease inode nlink count and call plugin method to save
77454+ updated stat-data.
77455+
77456+ Used by unlink/create
77457+*/
77458+int reiser4_del_nlink(struct inode *object /* object from which link is
77459+ * removed */ ,
77460+ struct inode *parent /* parent where entry was */ ,
77461+		      int write_sd_p	/* true if stat-data has to be
77462+ * updated */ )
77463+{
77464+ file_plugin *fplug;
77465+ int result;
77466+
77467+ assert("nikita-1349", object != NULL);
77468+
77469+ fplug = inode_file_plugin(object);
77470+ assert("nikita-1350", fplug != NULL);
77471+ assert("nikita-1446", object->i_nlink > 0);
77472+ assert("nikita-2210", fplug->rem_link != NULL);
77473+
77474+ /* call plugin to do actual deletion of link */
77475+ result = fplug->rem_link(object, parent);
77476+
77477+ /* optionally update stat data */
77478+ if (result == 0 && write_sd_p)
77479+ result = fplug->write_sd_by_inode(object);
77480+ return result;
77481+}
77482+
77483+/* Release reiser4 dentry. This is d_op->d_release() method. */
77484+static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
77485+{
77486+ reiser4_free_dentry_fsdata(dentry);
77487+}
77488+
77489+/*
77490+ * Called by reiser4_sync_inodes(), during speculative write-back (through
77491+ * pdflush, or balance_dirty_pages()).
77492+ */
77493+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
77494+{
77495+ long written = 0;
77496+ int repeats = 0;
77497+ int result;
77498+ struct address_space *mapping;
77499+
77500+ /*
77501+ * Performs early flushing, trying to free some memory. If there is
77502+ * nothing to flush, commits some atoms.
77503+ */
77504+
77505+ /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
77506+ sys_fsync(). */
77507+ if (wbc->sync_mode != WB_SYNC_NONE) {
77508+ txnmgr_force_commit_all(sb, 0);
77509+ return;
77510+ }
77511+
77512+ BUG_ON(reiser4_get_super_fake(sb) == NULL);
77513+ mapping = reiser4_get_super_fake(sb)->i_mapping;
77514+ do {
77515+ long nr_submitted = 0;
77516+ jnode *node = NULL;
77517+
77518+ /* do not put more requests to overload write queue */
77519+ if (wbc->nonblocking &&
77520+ bdi_write_congested(mapping->backing_dev_info)) {
77521+ blk_run_address_space(mapping);
77522+ wbc->encountered_congestion = 1;
77523+ break;
77524+ }
77525+ repeats++;
77526+ BUG_ON(wbc->nr_to_write <= 0);
77527+
77528+ if (get_current_context()->entd) {
77529+ entd_context *ent = get_entd_context(sb);
77530+
77531+ if (ent->cur_request->node)
77532+ /*
77533+ * this is ent thread and it managed to capture
77534+ * requested page itself - start flush from
77535+ * that page
77536+ */
77537+ node = jref(ent->cur_request->node);
77538+ }
77539+
77540+ result = flush_some_atom(node, &nr_submitted, wbc,
77541+ JNODE_FLUSH_WRITE_BLOCKS);
77542+ if (result != 0)
77543+ warning("nikita-31001", "Flush failed: %i", result);
77544+ if (node)
77545+ jput(node);
77546+ if (!nr_submitted)
77547+ break;
77548+
77549+ wbc->nr_to_write -= nr_submitted;
77550+ written += nr_submitted;
77551+ } while (wbc->nr_to_write > 0);
77552+}
77553+
77554+void reiser4_throttle_write(struct inode *inode)
77555+{
77556+ reiser4_txn_restart_current();
77557+ balance_dirty_pages_ratelimited(inode->i_mapping);
77558+}
77559+
77560+const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
77561+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
77562+ * beginning of device */
77563+
77564+/*
77565+ * Reiser4 initialization/shutdown.
77566+ *
77567+ * Code below performs global reiser4 initialization that is done either as
77568+ * part of kernel initialization (when reiser4 is statically built-in), or
77569+ * during reiser4 module load (when compiled as module).
77570+ */
77571+
77572+void reiser4_handle_error(void)
77573+{
77574+ struct super_block *sb = reiser4_get_current_sb();
77575+
77576+ if (!sb)
77577+ return;
77578+ reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
77579+ "Filesystem error occured");
77580+ switch (get_super_private(sb)->onerror) {
77581+ case 0:
77582+ reiser4_panic("foobar-42", "Filesystem error occured\n");
77583+ case 1:
77584+ default:
77585+ if (sb->s_flags & MS_RDONLY)
77586+ return;
77587+ sb->s_flags |= MS_RDONLY;
77588+ break;
77589+ }
77590+}
77591+
77592+struct dentry_operations reiser4_dentry_operations = {
77593+ .d_revalidate = NULL,
77594+ .d_hash = NULL,
77595+ .d_compare = NULL,
77596+ .d_delete = NULL,
77597+ .d_release = reiser4_d_release,
77598+ .d_iput = NULL,
77599+};
77600+
77601+/* Make Linus happy.
77602+ Local variables:
77603+ c-indentation-style: "K&R"
77604+ mode-name: "LC"
77605+ c-basic-offset: 8
77606+ tab-width: 8
77607+ fill-column: 120
77608+ End:
77609+*/
77610diff --git a/fs/reiser4/vfs_ops.h b/fs/reiser4/vfs_ops.h
77611new file mode 100644
77612index 0000000..03e16ce
77613--- /dev/null
77614+++ b/fs/reiser4/vfs_ops.h
77615@@ -0,0 +1,53 @@
77616+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77617+ * reiser4/README */
77618+
77619+/* vfs_ops.c's exported symbols */
77620+
77621+#if !defined( __FS_REISER4_VFS_OPS_H__ )
77622+#define __FS_REISER4_VFS_OPS_H__
77623+
77624+#include "forward.h"
77625+#include "coord.h"
77626+#include "seal.h"
77627+#include "plugin/file/file.h"
77628+#include "super.h"
77629+#include "readahead.h"
77630+
77631+#include <linux/types.h> /* for loff_t */
77632+#include <linux/fs.h> /* for struct address_space */
77633+#include <linux/dcache.h> /* for struct dentry */
77634+#include <linux/mm.h>
77635+#include <linux/backing-dev.h>
77636+
77637+/* address space operations */
77638+int reiser4_writepage(struct page *, struct writeback_control *);
77639+int reiser4_set_page_dirty(struct page *);
77640+void reiser4_invalidatepage(struct page *, unsigned long offset);
77641+int reiser4_releasepage(struct page *, gfp_t);
77642+
77643+extern int reiser4_update_sd(struct inode *);
77644+extern int reiser4_add_nlink(struct inode *, struct inode *, int);
77645+extern int reiser4_del_nlink(struct inode *, struct inode *, int);
77646+
77647+extern int reiser4_start_up_io(struct page *page);
77648+extern void reiser4_throttle_write(struct inode *);
77649+extern int jnode_is_releasable(jnode *);
77650+
77651+#define CAPTURE_APAGE_BURST (1024l)
77652+void reiser4_writeout(struct super_block *, struct writeback_control *);
77653+
77654+extern void reiser4_handle_error(void);
77655+
77656+/* __FS_REISER4_VFS_OPS_H__ */
77657+#endif
77658+
77659+/* Make Linus happy.
77660+ Local variables:
77661+ c-indentation-style: "K&R"
77662+ mode-name: "LC"
77663+ c-basic-offset: 8
77664+ tab-width: 8
77665+ fill-column: 120
77666+ scroll-step: 1
77667+ End:
77668+*/
77669diff --git a/fs/reiser4/wander.c b/fs/reiser4/wander.c
77670new file mode 100644
77671index 0000000..6d1d1d9
77672--- /dev/null
77673+++ b/fs/reiser4/wander.c
77674@@ -0,0 +1,1797 @@
77675+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77676+ * reiser4/README */
77677+
77678+/* Reiser4 Wandering Log */
77679+
77680+/* You should read http://www.namesys.com/txn-doc.html
77681+
77682+ That describes how filesystem operations are performed as atomic
77683+ transactions, and how we try to arrange it so that we can write most of the
77684+ data only once while performing the operation atomically.
77685+
77686+ For the purposes of this code, it is enough for it to understand that it
77687+ has been told a given block should be written either once, or twice (if
77688+ twice then once to the wandered location and once to the real location).
77689+
77690+ This code guarantees that those blocks that are defined to be part of an
77691+ atom either all take effect or none of them take effect.
77692+
77693+ Relocate set nodes are submitted to write by the jnode_flush() routine, and
77694+ the overwrite set is submitted by reiser4_write_log(). This is because with
77695+ the overwrite set we seek to optimize writes, and with the relocate set we
77696+ seek to cause disk order to correlate with the parent first pre-order.
77697+
77698+ reiser4_write_log() allocates and writes wandered blocks and maintains
77699+ additional on-disk structures of the atom as wander records (each wander
77700+ record occupies one block) for storing of the "wandered map" (a table which
77701+ contains a relation between wandered and real block numbers) and other
77702+ information which might be needed at transaction recovery time.
77703+
77704+ The wander records are unidirectionally linked into a circle: each wander
77705+ record contains a block number of the next wander record, the last wander
77706+ record points to the first one.
77707+
77708+ One wander record (named "tx head" in this file) has a format which is
77709+ different from the other wander records. The "tx head" has a reference to the
77710+ "tx head" block of the previously committed atom. Also, "tx head" contains
77711+ fs information (the free blocks counter, and the oid allocator state) which
77712+   is logged in a special way.
77713+
77714+ There are two journal control blocks, named journal header and journal
77715+ footer which have fixed on-disk locations. The journal header has a
77716+ reference to the "tx head" block of the last committed atom. The journal
77717+ footer points to the "tx head" of the last flushed atom. The atom is
77718+ "played" when all blocks from its overwrite set are written to disk the
77719+ second time (i.e. written to their real locations).
77720+
77721+ NOTE: People who know reiserfs internals and its journal structure might be
77722+ confused with these terms journal footer and journal header. There is a table
77723+ with terms of similar semantics in reiserfs (reiser3) and reiser4:
77724+
77725+ REISER3 TERM | REISER4 TERM | DESCRIPTION
77726+ --------------------+-----------------------+----------------------------
77727+ commit record | journal header | atomic write of this record
77728+ | | ends transaction commit
77729+ --------------------+-----------------------+----------------------------
77730+ journal header | journal footer | atomic write of this record
77731+ | | ends post-commit writes.
77732+ | | After successful
77733+ | | writing of this journal
77734+ | | blocks (in reiser3) or
77735+ | | wandered blocks/records are
77736+ | | free for re-use.
77737+ --------------------+-----------------------+----------------------------
77738+
77739+ The atom commit process is the following:
77740+
77741+ 1. The overwrite set is taken from atom's clean list, and its size is
77742+ counted.
77743+
77744+ 2. The number of necessary wander records (including tx head) is calculated,
77745+ and the wander record blocks are allocated.
77746+
77747+ 3. Allocate wandered blocks and populate wander records by wandered map.
77748+
77749+ 4. submit write requests for wander records and wandered blocks.
77750+
77751+ 5. wait until submitted write requests complete.
77752+
77753+ 6. update journal header: change the pointer to the block number of just
77754+ written tx head, submit an i/o for modified journal header block and wait
77755+ for i/o completion.
77756+
77757+ NOTE: The special logging for bitmap blocks and some reiser4 super block
77758+ fields makes processes of atom commit, flush and recovering a bit more
77759+ complex (see comments in the source code for details).
77760+
77761+ The atom playing process is the following:
77762+
77763+ 1. Write atom's overwrite set in-place.
77764+
77765+ 2. Wait on i/o.
77766+
77767+ 3. Update journal footer: change the pointer to block number of tx head
77768+ block of the atom we currently flushing, submit an i/o, wait on i/o
77769+ completion.
77770+
77771+ 4. Free disk space which was used for wandered blocks and wander records.
77772+
77773+ After the freeing of wandered blocks and wander records we have that journal
77774+ footer points to the on-disk structure which might be overwritten soon.
77775+ Neither the log writer nor the journal recovery procedure use that pointer
77776+ for accessing the data. When the journal recovery procedure finds the oldest
77777+ transaction it compares the journal footer pointer value with the "prev_tx"
77778+ pointer value in tx head, if values are equal the oldest not flushed
77779+ transaction is found.
77780+
77781+   NOTE on disk space leakage: the information about what blocks and how many
77782+ blocks are allocated for wandered blocks, wandered records is not written to
77783+ the disk because of special logging for bitmaps and some super blocks
77784+   counters. After a system crash reiser4 does not remember those
77785+ objects allocation, thus we have no such a kind of disk space leakage.
77786+*/
77787+
77788+/* Special logging of reiser4 super block fields. */
77789+
77790+/* There are some reiser4 super block fields (free block count and OID allocator
77791+ state (number of files and next free OID) which are logged separately from
77792+ super block to avoid unnecessary atom fusion.
77793+
77794+ So, the reiser4 super block can be not captured by a transaction with
77795+ allocates/deallocates disk blocks or create/delete file objects. Moreover,
77796+ the reiser4 on-disk super block is not touched when such a transaction is
77797+ committed and flushed. Those "counters logged specially" are logged in "tx
77798+ head" blocks and in the journal footer block.
77799+
77800+ A step-by-step description of special logging:
77801+
77802+ 0. The per-atom information about deleted or created files and allocated or
77803+ freed blocks is collected during the transaction. The atom's
77804+ ->nr_objects_created and ->nr_objects_deleted are for object
77805+ deletion/creation tracking, the numbers of allocated and freed blocks are
77806+ calculated using atom's delete set and atom's capture list -- all new and
77807+ relocated nodes should be on atom's clean list and should have JNODE_RELOC
77808+ bit set.
77809+
77810+ 1. The "logged specially" reiser4 super block fields have their "committed"
77811+ versions in the reiser4 in-memory super block. They get modified only at
77812+ atom commit time. The atom's commit thread has an exclusive access to those
77813+ "committed" fields because the log writer implementation supports only one
77814+ atom commit a time (there is a per-fs "commit" mutex). At
77815+ that time "committed" counters are modified using per-atom information
77816+ collected during the transaction. These counters are stored on disk as a
77817+ part of tx head block when atom is committed.
77818+
77819+ 2. When the atom is flushed the value of the free block counter and the OID
77820+ allocator state get written to the journal footer block. A special journal
77821+ procedure (journal_recover_sb_data()) takes those values from the journal
77822+ footer and updates the reiser4 in-memory super block.
77823+
77824+ NOTE: That means free block count and OID allocator state are logged
77825+ separately from the reiser4 super block regardless of the fact that the
77826+ reiser4 super block has fields to store both the free block counter and the
77827+ OID allocator.
77828+
77829+ Writing the whole super block at commit time requires knowing true values of
77830+ all its fields without changes made by not yet committed transactions. It is
77831+ possible by having their "committed" version of the super block like the
77832+ reiser4 bitmap blocks have "committed" and "working" versions. However,
77833+ another scheme was implemented which stores special logged values in the
77834+ unused free space inside transaction head block. In my opinion it has an
77835+ advantage of not writing whole super block when only part of it was
77836+ modified. */
77837+
77838+#include "debug.h"
77839+#include "dformat.h"
77840+#include "txnmgr.h"
77841+#include "jnode.h"
77842+#include "znode.h"
77843+#include "block_alloc.h"
77844+#include "page_cache.h"
77845+#include "wander.h"
77846+#include "reiser4.h"
77847+#include "super.h"
77848+#include "vfs_ops.h"
77849+#include "writeout.h"
77850+#include "inode.h"
77851+#include "entd.h"
77852+
77853+#include <linux/types.h>
77854+#include <linux/fs.h> /* for struct super_block */
77855+#include <linux/mm.h> /* for struct page */
77856+#include <linux/pagemap.h>
77857+#include <linux/bio.h> /* for struct bio */
77858+#include <linux/blkdev.h>
77859+
77860+static int write_jnodes_to_disk_extent(
77861+ jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77862+
77863+/* The commit_handle is a container for objects needed at atom commit time */
77864+struct commit_handle {
77865+ /* A pointer to atom's list of OVRWR nodes */
77866+ struct list_head *overwrite_set;
77867+ /* atom's overwrite set size */
77868+ int overwrite_set_size;
77869+ /* jnodes for wander record blocks */
77870+ struct list_head tx_list;
77871+ /* number of wander records */
77872+ __u32 tx_size;
77873+ /* 'committed' sb counters are saved here until atom is completely
77874+ flushed */
77875+ __u64 free_blocks;
77876+ __u64 nr_files;
77877+ __u64 next_oid;
77878+ /* A pointer to the atom which is being committed */
77879+ txn_atom *atom;
77880+ /* A pointer to current super block */
77881+ struct super_block *super;
77882+ /* The counter of modified bitmaps */
77883+ reiser4_block_nr nr_bitmap;
77884+};
77885+
77886+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77887+{
77888+ memset(ch, 0, sizeof(struct commit_handle));
77889+ INIT_LIST_HEAD(&ch->tx_list);
77890+
77891+ ch->atom = atom;
77892+ ch->super = reiser4_get_current_sb();
77893+}
77894+
77895+static void done_commit_handle(struct commit_handle *ch)
77896+{
77897+ assert("zam-690", list_empty(&ch->tx_list));
77898+}
77899+
77900+static inline int reiser4_use_write_barrier(struct super_block * s)
77901+{
77902+ return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77903+}
77904+
77905+static void disable_write_barrier(struct super_block * s)
77906+{
77907+ notice("zam-1055", "%s does not support write barriers,"
77908+ " using synchronous write instead.", s->s_id);
77909+ set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77910+}
77911+
77912+/* fill journal header block data */
77913+static void format_journal_header(struct commit_handle *ch)
77914+{
77915+ struct reiser4_super_info_data *sbinfo;
77916+ struct journal_header *header;
77917+ jnode *txhead;
77918+
77919+ sbinfo = get_super_private(ch->super);
77920+ assert("zam-479", sbinfo != NULL);
77921+ assert("zam-480", sbinfo->journal_header != NULL);
77922+
77923+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77924+
77925+ jload(sbinfo->journal_header);
77926+
77927+ header = (struct journal_header *)jdata(sbinfo->journal_header);
77928+ assert("zam-484", header != NULL);
77929+
77930+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77931+ &header->last_committed_tx);
77932+
77933+ jrelse(sbinfo->journal_header);
77934+}
77935+
77936+/* fill journal footer block data */
77937+static void format_journal_footer(struct commit_handle *ch)
77938+{
77939+ struct reiser4_super_info_data *sbinfo;
77940+ struct journal_footer *footer;
77941+ jnode *tx_head;
77942+
77943+ sbinfo = get_super_private(ch->super);
77944+
77945+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77946+
77947+ assert("zam-493", sbinfo != NULL);
77948+ assert("zam-494", sbinfo->journal_header != NULL);
77949+
77950+ check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77951+
77952+ footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77953+ assert("zam-495", footer != NULL);
77954+
77955+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77956+ &footer->last_flushed_tx);
77957+ put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77958+
77959+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77960+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77961+
77962+ jrelse(sbinfo->journal_footer);
77963+}
77964+
77965+/* wander record capacity depends on current block size */
77966+static int wander_record_capacity(const struct super_block *super)
77967+{
77968+ return (super->s_blocksize -
77969+ sizeof(struct wander_record_header)) /
77970+ sizeof(struct wander_entry);
77971+}
77972+
77973+/* Fill first wander record (tx head) in accordance with supplied given data */
77974+static void format_tx_head(struct commit_handle *ch)
77975+{
77976+ jnode *tx_head;
77977+ jnode *next;
77978+ struct tx_header *header;
77979+
77980+ tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77981+ assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77982+
77983+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77984+ if (&ch->tx_list == &next->capture_link)
77985+ next = tx_head;
77986+
77987+ header = (struct tx_header *)jdata(tx_head);
77988+
77989+ assert("zam-460", header != NULL);
77990+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77991+
77992+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77993+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77994+
77995+ put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77996+ put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77997+ &header->prev_tx);
77998+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77999+ put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
78000+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
78001+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
78002+}
78003+
78004+/* prepare ordinary wander record block (fill all service fields) */
78005+static void
78006+format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
78007+{
78008+ struct wander_record_header *LRH;
78009+ jnode *next;
78010+
78011+ assert("zam-464", node != NULL);
78012+
78013+ LRH = (struct wander_record_header *)jdata(node);
78014+ next = list_entry(node->capture_link.next, jnode, capture_link);
78015+
78016+ if (&ch->tx_list == &next->capture_link)
78017+ next = list_entry(ch->tx_list.next, jnode, capture_link);
78018+
78019+ assert("zam-465", LRH != NULL);
78020+ assert("zam-463",
78021+ ch->super->s_blocksize > sizeof(struct wander_record_header));
78022+
78023+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
78024+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
78025+
78026+ put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
78027+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
78028+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
78029+}
78030+
78031+/* add one wandered map entry to formatted wander record */
78032+static void
78033+store_entry(jnode * node, int index, const reiser4_block_nr * a,
78034+ const reiser4_block_nr * b)
78035+{
78036+ char *data;
78037+ struct wander_entry *pairs;
78038+
78039+ data = jdata(node);
78040+ assert("zam-451", data != NULL);
78041+
78042+ pairs =
78043+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
78044+
78045+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
78046+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
78047+}
78048+
78049+/* currently, wander records contain only the wandered map, which depends on
78050+   overwrite set size */
78051+static void get_tx_size(struct commit_handle *ch)
78052+{
78053+ assert("zam-440", ch->overwrite_set_size != 0);
78054+ assert("zam-695", ch->tx_size == 0);
78055+
78056+ /* count all ordinary wander records
78057+ (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
78058+ for tx head block */
78059+ ch->tx_size =
78060+ (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
78061+ 2;
78062+}
78063+
78064+/* A special structure for using in store_wmap_actor() for saving its state
78065+ between calls */
78066+struct store_wmap_params {
78067+ jnode *cur; /* jnode of current wander record to fill */
78068+ int idx; /* free element index in wander record */
78069+ int capacity; /* capacity */
78070+
78071+#if REISER4_DEBUG
78072+ struct list_head *tx_list;
78073+#endif
78074+};
78075+
78076+/* an actor for use in blocknr_set_iterator routine which populates the list
78077+ of pre-formatted wander records by wandered map info */
78078+static int
78079+store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
78080+ const reiser4_block_nr * b, void *data)
78081+{
78082+ struct store_wmap_params *params = data;
78083+
78084+ if (params->idx >= params->capacity) {
78085+ /* a new wander record should be taken from the tx_list */
78086+ params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
78087+ assert("zam-454",
78088+ params->tx_list != &params->cur->capture_link);
78089+
78090+ params->idx = 0;
78091+ }
78092+
78093+ store_entry(params->cur, params->idx, a, b);
78094+ params->idx++;
78095+
78096+ return 0;
78097+}
78098+
78099+/* This function is called after Relocate set gets written to disk, Overwrite
78100+ set is written to wandered locations and all wander records are written
78101+   also. The updated journal header block contains a pointer (block number) to
78102+ first wander record of the just written transaction */
78103+static int update_journal_header(struct commit_handle *ch, int use_barrier)
78104+{
78105+ struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
78106+ jnode *jh = sbinfo->journal_header;
78107+ jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
78108+ int ret;
78109+
78110+ format_journal_header(ch);
78111+
78112+ ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
78113+ use_barrier ? WRITEOUT_BARRIER : 0);
78114+ if (ret)
78115+ return ret;
78116+
78117+ // blk_run_address_space(sbinfo->fake->i_mapping);
78118+ /*blk_run_queues(); */
78119+
78120+ ret = jwait_io(jh, WRITE);
78121+
78122+ if (ret)
78123+ return ret;
78124+
78125+ sbinfo->last_committed_tx = *jnode_get_block(head);
78126+
78127+ return 0;
78128+}
78129+
78130+/* This function is called after write-back is finished. We update journal
78131+ footer block and free blocks which were occupied by wandered blocks and
78132+ transaction wander records */
78133+static int update_journal_footer(struct commit_handle *ch, int use_barrier)
78134+{
78135+ reiser4_super_info_data *sbinfo = get_super_private(ch->super);
78136+
78137+ jnode *jf = sbinfo->journal_footer;
78138+
78139+ int ret;
78140+
78141+ format_journal_footer(ch);
78142+
78143+ ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
78144+ use_barrier ? WRITEOUT_BARRIER : 0);
78145+ if (ret)
78146+ return ret;
78147+
78148+ // blk_run_address_space(sbinfo->fake->i_mapping);
78149+ /*blk_run_queue(); */
78150+
78151+ ret = jwait_io(jf, WRITE);
78152+ if (ret)
78153+ return ret;
78154+
78155+ return 0;
78156+}
78157+
78158+/* free block numbers of wander records of already written in place transaction */
78159+static void dealloc_tx_list(struct commit_handle *ch)
78160+{
78161+ while (!list_empty(&ch->tx_list)) {
78162+ jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
78163+ list_del(&cur->capture_link);
78164+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
78165+ reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
78166+ BA_FORMATTED);
78167+
78168+ unpin_jnode_data(cur);
78169+ reiser4_drop_io_head(cur);
78170+ }
78171+}
78172+
78173+/* An actor for use in block_nr_iterator() routine which frees wandered blocks
78174+ from atom's overwrite set. */
78175+static int
78176+dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
78177+ const reiser4_block_nr * a UNUSED_ARG,
78178+ const reiser4_block_nr * b, void *data UNUSED_ARG)
78179+{
78180+
78181+ assert("zam-499", b != NULL);
78182+ assert("zam-500", *b != 0);
78183+ assert("zam-501", !reiser4_blocknr_is_fake(b));
78184+
78185+ reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
78186+ return 0;
78187+}
78188+
78189+/* free wandered block locations of already written in place transaction */
78190+static void dealloc_wmap(struct commit_handle *ch)
78191+{
78192+ assert("zam-696", ch->atom != NULL);
78193+
78194+ blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
78195+ dealloc_wmap_actor, NULL, 1);
78196+}
78197+
78198+/* helper function for alloc wandered blocks, which refill set of block
78199+ numbers needed for wandered blocks */
78200+static int
78201+get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
78202+{
78203+ reiser4_blocknr_hint hint;
78204+ int ret;
78205+
78206+ reiser4_block_nr wide_len = count;
78207+
78208+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
78209+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
78210+ reserved allocation area so as to get the best qualities of fixed
78211+ journals? */
78212+ reiser4_blocknr_hint_init(&hint);
78213+ hint.block_stage = BLOCK_GRABBED;
78214+
78215+ ret = reiser4_alloc_blocks(&hint, start, &wide_len,
78216+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
78217+ *len = (int)wide_len;
78218+
78219+ return ret;
78220+}
78221+
78222+/*
78223+ * roll back changes made before issuing BIO in the case of IO error.
78224+ */
78225+static void undo_bio(struct bio *bio)
78226+{
78227+ int i;
78228+
78229+ for (i = 0; i < bio->bi_vcnt; ++i) {
78230+ struct page *pg;
78231+ jnode *node;
78232+
78233+ pg = bio->bi_io_vec[i].bv_page;
78234+ end_page_writeback(pg);
78235+ node = jprivate(pg);
78236+ spin_lock_jnode(node);
78237+ JF_CLR(node, JNODE_WRITEBACK);
78238+ JF_SET(node, JNODE_DIRTY);
78239+ spin_unlock_jnode(node);
78240+ }
78241+ bio_put(bio);
78242+}
78243+
78244+/* put overwrite set back to atom's clean list */
78245+static void put_overwrite_set(struct commit_handle *ch)
78246+{
78247+ jnode *cur;
78248+
78249+ list_for_each_entry(cur, ch->overwrite_set, capture_link)
78250+ jrelse_tail(cur);
78251+}
78252+
78253+/* Count overwrite set size, grab disk space for wandered blocks allocation.
78254+ Since we have a separate list for atom's overwrite set we just scan the list,
78255+ count bitmap and other not leaf nodes which wandered blocks allocation we
78256+ have to grab space for. */
78257+static int get_overwrite_set(struct commit_handle *ch)
78258+{
78259+ int ret;
78260+ jnode *cur;
78261+ __u64 nr_not_leaves = 0;
78262+#if REISER4_DEBUG
78263+ __u64 nr_formatted_leaves = 0;
78264+ __u64 nr_unformatted_leaves = 0;
78265+#endif
78266+
78267+ assert("zam-697", ch->overwrite_set_size == 0);
78268+
78269+ ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
78270+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78271+
78272+ while (ch->overwrite_set != &cur->capture_link) {
78273+ jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
78274+
78275+ /* Count bitmap locks for getting correct statistics what number
78276+ * of blocks were cleared by the transaction commit. */
78277+ if (jnode_get_type(cur) == JNODE_BITMAP)
78278+ ch->nr_bitmap++;
78279+
78280+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
78281+ || jnode_get_type(cur) == JNODE_BITMAP);
78282+
78283+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
78284+ /* we replace fake znode by another (real)
78285+ znode which is suggested by disk_layout
78286+ plugin */
78287+
78288+ /* FIXME: it looks like fake znode should be
78289+ replaced by jnode supplied by
78290+ disk_layout. */
78291+
78292+ struct super_block *s = reiser4_get_current_sb();
78293+ reiser4_super_info_data *sbinfo =
78294+ get_current_super_private();
78295+
78296+ if (sbinfo->df_plug->log_super) {
78297+ jnode *sj = sbinfo->df_plug->log_super(s);
78298+
78299+ assert("zam-593", sj != NULL);
78300+
78301+ if (IS_ERR(sj))
78302+ return PTR_ERR(sj);
78303+
78304+ spin_lock_jnode(sj);
78305+ JF_SET(sj, JNODE_OVRWR);
78306+ insert_into_atom_ovrwr_list(ch->atom, sj);
78307+ spin_unlock_jnode(sj);
78308+
78309+ /* jload it as the rest of overwrite set */
78310+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
78311+
78312+ ch->overwrite_set_size++;
78313+ }
78314+ spin_lock_jnode(cur);
78315+ reiser4_uncapture_block(cur);
78316+ jput(cur);
78317+
78318+ } else {
78319+ int ret;
78320+ ch->overwrite_set_size++;
78321+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
78322+ if (ret)
78323+ reiser4_panic("zam-783",
78324+ "cannot load e-flushed jnode back (ret = %d)\n",
78325+ ret);
78326+ }
78327+
78328+ /* Count not leaves here because we have to grab disk space
78329+ * for wandered blocks. They were not counted as "flush
78330+ * reserved". Counting should be done _after_ nodes are pinned
78331+ * into memory by jload(). */
78332+ if (!jnode_is_leaf(cur))
78333+ nr_not_leaves++;
78334+ else {
78335+#if REISER4_DEBUG
78336+ /* at this point @cur either has JNODE_FLUSH_RESERVED
78337+ * or is eflushed. Locking is not strong enough to
78338+ * write an assertion checking for this. */
78339+ if (jnode_is_znode(cur))
78340+ nr_formatted_leaves++;
78341+ else
78342+ nr_unformatted_leaves++;
78343+#endif
78344+ JF_CLR(cur, JNODE_FLUSH_RESERVED);
78345+ }
78346+
78347+ cur = next;
78348+ }
78349+
78350+ /* Grab space for writing (wandered blocks) of not leaves found in
78351+ * overwrite set. */
78352+ ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
78353+ if (ret)
78354+ return ret;
78355+
78356+ /* Disk space for allocation of wandered blocks of leaf nodes already
78357+ * reserved as "flush reserved", move it to grabbed space counter. */
78358+ spin_lock_atom(ch->atom);
78359+ assert("zam-940",
78360+ nr_formatted_leaves + nr_unformatted_leaves <=
78361+ ch->atom->flush_reserved);
78362+ flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
78363+ spin_unlock_atom(ch->atom);
78364+
78365+ return ch->overwrite_set_size;
78366+}
78367+
78368+/**
78369+ * write_jnodes_to_disk_extent - submit write request
78370+ * @head:
78371+ * @first: first jnode of the list
78372+ * @nr: number of jnodes on the list
78373+ * @block_p:
78374+ * @fq:
78375+ * @flags: used to decide whether page is to get PG_reclaim flag
78376+ *
78377+ * Submits a write request for @nr jnodes beginning from the @first, other
78378+ * jnodes are after the @first on the double-linked "capture" list. All jnodes
78379+ * will be written to the disk region of @nr blocks starting with @block_p block
78380+ * number. If @fq is not NULL it means that waiting for i/o completion will be
78381+ * done more efficiently by using flush_queue_t objects.
78382+ * This function is the one which writes list of jnodes in batch mode. It does
78383+ * all low-level things as bio construction and page states manipulation.
78384+ *
78385+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
78386+ * aggregated in this function instead of being left to the layers below
78387+ *
78388+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
78389+ * Why that layer needed? Why BIOs cannot be constructed here?
78390+ */
78391+static int write_jnodes_to_disk_extent(
78392+ jnode *first, int nr, const reiser4_block_nr *block_p,
78393+ flush_queue_t *fq, int flags)
78394+{
78395+ struct super_block *super = reiser4_get_current_sb();
78396+ int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
78397+ int max_blocks;
78398+ jnode *cur = first;
78399+ reiser4_block_nr block;
78400+
78401+ assert("zam-571", first != NULL);
78402+ assert("zam-572", block_p != NULL);
78403+ assert("zam-570", nr > 0);
78404+
78405+ block = *block_p;
78406+ max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
78407+
78408+ while (nr > 0) {
78409+ struct bio *bio;
78410+ int nr_blocks = min(nr, max_blocks);
78411+ int i;
78412+ int nr_used;
78413+
78414+ bio = bio_alloc(GFP_NOIO, nr_blocks);
78415+ if (!bio)
78416+ return RETERR(-ENOMEM);
78417+
78418+ bio->bi_bdev = super->s_bdev;
78419+ bio->bi_sector = block * (super->s_blocksize >> 9);
78420+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
78421+ struct page *pg;
78422+
78423+ pg = jnode_page(cur);
78424+ assert("zam-573", pg != NULL);
78425+
78426+ page_cache_get(pg);
78427+
78428+ lock_and_wait_page_writeback(pg);
78429+
78430+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
78431+ /*
78432+ * underlying device is satiated. Stop adding
78433+ * pages to the bio.
78434+ */
78435+ unlock_page(pg);
78436+ page_cache_release(pg);
78437+ break;
78438+ }
78439+
78440+ spin_lock_jnode(cur);
78441+ assert("nikita-3166",
78442+ pg->mapping == jnode_get_mapping(cur));
78443+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
78444+#if REISER4_DEBUG
78445+ spin_lock(&cur->load);
78446+ assert("nikita-3165", !jnode_is_releasable(cur));
78447+ spin_unlock(&cur->load);
78448+#endif
78449+ JF_SET(cur, JNODE_WRITEBACK);
78450+ JF_CLR(cur, JNODE_DIRTY);
78451+ ON_DEBUG(cur->written++);
78452+ spin_unlock_jnode(cur);
78453+
78454+ ClearPageError(pg);
78455+ set_page_writeback(pg);
78456+
78457+ if (get_current_context()->entd) {
78458+ /* this is ent thread */
78459+ entd_context *ent = get_entd_context(super);
78460+ struct wbq *rq, *next;
78461+
78462+ spin_lock(&ent->guard);
78463+
78464+ if (pg == ent->cur_request->page) {
78465+ /*
78466+ * entd is called for this page. This
78467+ * request is not in th etodo list
78468+ */
78469+ ent->cur_request->written = 1;
78470+ } else {
78471+ /*
78472+ * if we have written a page for which writepage
78473+ * is called for - move request to another list.
78474+ */
78475+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
78476+ assert("", rq->magic == WBQ_MAGIC);
78477+ if (pg == rq->page) {
78478+ /*
78479+ * remove request from
78480+ * entd's queue, but do
78481+ * not wake up a thread
78482+ * which put this
78483+ * request
78484+ */
78485+ list_del_init(&rq->link);
78486+ ent->nr_todo_reqs --;
78487+ list_add_tail(&rq->link, &ent->done_list);
78488+ ent->nr_done_reqs ++;
78489+ rq->written = 1;
78490+ break;
78491+ }
78492+ }
78493+ }
78494+ spin_unlock(&ent->guard);
78495+ }
78496+
78497+ clear_page_dirty_for_io(pg);
78498+
78499+ unlock_page(pg);
78500+
78501+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78502+ nr_used++;
78503+ }
78504+ if (nr_used > 0) {
78505+ assert("nikita-3453",
78506+ bio->bi_size == super->s_blocksize * nr_used);
78507+ assert("nikita-3454", bio->bi_vcnt == nr_used);
78508+
78509+ /* Check if we are allowed to write at all */
78510+ if (super->s_flags & MS_RDONLY)
78511+ undo_bio(bio);
78512+ else {
78513+ int not_supported;
78514+
78515+ add_fq_to_bio(fq, bio);
78516+ bio_get(bio);
78517+ reiser4_submit_bio(write_op, bio);
78518+ not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
78519+ bio_put(bio);
78520+ if (not_supported)
78521+ return -EOPNOTSUPP;
78522+ }
78523+
78524+ block += nr_used - 1;
78525+ update_blocknr_hint_default(super, &block);
78526+ block += 1;
78527+ } else {
78528+ bio_put(bio);
78529+ }
78530+ nr -= nr_used;
78531+ }
78532+
78533+ return 0;
78534+}
78535+
78536+/* This is a procedure which recovers a contiguous sequences of disk block
78537+ numbers in the given list of j-nodes and submits write requests on this
78538+ per-sequence basis */
78539+int
78540+write_jnode_list(struct list_head *head, flush_queue_t *fq,
78541+ long *nr_submitted, int flags)
78542+{
78543+ int ret;
78544+ jnode *beg = list_entry(head->next, jnode, capture_link);
78545+
78546+ while (head != &beg->capture_link) {
78547+ int nr = 1;
78548+ jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
78549+
78550+ while (head != &cur->capture_link) {
78551+ if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
78552+ break;
78553+ ++nr;
78554+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78555+ }
78556+
78557+ ret = write_jnodes_to_disk_extent(
78558+ beg, nr, jnode_get_block(beg), fq, flags);
78559+ if (ret)
78560+ return ret;
78561+
78562+ if (nr_submitted)
78563+ *nr_submitted += nr;
78564+
78565+ beg = cur;
78566+ }
78567+
78568+ return 0;
78569+}
78570+
78571+/* add given wandered mapping to atom's wandered map */
78572+static int
78573+add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
78574+{
78575+ int ret;
78576+ blocknr_set_entry *new_bsep = NULL;
78577+ reiser4_block_nr block;
78578+
78579+ txn_atom *atom;
78580+
78581+ assert("zam-568", block_p != NULL);
78582+ block = *block_p;
78583+ assert("zam-569", len > 0);
78584+
78585+ while ((len--) > 0) {
78586+ do {
78587+ atom = get_current_atom_locked();
78588+ assert("zam-536",
78589+ !reiser4_blocknr_is_fake(jnode_get_block(cur)));
78590+ ret =
78591+ blocknr_set_add_pair(atom, &atom->wandered_map,
78592+ &new_bsep,
78593+ jnode_get_block(cur), &block);
78594+ } while (ret == -E_REPEAT);
78595+
78596+ if (ret) {
78597+ /* deallocate blocks which were not added to wandered
78598+ map */
78599+ reiser4_block_nr wide_len = len;
78600+
78601+ reiser4_dealloc_blocks(&block, &wide_len,
78602+ BLOCK_NOT_COUNTED,
78603+ BA_FORMATTED
78604+ /* formatted, without defer */ );
78605+
78606+ return ret;
78607+ }
78608+
78609+ spin_unlock_atom(atom);
78610+
78611+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78612+ ++block;
78613+ }
78614+
78615+ return 0;
78616+}
78617+
78618+/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
78619+ submit IO for allocated blocks. We assume that current atom is in a stage
78620+ when any atom fusion is impossible and atom is unlocked and it is safe. */
78621+static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
78622+{
78623+ reiser4_block_nr block;
78624+
78625+ int rest;
78626+ int len;
78627+ int ret;
78628+
78629+ jnode *cur;
78630+
78631+ assert("zam-534", ch->overwrite_set_size > 0);
78632+
78633+ rest = ch->overwrite_set_size;
78634+
78635+ cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78636+ while (ch->overwrite_set != &cur->capture_link) {
78637+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
78638+
78639+ ret = get_more_wandered_blocks(rest, &block, &len);
78640+ if (ret)
78641+ return ret;
78642+
78643+ rest -= len;
78644+
78645+ ret = add_region_to_wmap(cur, len, &block);
78646+ if (ret)
78647+ return ret;
78648+
78649+ ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
78650+ if (ret)
78651+ return ret;
78652+
78653+ while ((len--) > 0) {
78654+ assert("zam-604",
78655+ ch->overwrite_set != &cur->capture_link);
78656+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78657+ }
78658+ }
78659+
78660+ return 0;
78661+}
78662+
78663+/* allocate given number of nodes over the journal area and link them into a
78664+ list, return pointer to the first jnode in the list */
78665+static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
78666+{
78667+ reiser4_blocknr_hint hint;
78668+ reiser4_block_nr allocated = 0;
78669+ reiser4_block_nr first, len;
78670+ jnode *cur;
78671+ jnode *txhead;
78672+ int ret;
78673+ reiser4_context *ctx;
78674+ reiser4_super_info_data *sbinfo;
78675+
78676+ assert("zam-698", ch->tx_size > 0);
78677+ assert("zam-699", list_empty_careful(&ch->tx_list));
78678+
78679+ ctx = get_current_context();
78680+ sbinfo = get_super_private(ctx->super);
78681+
78682+ while (allocated < (unsigned)ch->tx_size) {
78683+ len = (ch->tx_size - allocated);
78684+
78685+ reiser4_blocknr_hint_init(&hint);
78686+
78687+ hint.block_stage = BLOCK_GRABBED;
78688+
78689+ /* FIXME: there should be some block allocation policy for
78690+ nodes which contain wander records */
78691+
78692+ /* We assume that disk space for wandered record blocks can be
78693+ * taken from reserved area. */
78694+ ret = reiser4_alloc_blocks(&hint, &first, &len,
78695+ BA_FORMATTED | BA_RESERVED |
78696+ BA_USE_DEFAULT_SEARCH_START);
78697+ reiser4_blocknr_hint_done(&hint);
78698+
78699+ if (ret)
78700+ return ret;
78701+
78702+ allocated += len;
78703+
78704+ /* create jnodes for all wander records */
78705+ while (len--) {
78706+ cur = reiser4_alloc_io_head(&first);
78707+
78708+ if (cur == NULL) {
78709+ ret = RETERR(-ENOMEM);
78710+ goto free_not_assigned;
78711+ }
78712+
78713+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
78714+
78715+ if (ret != 0) {
78716+ jfree(cur);
78717+ goto free_not_assigned;
78718+ }
78719+
78720+ pin_jnode_data(cur);
78721+
78722+ list_add_tail(&cur->capture_link, &ch->tx_list);
78723+
78724+ first++;
78725+ }
78726+ }
78727+
78728+ { /* format a on-disk linked list of wander records */
78729+ int serial = 1;
78730+
78731+ txhead = list_entry(ch->tx_list.next, jnode, capture_link);
78732+ format_tx_head(ch);
78733+
78734+ cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78735+ while (&ch->tx_list != &cur->capture_link) {
78736+ format_wander_record(ch, cur, serial++);
78737+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78738+ }
78739+ }
78740+
78741+ { /* Fill wander records with Wandered Set */
78742+ struct store_wmap_params params;
78743+ txn_atom *atom;
78744+
78745+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78746+
78747+ params.idx = 0;
78748+ params.capacity =
78749+ wander_record_capacity(reiser4_get_current_sb());
78750+
78751+ atom = get_current_atom_locked();
78752+ blocknr_set_iterator(atom, &atom->wandered_map,
78753+ &store_wmap_actor, &params, 0);
78754+ spin_unlock_atom(atom);
78755+ }
78756+
78757+ { /* relse all jnodes from tx_list */
78758+ cur = list_entry(ch->tx_list.next, jnode, capture_link);
78759+ while (&ch->tx_list != &cur->capture_link) {
78760+ jrelse(cur);
78761+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
78762+ }
78763+ }
78764+
78765+ ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
78766+
78767+ return ret;
78768+
78769+ free_not_assigned:
78770+ /* We deallocate blocks not yet assigned to jnodes on tx_list. The
78771+ caller takes care about invalidating of tx list */
78772+ reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
78773+
78774+ return ret;
78775+}
78776+
78777+static int commit_tx(struct commit_handle *ch)
78778+{
78779+ flush_queue_t *fq;
78780+ int barrier;
78781+ int ret;
78782+
78783+ /* Grab more space for wandered records. */
78784+ ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
78785+ if (ret)
78786+ return ret;
78787+
78788+ fq = get_fq_for_current_atom();
78789+ if (IS_ERR(fq))
78790+ return PTR_ERR(fq);
78791+
78792+ spin_unlock_atom(fq->atom);
78793+ do {
78794+ ret = alloc_wandered_blocks(ch, fq);
78795+ if (ret)
78796+ break;
78797+ ret = alloc_tx(ch, fq);
78798+ if (ret)
78799+ break;
78800+ } while (0);
78801+
78802+ reiser4_fq_put(fq);
78803+ if (ret)
78804+ return ret;
78805+ repeat_wo_barrier:
78806+ barrier = reiser4_use_write_barrier(ch->super);
78807+ if (!barrier) {
78808+ ret = current_atom_finish_all_fq();
78809+ if (ret)
78810+ return ret;
78811+ }
78812+ ret = update_journal_header(ch, barrier);
78813+ if (barrier) {
78814+ if (ret) {
78815+ if (ret == -EOPNOTSUPP) {
78816+ disable_write_barrier(ch->super);
78817+ goto repeat_wo_barrier;
78818+ }
78819+ return ret;
78820+ }
78821+ ret = current_atom_finish_all_fq();
78822+ }
78823+ return ret;
78824+}
78825+
78826+static int write_tx_back(struct commit_handle * ch)
78827+{
78828+ flush_queue_t *fq;
78829+ int ret;
78830+ int barrier;
78831+
78832+ reiser4_post_commit_hook();
78833+ fq = get_fq_for_current_atom();
78834+ if (IS_ERR(fq))
78835+ return PTR_ERR(fq);
78836+ spin_unlock_atom(fq->atom);
78837+ ret = write_jnode_list(
78838+ ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78839+ reiser4_fq_put(fq);
78840+ if (ret)
78841+ return ret;
78842+ repeat_wo_barrier:
78843+ barrier = reiser4_use_write_barrier(ch->super);
78844+ if (!barrier) {
78845+ ret = current_atom_finish_all_fq();
78846+ if (ret)
78847+ return ret;
78848+ }
78849+ ret = update_journal_footer(ch, barrier);
78850+ if (barrier) {
78851+ if (ret) {
78852+ if (ret == -EOPNOTSUPP) {
78853+ disable_write_barrier(ch->super);
78854+ goto repeat_wo_barrier;
78855+ }
78856+ return ret;
78857+ }
78858+ ret = current_atom_finish_all_fq();
78859+ }
78860+ if (ret)
78861+ return ret;
78862+ reiser4_post_write_back_hook();
78863+ return 0;
78864+}
78865+
78866+/* We assume that at this moment all captured blocks are marked as RELOC or
78867+ WANDER (belong to Relocate o Overwrite set), all nodes from Relocate set
78868+ are submitted to write.
78869+*/
78870+
78871+int reiser4_write_logs(long *nr_submitted)
78872+{
78873+ txn_atom *atom;
78874+ struct super_block *super = reiser4_get_current_sb();
78875+ reiser4_super_info_data *sbinfo = get_super_private(super);
78876+ struct commit_handle ch;
78877+ int ret;
78878+
78879+ writeout_mode_enable();
78880+
78881+ /* block allocator may add j-nodes to the clean_list */
78882+ ret = reiser4_pre_commit_hook();
78883+ if (ret)
78884+ return ret;
78885+
78886+ /* No locks are required if we take atom which stage >=
78887+ * ASTAGE_PRE_COMMIT */
78888+ atom = get_current_context()->trans->atom;
78889+ assert("zam-965", atom != NULL);
78890+
78891+ /* relocate set is on the atom->clean_nodes list after
78892+ * current_atom_complete_writes() finishes. It can be safely
78893+ * uncaptured after commit_mutex is locked, because any atom that
78894+ * captures these nodes is guaranteed to commit after current one.
78895+ *
78896+ * This can only be done after reiser4_pre_commit_hook(), because it is where
78897+ * early flushed jnodes with CREATED bit are transferred to the
78898+ * overwrite list. */
78899+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
78900+ spin_lock_atom(atom);
78901+ /* There might be waiters for the relocate nodes which we have
78902+ * released, wake them up. */
78903+ reiser4_atom_send_event(atom);
78904+ spin_unlock_atom(atom);
78905+
78906+ if (REISER4_DEBUG) {
78907+ int level;
78908+
78909+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78910+ assert("nikita-3352",
78911+ list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78912+ }
78913+
78914+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78915+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78916+
78917+ init_commit_handle(&ch, atom);
78918+
78919+ ch.free_blocks = sbinfo->blocks_free_committed;
78920+ ch.nr_files = sbinfo->nr_files_committed;
78921+ /* ZAM-FIXME-HANS: email me what the contention level is for the super
78922+ * lock. */
78923+ ch.next_oid = oid_next(super);
78924+
78925+ /* count overwrite set and place it in a separate list */
78926+ ret = get_overwrite_set(&ch);
78927+
78928+ if (ret <= 0) {
78929+ /* It is possible that overwrite set is empty here, it means
78930+ all captured nodes are clean */
78931+ goto up_and_ret;
78932+ }
78933+
78934+ /* Inform the caller about what number of dirty pages will be
78935+ * submitted to disk. */
78936+ *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78937+
78938+ /* count all records needed for storing of the wandered set */
78939+ get_tx_size(&ch);
78940+
78941+ ret = commit_tx(&ch);
78942+ if (ret)
78943+ goto up_and_ret;
78944+
78945+ spin_lock_atom(atom);
78946+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
78947+ spin_unlock_atom(atom);
78948+
78949+ ret = write_tx_back(&ch);
78950+ reiser4_post_write_back_hook();
78951+
78952+ up_and_ret:
78953+ if (ret) {
78954+ /* there could be fq attached to current atom; the only way to
78955+ remove them is: */
78956+ current_atom_finish_all_fq();
78957+ }
78958+
78959+ /* free blocks of flushed transaction */
78960+ dealloc_tx_list(&ch);
78961+ dealloc_wmap(&ch);
78962+
78963+ put_overwrite_set(&ch);
78964+
78965+ done_commit_handle(&ch);
78966+
78967+ writeout_mode_disable();
78968+
78969+ return ret;
78970+}
78971+
78972+/* consistency checks for journal data/control blocks: header, footer, log
78973+ records, transactions head blocks. All functions return zero on success. */
78974+
78975+static int check_journal_header(const jnode * node UNUSED_ARG)
78976+{
78977+ /* FIXME: journal header has no magic field yet. */
78978+ return 0;
78979+}
78980+
78981+/* wait for write completion for all jnodes from given list */
78982+static int wait_on_jnode_list(struct list_head *head)
78983+{
78984+ jnode *scan;
78985+ int ret = 0;
78986+
78987+ list_for_each_entry(scan, head, capture_link) {
78988+ struct page *pg = jnode_page(scan);
78989+
78990+ if (pg) {
78991+ if (PageWriteback(pg))
78992+ wait_on_page_writeback(pg);
78993+
78994+ if (PageError(pg))
78995+ ret++;
78996+ }
78997+ }
78998+
78999+ return ret;
79000+}
79001+
79002+static int check_journal_footer(const jnode * node UNUSED_ARG)
79003+{
79004+ /* FIXME: journal footer has no magic field yet. */
79005+ return 0;
79006+}
79007+
79008+static int check_tx_head(const jnode * node)
79009+{
79010+ struct tx_header *header = (struct tx_header *)jdata(node);
79011+
79012+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
79013+ warning("zam-627", "tx head at block %s corrupted\n",
79014+ sprint_address(jnode_get_block(node)));
79015+ return RETERR(-EIO);
79016+ }
79017+
79018+ return 0;
79019+}
79020+
79021+static int check_wander_record(const jnode * node)
79022+{
79023+ struct wander_record_header *RH =
79024+ (struct wander_record_header *)jdata(node);
79025+
79026+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
79027+ 0) {
79028+ warning("zam-628", "wander record at block %s corrupted\n",
79029+ sprint_address(jnode_get_block(node)));
79030+ return RETERR(-EIO);
79031+ }
79032+
79033+ return 0;
79034+}
79035+
79036+/* fill commit_handler structure by everything what is needed for update_journal_footer */
79037+static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
79038+{
79039+ struct tx_header *TXH;
79040+ int ret;
79041+
79042+ ret = jload(tx_head);
79043+ if (ret)
79044+ return ret;
79045+
79046+ TXH = (struct tx_header *)jdata(tx_head);
79047+
79048+ ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
79049+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
79050+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
79051+
79052+ jrelse(tx_head);
79053+
79054+ list_add(&tx_head->capture_link, &ch->tx_list);
79055+
79056+ return 0;
79057+}
79058+
79059+/* replay one transaction: restore and write overwrite set in place */
79060+static int replay_transaction(const struct super_block *s,
79061+ jnode * tx_head,
79062+ const reiser4_block_nr * log_rec_block_p,
79063+ const reiser4_block_nr * end_block,
79064+ unsigned int nr_wander_records)
79065+{
79066+ reiser4_block_nr log_rec_block = *log_rec_block_p;
79067+ struct commit_handle ch;
79068+ LIST_HEAD(overwrite_set);
79069+ jnode *log;
79070+ int ret;
79071+
79072+ init_commit_handle(&ch, NULL);
79073+ ch.overwrite_set = &overwrite_set;
79074+
79075+ restore_commit_handle(&ch, tx_head);
79076+
79077+ while (log_rec_block != *end_block) {
79078+ struct wander_record_header *header;
79079+ struct wander_entry *entry;
79080+
79081+ int i;
79082+
79083+ if (nr_wander_records == 0) {
79084+ warning("zam-631",
79085+ "number of wander records in the linked list"
79086+ " greater than number stored in tx head.\n");
79087+ ret = RETERR(-EIO);
79088+ goto free_ow_set;
79089+ }
79090+
79091+ log = reiser4_alloc_io_head(&log_rec_block);
79092+ if (log == NULL)
79093+ return RETERR(-ENOMEM);
79094+
79095+ ret = jload(log);
79096+ if (ret < 0) {
79097+ reiser4_drop_io_head(log);
79098+ return ret;
79099+ }
79100+
79101+ ret = check_wander_record(log);
79102+ if (ret) {
79103+ jrelse(log);
79104+ reiser4_drop_io_head(log);
79105+ return ret;
79106+ }
79107+
79108+ header = (struct wander_record_header *)jdata(log);
79109+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
79110+
79111+ entry = (struct wander_entry *)(header + 1);
79112+
79113+ /* restore overwrite set from wander record content */
79114+ for (i = 0; i < wander_record_capacity(s); i++) {
79115+ reiser4_block_nr block;
79116+ jnode *node;
79117+
79118+ block = le64_to_cpu(get_unaligned(&entry->wandered));
79119+ if (block == 0)
79120+ break;
79121+
79122+ node = reiser4_alloc_io_head(&block);
79123+ if (node == NULL) {
79124+ ret = RETERR(-ENOMEM);
79125+ /*
79126+ * FIXME-VS:???
79127+ */
79128+ jrelse(log);
79129+ reiser4_drop_io_head(log);
79130+ goto free_ow_set;
79131+ }
79132+
79133+ ret = jload(node);
79134+
79135+ if (ret < 0) {
79136+ reiser4_drop_io_head(node);
79137+ /*
79138+ * FIXME-VS:???
79139+ */
79140+ jrelse(log);
79141+ reiser4_drop_io_head(log);
79142+ goto free_ow_set;
79143+ }
79144+
79145+ block = le64_to_cpu(get_unaligned(&entry->original));
79146+
79147+ assert("zam-603", block != 0);
79148+
79149+ jnode_set_block(node, &block);
79150+
79151+ list_add_tail(&node->capture_link, ch.overwrite_set);
79152+
79153+ ++entry;
79154+ }
79155+
79156+ jrelse(log);
79157+ reiser4_drop_io_head(log);
79158+
79159+ --nr_wander_records;
79160+ }
79161+
79162+ if (nr_wander_records != 0) {
79163+ warning("zam-632", "number of wander records in the linked list"
79164+ " less than number stored in tx head.\n");
79165+ ret = RETERR(-EIO);
79166+ goto free_ow_set;
79167+ }
79168+
79169+ { /* write wandered set in place */
79170+ write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
79171+ ret = wait_on_jnode_list(ch.overwrite_set);
79172+
79173+ if (ret) {
79174+ ret = RETERR(-EIO);
79175+ goto free_ow_set;
79176+ }
79177+ }
79178+
79179+ ret = update_journal_footer(&ch, 0);
79180+
79181+ free_ow_set:
79182+
79183+ while (!list_empty(ch.overwrite_set)) {
79184+ jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
79185+ list_del_init(&cur->capture_link);
79186+ jrelse(cur);
79187+ reiser4_drop_io_head(cur);
79188+ }
79189+
79190+ list_del_init(&tx_head->capture_link);
79191+
79192+ done_commit_handle(&ch);
79193+
79194+ return ret;
79195+}
79196+
79197+/* find oldest committed and not played transaction and play it. The transaction
79198+ * was committed and journal header block was updated but the blocks from the
79199+ * process of writing the atom's overwrite set in-place and updating of journal
79200+ * footer block were not completed. This function completes the process by
79201+ * recovering the atom's overwrite set from their wandered locations and writes
79202+ * them in-place and updating the journal footer. */
79203+static int replay_oldest_transaction(struct super_block *s)
79204+{
79205+ reiser4_super_info_data *sbinfo = get_super_private(s);
79206+ jnode *jf = sbinfo->journal_footer;
79207+ unsigned int total;
79208+ struct journal_footer *F;
79209+ struct tx_header *T;
79210+
79211+ reiser4_block_nr prev_tx;
79212+ reiser4_block_nr last_flushed_tx;
79213+ reiser4_block_nr log_rec_block = 0;
79214+
79215+ jnode *tx_head;
79216+
79217+ int ret;
79218+
79219+ if ((ret = jload(jf)) < 0)
79220+ return ret;
79221+
79222+ F = (struct journal_footer *)jdata(jf);
79223+
79224+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
79225+
79226+ jrelse(jf);
79227+
79228+ if (sbinfo->last_committed_tx == last_flushed_tx) {
79229+ /* all transactions are replayed */
79230+ return 0;
79231+ }
79232+
79233+ prev_tx = sbinfo->last_committed_tx;
79234+
79235+ /* searching for oldest not flushed transaction */
79236+ while (1) {
79237+ tx_head = reiser4_alloc_io_head(&prev_tx);
79238+ if (!tx_head)
79239+ return RETERR(-ENOMEM);
79240+
79241+ ret = jload(tx_head);
79242+ if (ret < 0) {
79243+ reiser4_drop_io_head(tx_head);
79244+ return ret;
79245+ }
79246+
79247+ ret = check_tx_head(tx_head);
79248+ if (ret) {
79249+ jrelse(tx_head);
79250+ reiser4_drop_io_head(tx_head);
79251+ return ret;
79252+ }
79253+
79254+ T = (struct tx_header *)jdata(tx_head);
79255+
79256+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
79257+
79258+ if (prev_tx == last_flushed_tx)
79259+ break;
79260+
79261+ jrelse(tx_head);
79262+ reiser4_drop_io_head(tx_head);
79263+ }
79264+
79265+ total = le32_to_cpu(get_unaligned(&T->total));
79266+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
79267+
79268+ pin_jnode_data(tx_head);
79269+ jrelse(tx_head);
79270+
79271+ ret =
79272+ replay_transaction(s, tx_head, &log_rec_block,
79273+ jnode_get_block(tx_head), total - 1);
79274+
79275+ unpin_jnode_data(tx_head);
79276+ reiser4_drop_io_head(tx_head);
79277+
79278+ if (ret)
79279+ return ret;
79280+ return -E_REPEAT;
79281+}
79282+
79283+/* The reiser4 journal current implementation was optimized to not to capture
79284+ super block if certain super blocks fields are modified. Currently, the set
79285+ is (<free block count>, <OID allocator>). These fields are logged by
79286+ special way which includes storing them in each transaction head block at
79287+ atom commit time and writing that information to journal footer block at
79288+ atom flush time. For getting info from journal footer block to the
79289+ in-memory super block there is a special function
79290+ reiser4_journal_recover_sb_data() which should be called after disk format
79291+ plugin re-reads super block after journal replaying.
79292+*/
79293+
79294+/* get the information from journal footer in-memory super block */
79295+int reiser4_journal_recover_sb_data(struct super_block *s)
79296+{
79297+ reiser4_super_info_data *sbinfo = get_super_private(s);
79298+ struct journal_footer *jf;
79299+ int ret;
79300+
79301+ assert("zam-673", sbinfo->journal_footer != NULL);
79302+
79303+ ret = jload(sbinfo->journal_footer);
79304+ if (ret != 0)
79305+ return ret;
79306+
79307+ ret = check_journal_footer(sbinfo->journal_footer);
79308+ if (ret != 0)
79309+ goto out;
79310+
79311+ jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
79312+
79313+ /* was there at least one flushed transaction? */
79314+ if (jf->last_flushed_tx) {
79315+
79316+ /* restore free block counter logged in this transaction */
79317+ reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
79318+
79319+ /* restore oid allocator state */
79320+ oid_init_allocator(s,
79321+ le64_to_cpu(get_unaligned(&jf->nr_files)),
79322+ le64_to_cpu(get_unaligned(&jf->next_oid)));
79323+ }
79324+ out:
79325+ jrelse(sbinfo->journal_footer);
79326+ return ret;
79327+}
79328+
79329+/* reiser4 replay journal procedure */
79330+int reiser4_journal_replay(struct super_block *s)
79331+{
79332+ reiser4_super_info_data *sbinfo = get_super_private(s);
79333+ jnode *jh, *jf;
79334+ struct journal_header *header;
79335+ int nr_tx_replayed = 0;
79336+ int ret;
79337+
79338+ assert("zam-582", sbinfo != NULL);
79339+
79340+ jh = sbinfo->journal_header;
79341+ jf = sbinfo->journal_footer;
79342+
79343+ if (!jh || !jf) {
79344+ /* it is possible that disk layout does not support journal
79345+ structures, we just warn about this */
79346+ warning("zam-583",
79347+ "journal control blocks were not loaded by disk layout plugin. "
79348+ "journal replaying is not possible.\n");
79349+ return 0;
79350+ }
79351+
79352+ /* Take free block count from journal footer block. The free block
79353+ counter value corresponds the last flushed transaction state */
79354+ ret = jload(jf);
79355+ if (ret < 0)
79356+ return ret;
79357+
79358+ ret = check_journal_footer(jf);
79359+ if (ret) {
79360+ jrelse(jf);
79361+ return ret;
79362+ }
79363+
79364+ jrelse(jf);
79365+
79366+ /* store last committed transaction info in reiser4 in-memory super
79367+ block */
79368+ ret = jload(jh);
79369+ if (ret < 0)
79370+ return ret;
79371+
79372+ ret = check_journal_header(jh);
79373+ if (ret) {
79374+ jrelse(jh);
79375+ return ret;
79376+ }
79377+
79378+ header = (struct journal_header *)jdata(jh);
79379+ sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
79380+
79381+ jrelse(jh);
79382+
79383+ /* replay committed transactions */
79384+ while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
79385+ nr_tx_replayed++;
79386+
79387+ return ret;
79388+}
79389+
79390+/* load journal control block (either journal header or journal footer block) */
79391+static int
79392+load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
79393+{
79394+ int ret;
79395+
79396+ *node = reiser4_alloc_io_head(block);
79397+ if (!(*node))
79398+ return RETERR(-ENOMEM);
79399+
79400+ ret = jload(*node);
79401+
79402+ if (ret) {
79403+ reiser4_drop_io_head(*node);
79404+ *node = NULL;
79405+ return ret;
79406+ }
79407+
79408+ pin_jnode_data(*node);
79409+ jrelse(*node);
79410+
79411+ return 0;
79412+}
79413+
79414+/* unload journal header or footer and free jnode */
79415+static void unload_journal_control_block(jnode ** node)
79416+{
79417+ if (*node) {
79418+ unpin_jnode_data(*node);
79419+ reiser4_drop_io_head(*node);
79420+ *node = NULL;
79421+ }
79422+}
79423+
79424+/* release journal control blocks */
79425+void reiser4_done_journal_info(struct super_block *s)
79426+{
79427+ reiser4_super_info_data *sbinfo = get_super_private(s);
79428+
79429+ assert("zam-476", sbinfo != NULL);
79430+
79431+ unload_journal_control_block(&sbinfo->journal_header);
79432+ unload_journal_control_block(&sbinfo->journal_footer);
79433+ rcu_barrier();
79434+}
79435+
79436+/* load journal control blocks */
79437+int reiser4_init_journal_info(struct super_block *s)
79438+{
79439+ reiser4_super_info_data *sbinfo = get_super_private(s);
79440+ journal_location *loc;
79441+ int ret;
79442+
79443+ loc = &sbinfo->jloc;
79444+
79445+ assert("zam-651", loc != NULL);
79446+ assert("zam-652", loc->header != 0);
79447+ assert("zam-653", loc->footer != 0);
79448+
79449+ ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
79450+
79451+ if (ret)
79452+ return ret;
79453+
79454+ ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
79455+
79456+ if (ret) {
79457+ unload_journal_control_block(&sbinfo->journal_header);
79458+ }
79459+
79460+ return ret;
79461+}
79462+
79463+/* Make Linus happy.
79464+ Local variables:
79465+ c-indentation-style: "K&R"
79466+ mode-name: "LC"
79467+ c-basic-offset: 8
79468+ tab-width: 8
79469+ fill-column: 80
79470+ End:
79471+*/
79472diff --git a/fs/reiser4/wander.h b/fs/reiser4/wander.h
79473new file mode 100644
79474index 0000000..8746710
79475--- /dev/null
79476+++ b/fs/reiser4/wander.h
79477@@ -0,0 +1,135 @@
79478+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
79479+
79480+#if !defined (__FS_REISER4_WANDER_H__)
79481+#define __FS_REISER4_WANDER_H__
79482+
79483+#include "dformat.h"
79484+
79485+#include <linux/fs.h> /* for struct super_block */
79486+
79487+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
79488+
79489+#define TX_HEADER_MAGIC "TxMagic4"
79490+#define WANDER_RECORD_MAGIC "LogMagc4"
79491+
79492+#define TX_HEADER_MAGIC_SIZE (8)
79493+#define WANDER_RECORD_MAGIC_SIZE (8)
79494+
79495+/* journal header block format */
79496+struct journal_header {
79497+ /* last written transaction head location */
79498+ d64 last_committed_tx;
79499+};
79500+
79501+typedef struct journal_location {
79502+ reiser4_block_nr footer;
79503+ reiser4_block_nr header;
79504+} journal_location;
79505+
79506+/* The wander.c head comment describes usage and semantic of all these structures */
79507+/* journal footer block format */
79508+struct journal_footer {
79509+ /* last flushed transaction location. */
79510+ /* This block number is no more valid after the transaction it points
79511+ to gets flushed, this number is used only at journal replaying time
79512+ for detection of the end of on-disk list of committed transactions
79513+ which were not flushed completely */
79514+ d64 last_flushed_tx;
79515+
79516+ /* free block counter is written in journal footer at transaction
79517+ flushing , not in super block because free blocks counter is logged
79518+ by another way than super block fields (root pointer, for
79519+ example). */
79520+ d64 free_blocks;
79521+
79522+ /* number of used OIDs and maximal used OID are logged separately from
79523+ super block */
79524+ d64 nr_files;
79525+ d64 next_oid;
79526+};
79527+
79528+/* Each wander record (except the first one) has unified format with wander
79529+ record header followed by an array of log entries */
79530+struct wander_record_header {
79531+ /* when there is no predefined location for wander records, this magic
79532+ string should help reiser4fsck. */
79533+ char magic[WANDER_RECORD_MAGIC_SIZE];
79534+
79535+ /* transaction id */
79536+ d64 id;
79537+
79538+ /* total number of wander records in current transaction */
79539+ d32 total;
79540+
79541+ /* this block number in transaction */
79542+ d32 serial;
79543+
79544+ /* number of previous block in commit */
79545+ d64 next_block;
79546+};
79547+
79548+/* The first wander record (transaction head) of written transaction has the
79549+ special format */
79550+struct tx_header {
79551+ /* magic string makes first block in transaction different from other
79552+ logged blocks, it should help fsck. */
79553+ char magic[TX_HEADER_MAGIC_SIZE];
79554+
79555+ /* transaction id */
79556+ d64 id;
79557+
79558+ /* total number of records (including this first tx head) in the
79559+ transaction */
79560+ d32 total;
79561+
79562+ /* align next field to 8-byte boundary; this field always is zero */
79563+ d32 padding;
79564+
79565+ /* block number of previous transaction head */
79566+ d64 prev_tx;
79567+
79568+ /* next wander record location */
79569+ d64 next_block;
79570+
79571+ /* committed versions of free blocks counter */
79572+ d64 free_blocks;
79573+
79574+ /* number of used OIDs (nr_files) and maximal used OID are logged
79575+ separately from super block */
79576+ d64 nr_files;
79577+ d64 next_oid;
79578+};
79579+
79580+/* A transaction gets written to disk as a set of wander records (each wander
79581+ record size is fs block) */
79582+
79583+/* As it was told above a wander The rest of wander record is filled by these log entries, unused space filled
79584+ by zeroes */
79585+struct wander_entry {
79586+ d64 original; /* block original location */
79587+ d64 wandered; /* block wandered location */
79588+};
79589+
79590+/* REISER4 JOURNAL WRITER FUNCTIONS */
79591+
79592+extern int reiser4_write_logs(long *);
79593+extern int reiser4_journal_replay(struct super_block *);
79594+extern int reiser4_journal_recover_sb_data(struct super_block *);
79595+
79596+extern int reiser4_init_journal_info(struct super_block *);
79597+extern void reiser4_done_journal_info(struct super_block *);
79598+
79599+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
79600+
79601+#endif /* __FS_REISER4_WANDER_H__ */
79602+
79603+/* Make Linus happy.
79604+ Local variables:
79605+ c-indentation-style: "K&R"
79606+ mode-name: "LC"
79607+ c-basic-offset: 8
79608+ tab-width: 8
79609+ fill-column: 80
79610+ scroll-step: 1
79611+ End:
79612+*/
79613diff --git a/fs/reiser4/writeout.h b/fs/reiser4/writeout.h
79614new file mode 100644
79615index 0000000..446b63b
79616--- /dev/null
79617+++ b/fs/reiser4/writeout.h
79618@@ -0,0 +1,21 @@
79619+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
79620+
79621+#if !defined (__FS_REISER4_WRITEOUT_H__)
79622+
79623+#define WRITEOUT_SINGLE_STREAM (0x1)
79624+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
79625+#define WRITEOUT_BARRIER (0x4)
79626+
79627+extern int reiser4_get_writeout_flags(void);
79628+
79629+#endif /* __FS_REISER4_WRITEOUT_H__ */
79630+
79631+/* Make Linus happy.
79632+ Local variables:
79633+ c-indentation-style: "K&R"
79634+ mode-name: "LC"
79635+ c-basic-offset: 8
79636+ tab-width: 8
79637+ fill-column: 80
79638+ End:
79639+*/
79640diff --git a/fs/reiser4/znode.c b/fs/reiser4/znode.c
79641new file mode 100644
79642index 0000000..b695111
79643--- /dev/null
79644+++ b/fs/reiser4/znode.c
79645@@ -0,0 +1,1029 @@
79646+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
79647+ * reiser4/README */
79648+/* Znode manipulation functions. */
79649+/* Znode is the in-memory header for a tree node. It is stored
79650+ separately from the node itself so that it does not get written to
79651+ disk. In this respect znode is like buffer head or page head. We
79652+ also use znodes for additional reiser4 specific purposes:
79653+
79654+ . they are organized into tree structure which is a part of whole
79655+ reiser4 tree.
79656+ . they are used to implement node grained locking
79657+ . they are used to keep additional state associated with a
79658+ node
79659+ . they contain links to lists used by the transaction manager
79660+
79661+ Znode is attached to some variable "block number" which is instance of
79662+ fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
79663+ appropriate node being actually loaded in memory. Existence of znode itself
79664+ is regulated by reference count (->x_count) in it. Each time thread
79665+ acquires reference to znode through call to zget(), ->x_count is
79666+ incremented and decremented on call to zput(). Data (content of node) are
79667+ brought in memory through call to zload(), which also increments ->d_count
79668+ reference counter. zload can block waiting on IO. Call to zrelse()
79669+ decreases this counter. Also, ->c_count keeps track of number of child
79670+ znodes and prevents parent znode from being recycled until all of its
79671+ children are. ->c_count is decremented whenever child goes out of existence
79672+ (being actually recycled in zdestroy()) which can be some time after last
79673+ reference to this child dies if we support some form of LRU cache for
79674+ znodes.
79675+
79676+*/
79677+/* EVERY ZNODE'S STORY
79678+
79679+ 1. His infancy.
79680+
79681+ Once upon a time, the znode was born deep inside of zget() by call to
79682+ zalloc(). At the return from zget() znode had:
79683+
79684+ . reference counter (x_count) of 1
79685+ . assigned block number, marked as used in bitmap
79686+ . pointer to parent znode. Root znode parent pointer points
79687+ to its father: "fake" znode. This, in turn, has NULL parent pointer.
79688+ . hash table linkage
79689+ . no data loaded from disk
79690+ . no node plugin
79691+ . no sibling linkage
79692+
79693+ 2. His childhood
79694+
79695+ Each node is either brought into memory as a result of tree traversal, or
79696+ created afresh, creation of the root being a special case of the latter. In
79697+ either case it's inserted into sibling list. This will typically require
79698+ some ancillary tree traversing, but ultimately both sibling pointers will
79699+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
79700+ zjnode.state.
79701+
79702+ 3. His youth.
79703+
79704+ If znode is bound to already existing node in a tree, its content is read
79705+ from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
79706+ in zjnode.state and zdata() function starts to return non null for this
79707+ znode. zload() further calls zparse() that determines which node layout
79708+ this node is rendered in, and sets ->nplug on success.
79709+
79710+ If znode is for new node just created, memory for it is allocated and
79711+ zinit_new() function is called to initialise data, according to selected
79712+ node layout.
79713+
79714+ 4. His maturity.
79715+
79716+ After this point, znode lingers in memory for some time. Threads can
79717+ acquire references to znode either by blocknr through call to zget(), or by
79718+ following a pointer to unallocated znode from internal item. Each time
79719+ reference to znode is obtained, x_count is increased. Thread can read/write
79720+ lock znode. Znode data can be loaded through calls to zload(), d_count will
79721+ be increased appropriately. If all references to znode are released
79722+ (x_count drops to 0), znode is not recycled immediately. Rather, it is
79723+ still cached in the hash table in the hope that it will be accessed
79724+ shortly.
79725+
79726+ There are two ways in which znode existence can be terminated:
79727+
79728+ . sudden death: node bound to this znode is removed from the tree
79729+ . overpopulation: znode is purged out of memory due to memory pressure
79730+
79731+ 5. His death.
79732+
79733+ Death is complex process.
79734+
79735+ When we irrevocably commit ourselves to decision to remove node from the
79736+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
79737+ znode. This is done either in ->kill_hook() of internal item or in
79738+ reiser4_kill_root() function when tree root is removed.
79739+
79740+ At this moment znode still has:
79741+
79742+ . locks held on it, necessary write ones
79743+ . references to it
79744+ . disk block assigned to it
79745+ . data loaded from the disk
79746+ . pending requests for lock
79747+
79748+ But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
79749+ deletion. Node deletion includes two phases. First all ways to get
79750+ references to that znode (sibling and parent links and hash lookup using
79751+ block number stored in parent node) should be deleted -- it is done through
79752+ sibling_list_remove(), also we assume that nobody uses down link from
79753+ parent node due to its nonexistence or proper parent node locking and
79754+ nobody uses parent pointers from children due to absence of them. Second we
79755+ invalidate all pending lock requests which still are on znode's lock
79756+ request queue, this is done by reiser4_invalidate_lock(). Another
79757+ JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
79758+ Once it set all requesters are forced to return -EINVAL from
79759+ longterm_lock_znode(). Future locking attempts are not possible because all
79760+ ways to get references to that znode are removed already. Last, node is
79761+ uncaptured from transaction.
79762+
79763+ When last reference to the dying znode is just about to be released,
79764+ block number for this lock is released and znode is removed from the
79765+ hash table.
79766+
79767+ Now znode can be recycled.
79768+
79769+ [it's possible to free bitmap block and remove znode from the hash
79770+ table when last lock is released. This will result in having
79771+ referenced but completely orphaned znode]
79772+
79773+ 6. Limbo
79774+
79775+ As have been mentioned above znodes with reference counter 0 are
79776+ still cached in a hash table. Once memory pressure increases they are
79777+ purged out of there [this requires something like LRU list for
79778+ efficient implementation. LRU list would also greatly simplify
79779+ implementation of coord cache that would in this case morph to just
79780+ scanning some initial segment of LRU list]. Data loaded into
79781+ unreferenced znode are flushed back to the durable storage if
79782+ necessary and memory is freed. Znodes themselves can be recycled at
79783+ this point too.
79784+
79785+*/
79786+
79787+#include "debug.h"
79788+#include "dformat.h"
79789+#include "key.h"
79790+#include "coord.h"
79791+#include "plugin/plugin_header.h"
79792+#include "plugin/node/node.h"
79793+#include "plugin/plugin.h"
79794+#include "txnmgr.h"
79795+#include "jnode.h"
79796+#include "znode.h"
79797+#include "block_alloc.h"
79798+#include "tree.h"
79799+#include "tree_walk.h"
79800+#include "super.h"
79801+#include "reiser4.h"
79802+
79803+#include <linux/pagemap.h>
79804+#include <linux/spinlock.h>
79805+#include <linux/slab.h>
79806+#include <linux/err.h>
79807+
79808+static z_hash_table *get_htable(reiser4_tree *,
79809+ const reiser4_block_nr * const blocknr);
79810+static z_hash_table *znode_get_htable(const znode *);
79811+static void zdrop(znode *);
79812+
79813+/* hash table support */
79814+
79815+/* compare two block numbers for equality. Used by hash-table macros */
79816+static inline int
79817+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79818+{
79819+ assert("nikita-534", b1 != NULL);
79820+ assert("nikita-535", b2 != NULL);
79821+
79822+ return *b1 == *b2;
79823+}
79824+
79825+/* Hash znode by block number. Used by hash-table macros */
79826+/* Audited by: umka (2002.06.11) */
79827+static inline __u32
79828+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79829+{
79830+ assert("nikita-536", b != NULL);
79831+
79832+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79833+}
79834+
79835+/* The hash table definition */
79836+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
79837+#define KFREE(ptr, size) kfree(ptr)
79838+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79839+ blknrhashfn, blknreq);
79840+#undef KFREE
79841+#undef KMALLOC
79842+
79843+/* slab for znodes */
79844+static struct kmem_cache *znode_cache;
79845+
79846+int znode_shift_order;
79847+
79848+/**
79849+ * init_znodes - create znode cache
79850+ *
79851+ * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79852+ */
79853+int init_znodes(void)
79854+{
79855+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79856+ SLAB_HWCACHE_ALIGN |
79857+ SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79858+ if (znode_cache == NULL)
79859+ return RETERR(-ENOMEM);
79860+
79861+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79862+ ++znode_shift_order);
79863+ --znode_shift_order;
79864+ return 0;
79865+}
79866+
79867+/**
79868+ * done_znodes - delete znode cache
79869+ *
79870+ * This is called on reiser4 module unloading or system shutdown.
79871+ */
79872+void done_znodes(void)
79873+{
79874+ destroy_reiser4_cache(&znode_cache);
79875+}
79876+
79877+/* call this to initialise tree of znodes */
79878+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79879+{
79880+ int result;
79881+ assert("umka-050", tree != NULL);
79882+
79883+ rwlock_init(&tree->dk_lock);
79884+
79885+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79886+ if (result != 0)
79887+ return result;
79888+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79889+ return result;
79890+}
79891+
79892+/* free this znode */
79893+void zfree(znode * node /* znode to free */ )
79894+{
79895+ assert("nikita-465", node != NULL);
79896+ assert("nikita-2120", znode_page(node) == NULL);
79897+ assert("nikita-2301", list_empty_careful(&node->lock.owners));
79898+ assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79899+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79900+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79901+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79902+ assert("nikita-3293", !znode_is_right_connected(node));
79903+ assert("nikita-3294", !znode_is_left_connected(node));
79904+ assert("nikita-3295", node->left == NULL);
79905+ assert("nikita-3296", node->right == NULL);
79906+
79907+ /* not yet phash_jnode_destroy(ZJNODE(node)); */
79908+
79909+ kmem_cache_free(znode_cache, node);
79910+}
79911+
79912+/* call this to free tree of znodes */
79913+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79914+{
79915+ znode *node;
79916+ znode *next;
79917+ z_hash_table *ztable;
79918+
79919+ /* scan znode hash-tables and kill all znodes, then free hash tables
79920+ * themselves. */
79921+
79922+ assert("nikita-795", tree != NULL);
79923+
79924+ ztable = &tree->zhash_table;
79925+
79926+ if (ztable->_table != NULL) {
79927+ for_all_in_htable(ztable, z, node, next) {
79928+ node->c_count = 0;
79929+ node->in_parent.node = NULL;
79930+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79931+ zdrop(node);
79932+ }
79933+
79934+ z_hash_done(&tree->zhash_table);
79935+ }
79936+
79937+ ztable = &tree->zfake_table;
79938+
79939+ if (ztable->_table != NULL) {
79940+ for_all_in_htable(ztable, z, node, next) {
79941+ node->c_count = 0;
79942+ node->in_parent.node = NULL;
79943+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79944+ zdrop(node);
79945+ }
79946+
79947+ z_hash_done(&tree->zfake_table);
79948+ }
79949+}
79950+
79951+/* ZNODE STRUCTURES */
79952+
79953+/* allocate fresh znode */
79954+znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79955+{
79956+ znode *node;
79957+
79958+ node = kmem_cache_alloc(znode_cache, gfp_flag);
79959+ return node;
79960+}
79961+
79962+/* Initialize fields of znode
79963+ @node: znode to initialize;
79964+ @parent: parent znode;
79965+ @tree: tree we are in. */
79966+void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79967+{
79968+ assert("nikita-466", node != NULL);
79969+ assert("umka-268", current_tree != NULL);
79970+
79971+ memset(node, 0, sizeof *node);
79972+
79973+ assert("umka-051", tree != NULL);
79974+
79975+ jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79976+ reiser4_init_lock(&node->lock);
79977+ init_parent_coord(&node->in_parent, parent);
79978+}
79979+
79980+/*
79981+ * remove znode from indices. This is called jput() when last reference on
79982+ * znode is released.
79983+ */
79984+void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79985+{
79986+ assert("nikita-2108", node != NULL);
79987+ assert("nikita-470", node->c_count == 0);
79988+ assert_rw_write_locked(&(tree->tree_lock));
79989+
79990+ /* remove reference to this znode from cbk cache */
79991+ cbk_cache_invalidate(node, tree);
79992+
79993+ /* update c_count of parent */
79994+ if (znode_parent(node) != NULL) {
79995+ assert("nikita-472", znode_parent(node)->c_count > 0);
79996+ /* father, onto your hands I forward my spirit... */
79997+ znode_parent(node)->c_count--;
79998+ node->in_parent.node = NULL;
79999+ } else {
80000+ /* orphaned znode?! Root? */
80001+ }
80002+
80003+ /* remove znode from hash-table */
80004+ z_hash_remove_rcu(znode_get_htable(node), node);
80005+}
80006+
80007+/* zdrop() -- Remove znode from the tree.
80008+
80009+ This is called when znode is removed from the memory. */
80010+static void zdrop(znode * node /* znode to finish with */ )
80011+{
80012+ jdrop(ZJNODE(node));
80013+}
80014+
80015+/*
80016+ * put znode into right place in the hash table. This is called by relocate
80017+ * code.
80018+ */
80019+int znode_rehash(znode * node /* node to rehash */ ,
80020+ const reiser4_block_nr * new_block_nr /* new block number */ )
80021+{
80022+ z_hash_table *oldtable;
80023+ z_hash_table *newtable;
80024+ reiser4_tree *tree;
80025+
80026+ assert("nikita-2018", node != NULL);
80027+
80028+ tree = znode_get_tree(node);
80029+ oldtable = znode_get_htable(node);
80030+ newtable = get_htable(tree, new_block_nr);
80031+
80032+ write_lock_tree(tree);
80033+ /* remove znode from hash-table */
80034+ z_hash_remove_rcu(oldtable, node);
80035+
80036+ /* assertion no longer valid due to RCU */
80037+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
80038+
80039+ /* update blocknr */
80040+ znode_set_block(node, new_block_nr);
80041+ node->zjnode.key.z = *new_block_nr;
80042+
80043+ /* insert it into hash */
80044+ z_hash_insert_rcu(newtable, node);
80045+ write_unlock_tree(tree);
80046+ return 0;
80047+}
80048+
80049+/* ZNODE LOOKUP, GET, PUT */
80050+
80051+/* zlook() - get znode with given block_nr in a hash table or return NULL
80052+
80053+ If result is non-NULL then the znode's x_count is incremented. Internal version
80054+ accepts pre-computed hash index. The hash table is accessed under caller's
80055+ tree->hash_lock.
80056+*/
80057+znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
80058+{
80059+ znode *result;
80060+ __u32 hash;
80061+ z_hash_table *htable;
80062+
80063+ assert("jmacd-506", tree != NULL);
80064+ assert("jmacd-507", blocknr != NULL);
80065+
80066+ htable = get_htable(tree, blocknr);
80067+ hash = blknrhashfn(htable, blocknr);
80068+
80069+ rcu_read_lock();
80070+ result = z_hash_find_index(htable, hash, blocknr);
80071+
80072+ if (result != NULL) {
80073+ add_x_ref(ZJNODE(result));
80074+ result = znode_rip_check(tree, result);
80075+ }
80076+ rcu_read_unlock();
80077+
80078+ return result;
80079+}
80080+
80081+/* return hash table where znode with block @blocknr is (or should be)
80082+ * stored */
80083+static z_hash_table *get_htable(reiser4_tree * tree,
80084+ const reiser4_block_nr * const blocknr)
80085+{
80086+ z_hash_table *table;
80087+ if (is_disk_addr_unallocated(blocknr))
80088+ table = &tree->zfake_table;
80089+ else
80090+ table = &tree->zhash_table;
80091+ return table;
80092+}
80093+
80094+/* return hash table where znode @node is (or should be) stored */
80095+static z_hash_table *znode_get_htable(const znode * node)
80096+{
80097+ return get_htable(znode_get_tree(node), znode_get_block(node));
80098+}
80099+
80100+/* zget() - get znode from hash table, allocating it if necessary.
80101+
80102+ First a call to zlook, locating a x-referenced znode if one
80103+ exists. If znode is not found, allocate new one and return. Result
80104+ is returned with x_count reference increased.
80105+
80106+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
80107+ LOCK ORDERING: NONE
80108+*/
80109+znode *zget(reiser4_tree * tree,
80110+ const reiser4_block_nr * const blocknr,
80111+ znode * parent, tree_level level, gfp_t gfp_flag)
80112+{
80113+ znode *result;
80114+ __u32 hashi;
80115+
80116+ z_hash_table *zth;
80117+
80118+ assert("jmacd-512", tree != NULL);
80119+ assert("jmacd-513", blocknr != NULL);
80120+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
80121+
80122+ zth = get_htable(tree, blocknr);
80123+ hashi = blknrhashfn(zth, blocknr);
80124+
80125+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not
80126+ implemented. */
80127+
80128+ z_hash_prefetch_bucket(zth, hashi);
80129+
80130+ rcu_read_lock();
80131+ /* Find a matching BLOCKNR in the hash table. If the znode is found,
80132+ we obtain an reference (x_count) but the znode remains unlocked.
80133+ Have to worry about race conditions later. */
80134+ result = z_hash_find_index(zth, hashi, blocknr);
80135+ /* According to the current design, the hash table lock protects new
80136+ znode references. */
80137+ if (result != NULL) {
80138+ add_x_ref(ZJNODE(result));
80139+ /* NOTE-NIKITA it should be so, but special case during
80140+ creation of new root makes such assertion highly
80141+ complicated. */
80142+ assert("nikita-2131", 1 || znode_parent(result) == parent ||
80143+ (ZF_ISSET(result, JNODE_ORPHAN)
80144+ && (znode_parent(result) == NULL)));
80145+ result = znode_rip_check(tree, result);
80146+ }
80147+
80148+ rcu_read_unlock();
80149+
80150+ if (!result) {
80151+ znode *shadow;
80152+
80153+ result = zalloc(gfp_flag);
80154+ if (!result) {
80155+ return ERR_PTR(RETERR(-ENOMEM));
80156+ }
80157+
80158+ zinit(result, parent, tree);
80159+ ZJNODE(result)->blocknr = *blocknr;
80160+ ZJNODE(result)->key.z = *blocknr;
80161+ result->level = level;
80162+
80163+ write_lock_tree(tree);
80164+
80165+ shadow = z_hash_find_index(zth, hashi, blocknr);
80166+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
80167+ jnode_list_remove(ZJNODE(result));
80168+ zfree(result);
80169+ result = shadow;
80170+ } else {
80171+ result->version = znode_build_version(tree);
80172+ z_hash_insert_index_rcu(zth, hashi, result);
80173+
80174+ if (parent != NULL)
80175+ ++parent->c_count;
80176+ }
80177+
80178+ add_x_ref(ZJNODE(result));
80179+
80180+ write_unlock_tree(tree);
80181+ }
80182+#if REISER4_DEBUG
80183+ if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
80184+ reiser4_check_block(blocknr, 1);
80185+#endif
80186+ /* Check for invalid tree level, return -EIO */
80187+ if (unlikely(znode_get_level(result) != level)) {
80188+ warning("jmacd-504",
80189+ "Wrong level for cached block %llu: %i expecting %i",
80190+ (unsigned long long)(*blocknr), znode_get_level(result),
80191+ level);
80192+ zput(result);
80193+ return ERR_PTR(RETERR(-EIO));
80194+ }
80195+
80196+ assert("nikita-1227", znode_invariant(result));
80197+
80198+ return result;
80199+}
80200+
80201+/* ZNODE PLUGINS/DATA */
80202+
80203+/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
80204+ stored at the fixed offset from the beginning of the node. */
80205+static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
80206+ * plugin of */ )
80207+{
80208+ reiser4_tree *tree;
80209+
80210+ assert("nikita-1053", node != NULL);
80211+ assert("nikita-1055", zdata(node) != NULL);
80212+
80213+ tree = znode_get_tree(node);
80214+ assert("umka-053", tree != NULL);
80215+
80216+ if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
80217+ return tree->nplug;
80218+ } else {
80219+ return node_plugin_by_disk_id
80220+ (tree, &((common_node_header *) zdata(node))->plugin_id);
80221+#ifdef GUESS_EXISTS
80222+ reiser4_plugin *plugin;
80223+
80224+ /* NOTE-NIKITA add locking here when dynamic plugins will be
80225+ * implemented */
80226+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
80227+ if ((plugin->u.node.guess != NULL)
80228+ && plugin->u.node.guess(node))
80229+ return plugin;
80230+ }
80231+ warning("nikita-1057", "Cannot guess node plugin");
80232+ print_znode("node", node);
80233+ return NULL;
80234+#endif
80235+ }
80236+}
80237+
80238+/* parse node header and install ->node_plugin */
80239+int zparse(znode * node /* znode to parse */ )
80240+{
80241+ int result;
80242+
80243+ assert("nikita-1233", node != NULL);
80244+ assert("nikita-2370", zdata(node) != NULL);
80245+
80246+ if (node->nplug == NULL) {
80247+ node_plugin *nplug;
80248+
80249+ nplug = znode_guess_plugin(node);
80250+ if (likely(nplug != NULL)) {
80251+ result = nplug->parse(node);
80252+ if (likely(result == 0))
80253+ node->nplug = nplug;
80254+ } else {
80255+ result = RETERR(-EIO);
80256+ }
80257+ } else
80258+ result = 0;
80259+ return result;
80260+}
80261+
80262+/* zload with readahead */
80263+int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
80264+{
80265+ int result;
80266+
80267+ assert("nikita-484", node != NULL);
80268+ assert("nikita-1377", znode_invariant(node));
80269+ assert("jmacd-7771", !znode_above_root(node));
80270+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
80271+ assert("nikita-3016", reiser4_schedulable());
80272+
80273+ if (info)
80274+ formatted_readahead(node, info);
80275+
80276+ result = jload(ZJNODE(node));
80277+ assert("nikita-1378", znode_invariant(node));
80278+ return result;
80279+}
80280+
80281+/* load content of node into memory */
80282+int zload(znode * node)
80283+{
80284+ return zload_ra(node, NULL);
80285+}
80286+
80287+/* call node plugin to initialise newly allocated node. */
80288+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
80289+{
80290+ return jinit_new(ZJNODE(node), gfp_flags);
80291+}
80292+
80293+/* drop reference to node data. When last reference is dropped, data are
80294+ unloaded. */
80295+void zrelse(znode * node /* znode to release references to */ )
80296+{
80297+ assert("nikita-1381", znode_invariant(node));
80298+
80299+ jrelse(ZJNODE(node));
80300+}
80301+
80302+/* returns free space in node */
80303+unsigned znode_free_space(znode * node /* znode to query */ )
80304+{
80305+ assert("nikita-852", node != NULL);
80306+ return node_plugin_by_node(node)->free_space(node);
80307+}
80308+
80309+/* left delimiting key of znode */
80310+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
80311+{
80312+ assert("nikita-958", node != NULL);
80313+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
80314+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
80315+ assert("nikita-30671", node->rd_key_version != 0);
80316+ return &node->rd_key;
80317+}
80318+
80319+/* right delimiting key of znode */
80320+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
80321+{
80322+ assert("nikita-974", node != NULL);
80323+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
80324+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
80325+ assert("nikita-30681", node->ld_key_version != 0);
80326+ return &node->ld_key;
80327+}
80328+
80329+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
80330+ )
80331+
80332+/* update right-delimiting key of @node */
80333+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
80334+{
80335+ assert("nikita-2937", node != NULL);
80336+ assert("nikita-2939", key != NULL);
80337+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
80338+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
80339+ assert("nikita-2944",
80340+ znode_is_any_locked(node) ||
80341+ znode_get_level(node) != LEAF_LEVEL ||
80342+ keyge(key, &node->rd_key) ||
80343+ keyeq(&node->rd_key, reiser4_min_key()) ||
80344+ ZF_ISSET(node, JNODE_HEARD_BANSHEE));
80345+
80346+ node->rd_key = *key;
80347+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
80348+ return &node->rd_key;
80349+}
80350+
80351+/* update left-delimiting key of @node */
80352+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
80353+{
80354+ assert("nikita-2940", node != NULL);
80355+ assert("nikita-2941", key != NULL);
80356+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
80357+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
80358+ assert("nikita-2943",
80359+ znode_is_any_locked(node) || keyeq(&node->ld_key,
80360+ reiser4_min_key()));
80361+
80362+ node->ld_key = *key;
80363+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
80364+ return &node->ld_key;
80365+}
80366+
80367+/* true if @key is inside key range for @node */
80368+int znode_contains_key(znode * node /* znode to look in */ ,
80369+ const reiser4_key * key /* key to look for */ )
80370+{
80371+ assert("nikita-1237", node != NULL);
80372+ assert("nikita-1238", key != NULL);
80373+
80374+ /* left_delimiting_key <= key <= right_delimiting_key */
80375+ return keyle(znode_get_ld_key(node), key)
80376+ && keyle(key, znode_get_rd_key(node));
80377+}
80378+
80379+/* same as znode_contains_key(), but lock dk lock */
80380+int znode_contains_key_lock(znode * node /* znode to look in */ ,
80381+ const reiser4_key * key /* key to look for */ )
80382+{
80383+ int result;
80384+
80385+ assert("umka-056", node != NULL);
80386+ assert("umka-057", key != NULL);
80387+
80388+ read_lock_dk(znode_get_tree(node));
80389+ result = znode_contains_key(node, key);
80390+ read_unlock_dk(znode_get_tree(node));
80391+ return result;
80392+}
80393+
80394+/* get parent pointer, assuming tree is not locked */
80395+znode *znode_parent_nolock(const znode * node /* child znode */ )
80396+{
80397+ assert("nikita-1444", node != NULL);
80398+ return node->in_parent.node;
80399+}
80400+
80401+/* get parent pointer of znode */
80402+znode *znode_parent(const znode * node /* child znode */ )
80403+{
80404+ assert("nikita-1226", node != NULL);
80405+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
80406+ return znode_parent_nolock(node);
80407+}
80408+
80409+/* detect uber znode used to protect in-superblock tree root pointer */
80410+int znode_above_root(const znode * node /* znode to query */ )
80411+{
80412+ assert("umka-059", node != NULL);
80413+
80414+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
80415+}
80416+
80417+/* check that @node is root---that its block number is recorder in the tree as
80418+ that of root node */
80419+#if REISER4_DEBUG
80420+static int znode_is_true_root(const znode * node /* znode to query */ )
80421+{
80422+ assert("umka-060", node != NULL);
80423+ assert("umka-061", current_tree != NULL);
80424+
80425+ return disk_addr_eq(znode_get_block(node),
80426+ &znode_get_tree(node)->root_block);
80427+}
80428+#endif
80429+
80430+/* check that @node is root */
80431+int znode_is_root(const znode * node /* znode to query */ )
80432+{
80433+ assert("nikita-1206", node != NULL);
80434+
80435+ return znode_get_level(node) == znode_get_tree(node)->height;
80436+}
80437+
80438+/* Returns true is @node was just created by zget() and wasn't ever loaded
80439+ into memory. */
80440+/* NIKITA-HANS: yes */
80441+int znode_just_created(const znode * node)
80442+{
80443+ assert("nikita-2188", node != NULL);
80444+ return (znode_page(node) == NULL);
80445+}
80446+
80447+/* obtain updated ->znode_epoch. See seal.c for description. */
80448+__u64 znode_build_version(reiser4_tree * tree)
80449+{
80450+ __u64 result;
80451+
80452+ spin_lock(&tree->epoch_lock);
80453+ result = ++tree->znode_epoch;
80454+ spin_unlock(&tree->epoch_lock);
80455+ return result;
80456+}
80457+
80458+void init_load_count(load_count * dh)
80459+{
80460+ assert("nikita-2105", dh != NULL);
80461+ memset(dh, 0, sizeof *dh);
80462+}
80463+
80464+void done_load_count(load_count * dh)
80465+{
80466+ assert("nikita-2106", dh != NULL);
80467+ if (dh->node != NULL) {
80468+ for (; dh->d_ref > 0; --dh->d_ref)
80469+ zrelse(dh->node);
80470+ dh->node = NULL;
80471+ }
80472+}
80473+
80474+static int incr_load_count(load_count * dh)
80475+{
80476+ int result;
80477+
80478+ assert("nikita-2110", dh != NULL);
80479+ assert("nikita-2111", dh->node != NULL);
80480+
80481+ result = zload(dh->node);
80482+ if (result == 0)
80483+ ++dh->d_ref;
80484+ return result;
80485+}
80486+
80487+int incr_load_count_znode(load_count * dh, znode * node)
80488+{
80489+ assert("nikita-2107", dh != NULL);
80490+ assert("nikita-2158", node != NULL);
80491+ assert("nikita-2109",
80492+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
80493+
80494+ dh->node = node;
80495+ return incr_load_count(dh);
80496+}
80497+
80498+int incr_load_count_jnode(load_count * dh, jnode * node)
80499+{
80500+ if (jnode_is_znode(node)) {
80501+ return incr_load_count_znode(dh, JZNODE(node));
80502+ }
80503+ return 0;
80504+}
80505+
80506+void copy_load_count(load_count * new, load_count * old)
80507+{
80508+ int ret = 0;
80509+ done_load_count(new);
80510+ new->node = old->node;
80511+ new->d_ref = 0;
80512+
80513+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
80514+ }
80515+
80516+ assert("jmacd-87589", ret == 0);
80517+}
80518+
80519+void move_load_count(load_count * new, load_count * old)
80520+{
80521+ done_load_count(new);
80522+ new->node = old->node;
80523+ new->d_ref = old->d_ref;
80524+ old->node = NULL;
80525+ old->d_ref = 0;
80526+}
80527+
80528+/* convert parent pointer into coord */
80529+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
80530+{
80531+ assert("nikita-3204", pcoord != NULL);
80532+ assert("nikita-3205", coord != NULL);
80533+
80534+ coord_init_first_unit_nocheck(coord, pcoord->node);
80535+ coord_set_item_pos(coord, pcoord->item_pos);
80536+ coord->between = AT_UNIT;
80537+}
80538+
80539+/* pack coord into parent_coord_t */
80540+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
80541+{
80542+ assert("nikita-3206", pcoord != NULL);
80543+ assert("nikita-3207", coord != NULL);
80544+
80545+ pcoord->node = coord->node;
80546+ pcoord->item_pos = coord->item_pos;
80547+}
80548+
80549+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
80550+ look for comments there) */
80551+void init_parent_coord(parent_coord_t * pcoord, const znode * node)
80552+{
80553+ pcoord->node = (znode *) node;
80554+ pcoord->item_pos = (unsigned short)~0;
80555+}
80556+
80557+#if REISER4_DEBUG
80558+
80559+/* debugging aid: znode invariant */
80560+static int znode_invariant_f(const znode * node /* znode to check */ ,
80561+ char const **msg /* where to store error
80562+ * message, if any */ )
80563+{
80564+#define _ergo(ant, con) \
80565+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
80566+
80567+#define _equi(e1, e2) \
80568+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
80569+
80570+#define _check(exp) ((*msg) = #exp, (exp))
80571+
80572+ return jnode_invariant_f(ZJNODE(node), msg) &&
80573+ /* [znode-fake] invariant */
80574+ /* fake znode doesn't have a parent, and */
80575+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
80576+ /* there is another way to express this very check, and */
80577+ _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
80578+ /* it has special block number, and */
80579+ _ergo(znode_get_level(node) == 0,
80580+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80581+ /* it is the only znode with such block number, and */
80582+ _ergo(!znode_above_root(node) && znode_is_loaded(node),
80583+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80584+ /* it is parent of the tree root node */
80585+ _ergo(znode_is_true_root(node),
80586+ znode_above_root(znode_parent(node))) &&
80587+ /* [znode-level] invariant */
80588+ /* level of parent znode is one larger than that of child,
80589+ except for the fake znode, and */
80590+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
80591+ znode_get_level(znode_parent(node)) ==
80592+ znode_get_level(node) + 1) &&
80593+ /* left neighbor is at the same level, and */
80594+ _ergo(znode_is_left_connected(node) && node->left != NULL,
80595+ znode_get_level(node) == znode_get_level(node->left)) &&
80596+ /* right neighbor is at the same level */
80597+ _ergo(znode_is_right_connected(node) && node->right != NULL,
80598+ znode_get_level(node) == znode_get_level(node->right)) &&
80599+ /* [znode-connected] invariant */
80600+ _ergo(node->left != NULL, znode_is_left_connected(node)) &&
80601+ _ergo(node->right != NULL, znode_is_right_connected(node)) &&
80602+ _ergo(!znode_is_root(node) && node->left != NULL,
80603+ znode_is_right_connected(node->left) &&
80604+ node->left->right == node) &&
80605+ _ergo(!znode_is_root(node) && node->right != NULL,
80606+ znode_is_left_connected(node->right) &&
80607+ node->right->left == node) &&
80608+ /* [znode-c_count] invariant */
80609+ /* for any znode, c_count of its parent is greater than 0 */
80610+ _ergo(znode_parent(node) != NULL &&
80611+ !znode_above_root(znode_parent(node)),
80612+ znode_parent(node)->c_count > 0) &&
80613+ /* leaves don't have children */
80614+ _ergo(znode_get_level(node) == LEAF_LEVEL,
80615+ node->c_count == 0) &&
80616+ _check(node->zjnode.jnodes.prev != NULL) &&
80617+ _check(node->zjnode.jnodes.next != NULL) &&
80618+ /* orphan doesn't have a parent */
80619+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
80620+ /* [znode-modify] invariant */
80621+ /* if znode is not write-locked, its checksum remains
80622+ * invariant */
80623+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
80624+ * cannot check this. */
80625+ /* [znode-refs] invariant */
80626+ /* only referenced znode can be long-term locked */
80627+ _ergo(znode_is_locked(node),
80628+ atomic_read(&ZJNODE(node)->x_count) != 0);
80629+}
80630+
80631+/* debugging aid: check znode invariant and panic if it doesn't hold */
80632+int znode_invariant(znode * node /* znode to check */ )
80633+{
80634+ char const *failed_msg;
80635+ int result;
80636+
80637+ assert("umka-063", node != NULL);
80638+ assert("umka-064", current_tree != NULL);
80639+
80640+ spin_lock_znode(node);
80641+ read_lock_tree(znode_get_tree(node));
80642+ result = znode_invariant_f(node, &failed_msg);
80643+ if (!result) {
80644+ /* print_znode("corrupted node", node); */
80645+ warning("jmacd-555", "Condition %s failed", failed_msg);
80646+ }
80647+ read_unlock_tree(znode_get_tree(node));
80648+ spin_unlock_znode(node);
80649+ return result;
80650+}
80651+
80652+/* return non-0 iff data are loaded into znode */
80653+int znode_is_loaded(const znode * node /* znode to query */ )
80654+{
80655+ assert("nikita-497", node != NULL);
80656+ return jnode_is_loaded(ZJNODE(node));
80657+}
80658+
80659+unsigned long znode_times_locked(const znode * z)
80660+{
80661+ return z->times_locked;
80662+}
80663+
80664+#endif /* REISER4_DEBUG */
80665+
80666+/* Make Linus happy.
80667+ Local variables:
80668+ c-indentation-style: "K&R"
80669+ mode-name: "LC"
80670+ c-basic-offset: 8
80671+ tab-width: 8
80672+ fill-column: 120
80673+ End:
80674+*/
80675diff --git a/fs/reiser4/znode.h b/fs/reiser4/znode.h
80676new file mode 100644
80677index 0000000..4699d0f
80678--- /dev/null
80679+++ b/fs/reiser4/znode.h
80680@@ -0,0 +1,434 @@
80681+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
80682+ * reiser4/README */
80683+
80684+/* Declaration of znode (Zam's node). See znode.c for more details. */
80685+
80686+#ifndef __ZNODE_H__
80687+#define __ZNODE_H__
80688+
80689+#include "forward.h"
80690+#include "debug.h"
80691+#include "dformat.h"
80692+#include "key.h"
80693+#include "coord.h"
80694+#include "plugin/node/node.h"
80695+#include "jnode.h"
80696+#include "lock.h"
80697+#include "readahead.h"
80698+
80699+#include <linux/types.h>
80700+#include <linux/spinlock.h>
80701+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
80702+#include <asm/atomic.h>
80703+#include <asm/semaphore.h>
80704+
80705+/* znode tracks its position within parent (internal item in a parent node,
80706+ * that contains znode's block number). */
80707+typedef struct parent_coord {
80708+ znode *node;
80709+ pos_in_node_t item_pos;
80710+} parent_coord_t;
80711+
80712+/* &znode - node in a reiser4 tree.
80713+
80714+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
80715+ cacheline pressure.
80716+
80717+ Locking:
80718+
80719+ Long term: data in a disk node attached to this znode are protected
80720+ by long term, deadlock aware lock ->lock;
80721+
80722+ Spin lock: the following fields are protected by the spin lock:
80723+
80724+ ->lock
80725+
80726+ Following fields are protected by the global tree lock:
80727+
80728+ ->left
80729+ ->right
80730+ ->in_parent
80731+ ->c_count
80732+
80733+ Following fields are protected by the global delimiting key lock (dk_lock):
80734+
80735+ ->ld_key (to update ->ld_key long-term lock on the node is also required)
80736+ ->rd_key
80737+
80738+ Following fields are protected by the long term lock:
80739+
80740+ ->nr_items
80741+
80742+ ->node_plugin is never changed once set. This means that after code made
80743+ itself sure that field is valid it can be accessed without any additional
80744+ locking.
80745+
80746+ ->level is immutable.
80747+
80748+ Invariants involving this data-type:
80749+
80750+ [znode-fake]
80751+ [znode-level]
80752+ [znode-connected]
80753+ [znode-c_count]
80754+ [znode-refs]
80755+ [jnode-refs]
80756+ [jnode-queued]
80757+ [znode-modify]
80758+
80759+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
80760+ Suggestions for how to do that are desired.*/
80761+struct znode {
80762+ /* Embedded jnode. */
80763+ jnode zjnode;
80764+
80765+ /* contains three subfields, node, pos_in_node, and pos_in_unit.
80766+
80767+ pos_in_node and pos_in_unit are only hints that are cached to
80768+ speed up lookups during balancing. They are not required to be up to
80769+ date. Synched in find_child_ptr().
80770+
80771+ This value allows us to avoid expensive binary searches.
80772+
80773+ in_parent->node points to the parent of this node, and is NOT a
80774+ hint.
80775+ */
80776+ parent_coord_t in_parent;
80777+
80778+ /*
80779+ * sibling list pointers
80780+ */
80781+
80782+ /* left-neighbor */
80783+ znode *left;
80784+ /* right-neighbor */
80785+ znode *right;
80786+
80787+ /* long term lock on node content. This lock supports deadlock
80788+ detection. See lock.c
80789+ */
80790+ zlock lock;
80791+
80792+ /* You cannot remove from memory a node that has children in
80793+ memory. This is because we rely on the fact that parent of given
80794+ node can always be reached without blocking for io. When reading a
80795+ node into memory you must increase the c_count of its parent, when
80796+ removing it from memory you must decrease the c_count. This makes
80797+ the code simpler, and the cases where it is suboptimal are truly
80798+ obscure.
80799+ */
80800+ int c_count;
80801+
80802+ /* plugin of node attached to this znode. NULL if znode is not
80803+ loaded. */
80804+ node_plugin *nplug;
80805+
80806+ /* version of znode data. This is increased on each modification. This
80807+ * is necessary to implement seals (see seal.[ch]) efficiently. */
80808+ __u64 version;
80809+
80810+ /* left delimiting key. Necessary to efficiently perform
80811+ balancing with node-level locking. Kept in memory only. */
80812+ reiser4_key ld_key;
80813+ /* right delimiting key. */
80814+ reiser4_key rd_key;
80815+
80816+ /* znode's tree level */
80817+ __u16 level;
80818+ /* number of items in this node. This field is modified by node
80819+ * plugin. */
80820+ __u16 nr_items;
80821+
80822+#if REISER4_DEBUG
80823+ void *creator;
80824+ reiser4_key first_key;
80825+ unsigned long times_locked;
80826+ int left_version; /* when node->left was updated */
80827+ int right_version; /* when node->right was updated */
80828+ int ld_key_version; /* when node->ld_key was updated */
80829+ int rd_key_version; /* when node->rd_key was updated */
80830+#endif
80831+
80832+} __attribute__ ((aligned(16)));
80833+
80834+ON_DEBUG(extern atomic_t delim_key_version;
80835+ )
80836+
80837+/* In general I think these macros should not be exposed. */
80838+#define znode_is_locked(node) (lock_is_locked(&node->lock))
80839+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80840+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80841+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80842+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80843+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80844+/* Macros for accessing the znode state. */
80845+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80846+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80847+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80848+extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80849+ znode * parent, tree_level level, gfp_t gfp_flag);
80850+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80851+extern int zload(znode * node);
80852+extern int zload_ra(znode * node, ra_info_t * info);
80853+extern int zinit_new(znode * node, gfp_t gfp_flags);
80854+extern void zrelse(znode * node);
80855+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80856+
80857+/* size of data in znode */
80858+static inline unsigned
80859+znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80860+{
80861+ assert("nikita-1416", node != NULL);
80862+ return PAGE_CACHE_SIZE;
80863+}
80864+
80865+extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80866+ coord_t * coord);
80867+extern void coord_to_parent_coord(const coord_t * coord,
80868+ parent_coord_t * pcoord);
80869+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80870+
80871+extern unsigned znode_free_space(znode * node);
80872+
80873+extern reiser4_key *znode_get_rd_key(znode * node);
80874+extern reiser4_key *znode_get_ld_key(znode * node);
80875+
80876+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80877+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80878+
80879+/* `connected' state checks */
80880+static inline int znode_is_right_connected(const znode * node)
80881+{
80882+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80883+}
80884+
80885+static inline int znode_is_left_connected(const znode * node)
80886+{
80887+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80888+}
80889+
80890+static inline int znode_is_connected(const znode * node)
80891+{
80892+ return znode_is_right_connected(node) && znode_is_left_connected(node);
80893+}
80894+
80895+extern int znode_shift_order;
80896+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80897+extern void znode_remove(znode *, reiser4_tree *);
80898+extern znode *znode_parent(const znode * node);
80899+extern znode *znode_parent_nolock(const znode * node);
80900+extern int znode_above_root(const znode * node);
80901+extern int init_znodes(void);
80902+extern void done_znodes(void);
80903+extern int znodes_tree_init(reiser4_tree * ztree);
80904+extern void znodes_tree_done(reiser4_tree * ztree);
80905+extern int znode_contains_key(znode * node, const reiser4_key * key);
80906+extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80907+extern unsigned znode_save_free_space(znode * node);
80908+extern unsigned znode_recover_free_space(znode * node);
80909+extern znode *zalloc(gfp_t gfp_flag);
80910+extern void zinit(znode *, const znode * parent, reiser4_tree *);
80911+extern int zparse(znode * node);
80912+
80913+extern int znode_just_created(const znode * node);
80914+
80915+extern void zfree(znode * node);
80916+
80917+#if REISER4_DEBUG
80918+extern void print_znode(const char *prefix, const znode * node);
80919+#else
80920+#define print_znode( p, n ) noop
80921+#endif
80922+
80923+/* Make it look like various znode functions exist instead of treating znodes as
80924+ jnodes in znode-specific code. */
80925+#define znode_page(x) jnode_page ( ZJNODE(x) )
80926+#define zdata(x) jdata ( ZJNODE(x) )
80927+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80928+#define znode_created(x) jnode_created ( ZJNODE(x) )
80929+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80930+#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80931+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80932+
80933+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80934+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80935+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80936+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80937+
80938+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80939+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80940+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80941+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80942+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80943+
80944+#if REISER4_DEBUG
80945+extern int znode_x_count_is_protected(const znode * node);
80946+extern int znode_invariant(znode * node);
80947+#endif
80948+
80949+/* acquire reference to @node */
80950+static inline znode *zref(znode * node)
80951+{
80952+ /* change of x_count from 0 to 1 is protected by tree spin-lock */
80953+ return JZNODE(jref(ZJNODE(node)));
80954+}
80955+
80956+/* release reference to @node */
80957+static inline void zput(znode * node)
80958+{
80959+ assert("nikita-3564", znode_invariant(node));
80960+ jput(ZJNODE(node));
80961+}
80962+
80963+/* get the level field for a znode */
80964+static inline tree_level znode_get_level(const znode * node)
80965+{
80966+ return node->level;
80967+}
80968+
80969+/* get the level field for a jnode */
80970+static inline tree_level jnode_get_level(const jnode * node)
80971+{
80972+ if (jnode_is_znode(node))
80973+ return znode_get_level(JZNODE(node));
80974+ else
80975+ /* unformatted nodes are all at the LEAF_LEVEL and for
80976+ "semi-formatted" nodes like bitmaps, level doesn't matter. */
80977+ return LEAF_LEVEL;
80978+}
80979+
80980+/* true if jnode is on leaf level */
80981+static inline int jnode_is_leaf(const jnode * node)
80982+{
80983+ if (jnode_is_znode(node))
80984+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80985+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80986+ return 1;
80987+ return 0;
80988+}
80989+
80990+/* return znode's tree */
80991+static inline reiser4_tree *znode_get_tree(const znode * node)
80992+{
80993+ assert("nikita-2692", node != NULL);
80994+ return jnode_get_tree(ZJNODE(node));
80995+}
80996+
80997+/* resolve race with zput */
80998+static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80999+{
81000+ jnode *j;
81001+
81002+ j = jnode_rip_sync(tree, ZJNODE(node));
81003+ if (likely(j != NULL))
81004+ node = JZNODE(j);
81005+ else
81006+ node = NULL;
81007+ return node;
81008+}
81009+
81010+#if defined(REISER4_DEBUG)
81011+int znode_is_loaded(const znode * node /* znode to query */ );
81012+#endif
81013+
81014+extern __u64 znode_build_version(reiser4_tree * tree);
81015+
81016+/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
81017+ must load the data for a node in many places. We could do this by simply calling
81018+ zload() everywhere, the difficulty arises when we must release the loaded data by
81019+ calling zrelse. In a function with many possible error/return paths, it requires extra
81020+ work to figure out which exit paths must call zrelse and those which do not. The data
81021+ handle automatically calls zrelse for every zload that it is responsible for. In that
81022+ sense, it acts much like a lock_handle.
81023+*/
81024+typedef struct load_count {
81025+ znode *node;
81026+ int d_ref;
81027+} load_count;
81028+
81029+extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */
81030+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
81031+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
81032+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
81033+ * incr_load_count_znode, otherwise do nothing (unformatted nodes
81034+ * don't require zload/zrelse treatment). */
81035+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
81036+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
81037+
81038+/* Variable initializers for load_count. */
81039+#define INIT_LOAD_COUNT ( load_count * ){ .node = NULL, .d_ref = 0 }
81040+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
81041+/* A convenience macro for use in assertions or debug-only code, where loaded
81042+ data is only required to perform the debugging check. This macro
81043+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */
81044+#define WITH_DATA( node, exp ) \
81045+({ \
81046+ long __with_dh_result; \
81047+ znode *__with_dh_node; \
81048+ \
81049+ __with_dh_node = ( node ); \
81050+ __with_dh_result = zload( __with_dh_node ); \
81051+ if( __with_dh_result == 0 ) { \
81052+ __with_dh_result = ( long )( exp ); \
81053+ zrelse( __with_dh_node ); \
81054+ } \
81055+ __with_dh_result; \
81056+})
81057+
81058+/* Same as above, but accepts a return value in case zload fails. */
81059+#define WITH_DATA_RET( node, ret, exp ) \
81060+({ \
81061+ int __with_dh_result; \
81062+ znode *__with_dh_node; \
81063+ \
81064+ __with_dh_node = ( node ); \
81065+ __with_dh_result = zload( __with_dh_node ); \
81066+ if( __with_dh_result == 0 ) { \
81067+ __with_dh_result = ( int )( exp ); \
81068+ zrelse( __with_dh_node ); \
81069+ } else \
81070+ __with_dh_result = ( ret ); \
81071+ __with_dh_result; \
81072+})
81073+
81074+#define WITH_COORD(coord, exp) \
81075+({ \
81076+ coord_t *__coord; \
81077+ \
81078+ __coord = (coord); \
81079+ coord_clear_iplug(__coord); \
81080+ WITH_DATA(__coord->node, exp); \
81081+})
81082+
81083+#if REISER4_DEBUG
81084+#define STORE_COUNTERS \
81085+ reiser4_lock_counters_info __entry_counters = \
81086+ *reiser4_lock_counters()
81087+#define CHECK_COUNTERS \
81088+ON_DEBUG_CONTEXT( \
81089+({ \
81090+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
81091+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
81092+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
81093+ assert("nikita-2159", \
81094+ !memcmp(&__entry_counters, reiser4_lock_counters(), \
81095+ sizeof __entry_counters)); \
81096+}) )
81097+
81098+#else
81099+#define STORE_COUNTERS
81100+#define CHECK_COUNTERS noop
81101+#endif
81102+
81103+/* __ZNODE_H__ */
81104+#endif
81105+
81106+/* Make Linus happy.
81107+ Local variables:
81108+ c-indentation-style: "K&R"
81109+ mode-name: "LC"
81110+ c-basic-offset: 8
81111+ tab-width: 8
81112+ fill-column: 120
81113+ End:
81114+*/
81115diff --git a/include/linux/fs.h b/include/linux/fs.h
81116index 1410e53..dd12411 100644
81117--- a/include/linux/fs.h
81118+++ b/include/linux/fs.h
81119@@ -1165,6 +1165,8 @@ struct super_operations {
81120 void (*clear_inode) (struct inode *);
81121 void (*umount_begin) (struct vfsmount *, int);
81122
81123+ void (*sync_inodes) (struct super_block *sb,
81124+ struct writeback_control *wbc);
81125 int (*show_options)(struct seq_file *, struct vfsmount *);
81126 int (*show_stats)(struct seq_file *, struct vfsmount *);
81127 #ifdef CONFIG_QUOTA
81128@@ -1583,6 +1585,7 @@ extern int invalidate_inode_pages2(struct address_space *mapping);
81129 extern int invalidate_inode_pages2_range(struct address_space *mapping,
81130 pgoff_t start, pgoff_t end);
81131 extern int write_inode_now(struct inode *, int);
81132+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
81133 extern int filemap_fdatawrite(struct address_space *);
81134 extern int filemap_flush(struct address_space *);
81135 extern int filemap_fdatawait(struct address_space *);
81136diff --git a/lib/radix-tree.c b/lib/radix-tree.c
81137index d69ddbe..ed3e15f 100644
81138--- a/lib/radix-tree.c
81139+++ b/lib/radix-tree.c
81140@@ -151,6 +151,7 @@ int radix_tree_preload(gfp_t gfp_mask)
81141 out:
81142 return ret;
81143 }
81144+EXPORT_SYMBOL(radix_tree_preload);
81145
81146 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
81147 int offset)
81148diff --git a/mm/filemap.c b/mm/filemap.c
81149index 8332c77..b16d2cb 100644
81150--- a/mm/filemap.c
81151+++ b/mm/filemap.c
81152@@ -121,6 +121,7 @@ void __remove_from_page_cache(struct page *page)
81153 mapping->nrpages--;
81154 __dec_zone_page_state(page, NR_FILE_PAGES);
81155 }
81156+EXPORT_SYMBOL(__remove_from_page_cache);
81157
81158 void remove_from_page_cache(struct page *page)
81159 {
81160@@ -132,6 +133,7 @@ void remove_from_page_cache(struct page *page)
81161 __remove_from_page_cache(page);
81162 write_unlock_irq(&mapping->tree_lock);
81163 }
81164+EXPORT_SYMBOL(remove_from_page_cache);
81165
81166 static int sync_page(void *word)
81167 {
81168@@ -465,6 +467,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
81169 lru_cache_add(page);
81170 return ret;
81171 }
81172+EXPORT_SYMBOL(add_to_page_cache_lru);
81173
81174 #ifdef CONFIG_NUMA
81175 struct page *__page_cache_alloc(gfp_t gfp)
81176@@ -738,6 +741,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
81177 read_unlock_irq(&mapping->tree_lock);
81178 return ret;
81179 }
81180+EXPORT_SYMBOL(find_get_pages);
81181
81182 /**
81183 * find_get_pages_contig - gang contiguous pagecache lookup
81184@@ -798,6 +802,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
81185 read_unlock_irq(&mapping->tree_lock);
81186 return ret;
81187 }
81188+EXPORT_SYMBOL(find_get_pages_tag);
81189
81190 /**
81191 * grab_cache_page_nowait - returns locked page at given index in given cache
81192diff --git a/mm/readahead.c b/mm/readahead.c
81193index 0f539e8..9db41de 100644
81194--- a/mm/readahead.c
81195+++ b/mm/readahead.c
81196@@ -568,6 +568,7 @@ void handle_ra_miss(struct address_space *mapping,
81197 ra->flags &= ~RA_FLAG_INCACHE;
81198 ra->cache_hit = 0;
81199 }
81200+EXPORT_SYMBOL_GPL(handle_ra_miss);
81201
81202 /*
81203 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a